From 7af39c6955a48ea4b7e1f8f7f354377e75fe6f44 Mon Sep 17 00:00:00 2001 From: Changming Sun Date: Tue, 23 Jul 2024 11:03:55 -0700 Subject: [PATCH 01/57] Update nodejs's cmake file to fix a file copy issue (#21390) This commit e5f18ba2c14ced91e5f483fde0a7ef4b3b04abbe caused some nightly pipelines to fail. This PR fixes it. It is because recently I changed our Linux library's SONAME. At runtime onnxruntime_binding depends on libonnxruntime.so.1 , instead of libonnxruntime.so.1.19.0(with the full version number). Therefore we need to keep the libonnxruntime.so.1 symlink. The packaging tools/ci_build/github/js/pack-npm-packages.ps1 still needs be updated. I will address it in another PR. --- js/node/CMakeLists.txt | 62 ++++++++++++++---------------------------- 1 file changed, 20 insertions(+), 42 deletions(-) diff --git a/js/node/CMakeLists.txt b/js/node/CMakeLists.txt index 5c32f62f3a802..1ce6d66881c3e 100644 --- a/js/node/CMakeLists.txt +++ b/js/node/CMakeLists.txt @@ -59,8 +59,18 @@ endif() file(GLOB ORT_NODEJS_BINDING_SOURCE_FILES ${CMAKE_SOURCE_DIR}/src/*.cc) add_library(onnxruntime_binding SHARED ${ORT_NODEJS_BINDING_SOURCE_FILES} ${CMAKE_JS_SRC}) +file(MAKE_DIRECTORY ${dist_folder}) + set_target_properties(onnxruntime_binding PROPERTIES PREFIX "" SUFFIX ".node" + RUNTIME_OUTPUT_DIRECTORY ${dist_folder} + RUNTIME_OUTPUT_DIRECTORY_DEBUG ${dist_folder} + RUNTIME_OUTPUT_DIRECTORY_RELEASE ${dist_folder} + RUNTIME_OUTPUT_DIRECTORY_RELWITHDEBINFO ${dist_folder} + LIBRARY_OUTPUT_DIRECTORY ${dist_folder} + LIBRARY_OUTPUT_DIRECTORY_DEBUG ${dist_folder} + LIBRARY_OUTPUT_DIRECTORY_RELEASE ${dist_folder} + LIBRARY_OUTPUT_DIRECTORY_RELWITHDEBINFO ${dist_folder} BUILD_WITH_INSTALL_RPATH TRUE INSTALL_RPATH_USE_LINK_PATH FALSE) target_link_libraries(onnxruntime_binding PRIVATE ${CMAKE_JS_LIB}) @@ -86,61 +96,29 @@ else() endif() if (WIN32) - target_link_libraries(onnxruntime_binding PRIVATE onnxruntime.lib) + target_link_libraries(onnxruntime_binding PRIVATE onnxruntime) elseif (APPLE) target_link_libraries(onnxruntime_binding PRIVATE libonnxruntime.${ort_version}.dylib) set_target_properties(onnxruntime_binding PROPERTIES INSTALL_RPATH "@loader_path") else() - target_link_libraries(onnxruntime_binding PRIVATE libonnxruntime.so.${ort_version}) + target_link_libraries(onnxruntime_binding PRIVATE onnxruntime) set_target_properties(onnxruntime_binding PROPERTIES INSTALL_RPATH "$ORIGIN/") endif() -# post build - -add_custom_command( - TARGET onnxruntime_binding POST_BUILD - COMMAND ${CMAKE_COMMAND} -E make_directory ${dist_folder} - COMMAND ${CMAKE_COMMAND} -E copy $ ${dist_folder} -) if (WIN32) - add_custom_command( - TARGET onnxruntime_binding POST_BUILD - COMMAND ${CMAKE_COMMAND} -E copy - ${ONNXRUNTIME_WIN_BIN_DIR}/onnxruntime.dll - ${dist_folder} - ) + file(COPY ${ONNXRUNTIME_WIN_BIN_DIR}/onnxruntime.dll + DESTINATION ${dist_folder}) if (USE_DML) - add_custom_command( - TARGET onnxruntime_binding POST_BUILD - COMMAND ${CMAKE_COMMAND} -E copy - ${ONNXRUNTIME_WIN_BIN_DIR}/DirectML.dll - ${dist_folder} - ) + file(COPY ${ONNXRUNTIME_WIN_BIN_DIR}/DirectML.dll + DESTINATION ${dist_folder}) endif () - if (CMAKE_BUILD_TYPE STREQUAL "Debug") - add_custom_command( - TARGET onnxruntime_binding POST_BUILD - COMMAND ${CMAKE_COMMAND} -E copy - ${ONNXRUNTIME_WIN_BIN_DIR}/onnxruntime.pdb - ${dist_folder} - COMMAND ${CMAKE_COMMAND} -E copy $/onnxruntime_binding.pdb ${dist_folder} - ) - endif() elseif (APPLE) - add_custom_command( - TARGET onnxruntime_binding POST_BUILD - COMMAND ${CMAKE_COMMAND} -E copy - 
${ONNXRUNTIME_BUILD_DIR}/libonnxruntime.${ort_version}.dylib - ${dist_folder} - ) + file(COPY ${ONNXRUNTIME_BUILD_DIR}/libonnxruntime.dylib + DESTINATION ${dist_folder} FOLLOW_SYMLINK_CHAIN) elseif (UNIX) - add_custom_command( - TARGET onnxruntime_binding POST_BUILD - COMMAND ${CMAKE_COMMAND} -E copy - ${ONNXRUNTIME_BUILD_DIR}/libonnxruntime.so.${ort_version} - ${dist_folder} - ) + file(COPY ${ONNXRUNTIME_BUILD_DIR}/libonnxruntime.so + DESTINATION ${dist_folder} FOLLOW_SYMLINK_CHAIN) else() message(FATAL_ERROR "Platform not supported.") endif() From 2b7e2a5bd07a882a1a1f16e81025a74745ef0394 Mon Sep 17 00:00:00 2001 From: Tianlei Wu Date: Tue, 23 Jul 2024 11:58:04 -0700 Subject: [PATCH 02/57] [CUDA] Fix cuda provider fallback inconsistency (#21425) * Fix fallback setting (cuda still falls back to cuda). * Fix cuda provider fallback inconsistent with/without CUDA_PATH environment variable. * Add cuda and cudnn major version requirement in error message. Example result in Windows: ``` >>> import onnxruntime >>> ort_session = onnxruntime.InferenceSession("model.onnx", providers=['CUDAExecutionProvider', 'CPUExecutionProvider']) 2024-07-19 17:43:44.2260019 [E:onnxruntime:Default, provider_bridge_ort.cc:1972 onnxruntime::TryGetProviderInfo_CUDA] D:\onnxruntime\onnxruntime\core\session\provider_bridge_ort.cc:1636 onnxruntime::ProviderLibrary::Get [ONNXRuntimeError] : 1 : FAIL : LoadLibrary failed with error 126 "" when trying to load "C:\Users\.conda\envs\py310\lib\site-packages\onnxruntime\capi\onnxruntime_providers_cuda.dll" 2024-07-19 17:43:44.2312351 [W:onnxruntime:Default, onnxruntime_pybind_state.cc:970 onnxruntime::python::CreateExecutionProviderInstance] Failed to create CUDAExecutionProvider. Require cuDNN 9.* and CUDA 12.*, and the latest MSVC runtime. Please install all dependencies as mentioned in the GPU requirements page (https://onnxruntime.ai/docs/execution-providers/CUDA-ExecutionProvider.html#requirements), make sure they're in the PATH, and that your GPU is supported. >>> ort_session >>> ort_session.get_providers() ['CPUExecutionProvider'] ``` Example result in Linux: ``` >>> import onnxruntime >>> ort_session = onnxruntime.InferenceSession("resnet50-v2-7.onnx", providers=['CUDAExecutionProvider', 'CPUExecutionProvider']) 2024-07-20 20:33:26.486974543 [E:onnxruntime:Default, provider_bridge_ort.cc:1972 TryGetProviderInfo_CUDA] /work/onnxruntime/onnxruntime/core/session/provider_bridge_ort.cc:1636 onnxruntime::Provider& onnxruntime::ProviderLibrary::Get() [ONNXRuntimeError] : 1 : FAIL : Failed to load library libonnxruntime_providers_cuda.so with error: libcublasLt.so.12: cannot open shared object file: No such file or directory 2024-07-20 20:33:26.487034646 [W:onnxruntime:Default, onnxruntime_pybind_state.cc:961 CreateExecutionProviderInstance] Failed to create CUDAExecutionProvider. Require cuDNN 9.* and CUDA 12.*. Please install all dependencies as mentioned in the GPU requirements page (https://onnxruntime.ai/docs/execution-providers/CUDA-ExecutionProvider.html#requirements), make sure they're in the PATH, and that your GPU is supported. 
>>> ort_session.get_providers() ['CPUExecutionProvider'] ``` ### Motivation and Context https://github.com/microsoft/onnxruntime/issues/21424 --- cmake/onnxruntime_python.cmake | 8 +++++-- .../onnxruntime_inference_collection.py | 16 +++++++++---- .../python/onnxruntime_pybind_state.cc | 24 ++++++++++--------- 3 files changed, 31 insertions(+), 17 deletions(-) diff --git a/cmake/onnxruntime_python.cmake b/cmake/onnxruntime_python.cmake index 07c65e7986b05..270139ceaff7b 100644 --- a/cmake/onnxruntime_python.cmake +++ b/cmake/onnxruntime_python.cmake @@ -97,8 +97,12 @@ endif() onnxruntime_add_include_to_target(onnxruntime_pybind11_state Python::Module Python::NumPy) target_include_directories(onnxruntime_pybind11_state PRIVATE ${ONNXRUNTIME_ROOT} ${pybind11_INCLUDE_DIRS}) -if(onnxruntime_USE_CUDA AND onnxruntime_CUDNN_HOME) - target_include_directories(onnxruntime_pybind11_state PRIVATE ${onnxruntime_CUDNN_HOME}/include) +if(onnxruntime_USE_CUDA) + target_include_directories(onnxruntime_pybind11_state PRIVATE ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES}) + # cudnn_home is optional for Window when cuda and cudnn are installed in the same directory. + if(onnxruntime_CUDNN_HOME) + target_include_directories(onnxruntime_pybind11_state PRIVATE ${onnxruntime_CUDNN_HOME}/include) + endif() endif() if(onnxruntime_USE_CANN) target_include_directories(onnxruntime_pybind11_state PRIVATE ${onnxruntime_CANN_HOME}/include) diff --git a/onnxruntime/python/onnxruntime_inference_collection.py b/onnxruntime/python/onnxruntime_inference_collection.py index ecae280e92ae5..c3cfe2c97ae95 100644 --- a/onnxruntime/python/onnxruntime_inference_collection.py +++ b/onnxruntime/python/onnxruntime_inference_collection.py @@ -438,10 +438,18 @@ def _create_inference_session(self, providers, provider_options, disabled_optimi # Tensorrt can fall back to CUDA if it's explicitly assigned. All others fall back to CPU. if "TensorrtExecutionProvider" in available_providers: - if providers and any( - provider == "CUDAExecutionProvider" - or (isinstance(provider, tuple) and provider[0] == "CUDAExecutionProvider") - for provider in providers + if ( + providers + and any( + provider == "CUDAExecutionProvider" + or (isinstance(provider, tuple) and provider[0] == "CUDAExecutionProvider") + for provider in providers + ) + and any( + provider == "TensorrtExecutionProvider" + or (isinstance(provider, tuple) and provider[0] == "TensorrtExecutionProvider") + for provider in providers + ) ): self._fallback_providers = ["CUDAExecutionProvider", "CPUExecutionProvider"] else: diff --git a/onnxruntime/python/onnxruntime_pybind_state.cc b/onnxruntime/python/onnxruntime_pybind_state.cc index d7155b2b6899a..6b5daf8cb882b 100644 --- a/onnxruntime/python/onnxruntime_pybind_state.cc +++ b/onnxruntime/python/onnxruntime_pybind_state.cc @@ -35,6 +35,11 @@ #include "contrib_ops/cpu/aten_ops/aten_op_executor.h" #endif +#ifdef USE_CUDA +#include // for CUDA_VERSION +#include // for CUDNN_MAJOR +#endif + #include // Explicitly provide a definition for the static const var 'GPU' in the OrtDevice struct, @@ -951,21 +956,18 @@ std::unique_ptr CreateExecutionProviderInstance( // external CUDA allocator. external_allocator_info = info.external_allocator_info; return cuda_provider_info->CreateExecutionProviderFactory(info)->CreateProvider(); - } else { - if (!Env::Default().GetEnvironmentVar("CUDA_PATH").empty()) { - ORT_THROW( - "CUDA_PATH is set but CUDA wasnt able to be loaded. 
Please install the correct version of CUDA and" - "cuDNN as mentioned in the GPU requirements page " - " (https://onnxruntime.ai/docs/execution-providers/CUDA-ExecutionProvider.html#requirements), " - " make sure they're in the PATH, and that your GPU is supported."); - } } } LOGS_DEFAULT(WARNING) << "Failed to create " << type - << ". Please reference " - << "https://onnxruntime.ai/docs/execution-providers/CUDA-ExecutionProvider.html#requirements" - << "to ensure all dependencies are met."; + << ". Require cuDNN " << CUDNN_MAJOR << ".* and " + << "CUDA " << (CUDA_VERSION / 1000) << ".*" +#if defined(_MSC_VER) + << ", and the latest MSVC runtime" +#endif + << ". Please install all dependencies as mentioned in the GPU requirements page" + " (https://onnxruntime.ai/docs/execution-providers/CUDA-ExecutionProvider.html#requirements), " + "make sure they're in the PATH, and that your GPU is supported."; #endif } else if (type == kRocmExecutionProvider) { #ifdef USE_ROCM From c65afcea551ad96e9247754d28914bc50c1eefca Mon Sep 17 00:00:00 2001 From: George Wu Date: Tue, 23 Jul 2024 15:54:44 -0700 Subject: [PATCH 03/57] fix python qnn pipelines issues (#21462) build_py_params wasn't plumbed through for python qnn pipelines. incorporate fixes for deprecated numpy version option from https://github.com/microsoft/onnxruntime/pull/21459 --- .../azure-pipelines/templates/py-packaging-stage.yml | 3 ++- .../azure-pipelines/templates/py-win-arm64-qnn.yml | 9 ++------- .../github/azure-pipelines/templates/py-win-x64-qnn.yml | 2 +- 3 files changed, 5 insertions(+), 9 deletions(-) diff --git a/tools/ci_build/github/azure-pipelines/templates/py-packaging-stage.yml b/tools/ci_build/github/azure-pipelines/templates/py-packaging-stage.yml index 27f85dc5c1648..17e64a207be2f 100644 --- a/tools/ci_build/github/azure-pipelines/templates/py-packaging-stage.yml +++ b/tools/ci_build/github/azure-pipelines/templates/py-packaging-stage.yml @@ -510,7 +510,7 @@ stages: MACHINE_POOL: 'onnxruntime-qnn-windows-vs-2022-arm64' QNN_SDK: ${{ parameters.qnn_sdk_version }} PYTHON_VERSION: '3.11' - NUMPY_VERSION: '1.26.4' + BUILD_PY_PARAMETERS: ${{ parameters.build_py_parameters }} - ${{ if eq(parameters.enable_windows_x64_qnn, true) }}: - stage: Python_Packaging_Windows_x64_QNN @@ -520,3 +520,4 @@ stages: parameters: MACHINE_POOL: 'Onnxruntime-QNNEP-Windows-2022-CPU' QNN_SDK: ${{ parameters.qnn_sdk_version }} + BUILD_PY_PARAMETERS: ${{ parameters.build_py_parameters }} diff --git a/tools/ci_build/github/azure-pipelines/templates/py-win-arm64-qnn.yml b/tools/ci_build/github/azure-pipelines/templates/py-win-arm64-qnn.yml index af239b4384af9..70221976d978f 100644 --- a/tools/ci_build/github/azure-pipelines/templates/py-win-arm64-qnn.yml +++ b/tools/ci_build/github/azure-pipelines/templates/py-win-arm64-qnn.yml @@ -13,10 +13,6 @@ parameters: type: string default: '3.11' -- name: NUMPY_VERSION - type: string - default: '1.26.4' - - name: ENV_SETUP_SCRIPT type: string default: '' @@ -70,7 +66,7 @@ jobs: scriptSource: inline script: | import subprocess - subprocess.call(['pip', 'install', '-q', 'setuptools', 'wheel', 'numpy==${{parameters.NUMPY_VERSION}}']) + subprocess.call(['pip', 'install', '-q', 'setuptools', 'wheel']) workingDirectory: '$(Build.BinariesDirectory)' displayName: 'Install python modules' @@ -93,7 +89,6 @@ jobs: --qnn_home $(QnnSDKRootDir) --enable_pybind --parallel --update - --numpy_version ${{ parameters.NUMPY_VERSION }} $(TelemetryOption) ${{ parameters.BUILD_PY_PARAMETERS }} workingDirectory: 
'$(Build.BinariesDirectory)' @@ -121,7 +116,7 @@ jobs: displayName: 'Build wheel' inputs: scriptPath: '$(Build.SourcesDirectory)\setup.py' - arguments: 'bdist_wheel ${{ parameters.BUILD_PY_PARAMETERS }} $(NightlyBuildOption) --wheel_name_suffix=qnn' + arguments: 'bdist_wheel $(NightlyBuildOption) --wheel_name_suffix=qnn' workingDirectory: '$(Build.BinariesDirectory)\RelWithDebInfo\RelWithDebInfo' - task: CopyFiles@2 diff --git a/tools/ci_build/github/azure-pipelines/templates/py-win-x64-qnn.yml b/tools/ci_build/github/azure-pipelines/templates/py-win-x64-qnn.yml index 884e6eafee965..1bf5db5ae6d9a 100644 --- a/tools/ci_build/github/azure-pipelines/templates/py-win-x64-qnn.yml +++ b/tools/ci_build/github/azure-pipelines/templates/py-win-x64-qnn.yml @@ -122,7 +122,7 @@ jobs: displayName: 'Build wheel' inputs: scriptPath: '$(Build.SourcesDirectory)\setup.py' - arguments: 'bdist_wheel ${{ parameters.BUILD_PY_PARAMETERS }} $(NightlyBuildOption) --wheel_name_suffix=qnn' + arguments: 'bdist_wheel $(NightlyBuildOption) --wheel_name_suffix=qnn' workingDirectory: '$(Build.BinariesDirectory)\RelWithDebInfo\RelWithDebInfo' - task: CopyFiles@2 From 86cedc68326a4680aed43a291b03ac15734652ba Mon Sep 17 00:00:00 2001 From: mingyueliuh <131847423+mingyueliuh@users.noreply.github.com> Date: Tue, 23 Jul 2024 19:51:00 -0400 Subject: [PATCH 04/57] [Fix] C++ API SetOutputShape for register custom op. (#21366) ### Description Bug fix for the SetOutputShape method in custom op shape inference. ### Motivation and Context - Bug a : A obvious bug that will cause all dimensions to be 1. https://github.com/microsoft/onnxruntime/blob/main/include/onnxruntime/core/session/onnxruntime_cxx_inline.h#L2014 integer_dims.push_back(dim.IsInt()); -> integer_dims.push_back(dim.AsInt()); - Bug b : vector out of range error op's input maybe a scalar and shape is empty. 
https://github.com/microsoft/onnxruntime/blob/main/include/onnxruntime/core/session/onnxruntime_cxx_inline.h#L1985 --------- Co-authored-by: mingyue --- include/onnxruntime/core/session/onnxruntime_cxx_inline.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/include/onnxruntime/core/session/onnxruntime_cxx_inline.h b/include/onnxruntime/core/session/onnxruntime_cxx_inline.h index a732bf169dc7a..aaef111b9f15b 100644 --- a/include/onnxruntime/core/session/onnxruntime_cxx_inline.h +++ b/include/onnxruntime/core/session/onnxruntime_cxx_inline.h @@ -1982,7 +1982,9 @@ inline ShapeInferContext::ShapeInferContext(const OrtApi* ort_api, TensorTypeAndShapeInfo type_shape_info(info); auto integer_shape = type_shape_info.GetShape(); std::vector symbolic_shape(integer_shape.size(), {}); - type_shape_info.GetSymbolicDimensions(&symbolic_shape[0], integer_shape.size()); + if (!integer_shape.empty()) { + type_shape_info.GetSymbolicDimensions(&symbolic_shape[0], integer_shape.size()); + } Shape shape; for (size_t ith = 0; ith < integer_shape.size(); ++ith) { if (symbolic_shape[ith] && std::string{symbolic_shape[ith]}.size() > 0) { @@ -2011,7 +2013,7 @@ inline Status ShapeInferContext::SetOutputShape(size_t indice, const Shape& shap for (const auto dim : shape) { if (dim.IsInt()) { - integer_dims.push_back(dim.IsInt()); + integer_dims.push_back(dim.AsInt()); symbolic_dims.push_back(""); } else { if (!dim.AsSym() || std::string{dim.AsSym()}.empty()) { From 1df9aa2f080a66e2d40b176623c3ec6add87b9f8 Mon Sep 17 00:00:00 2001 From: Scott McKay Date: Wed, 24 Jul 2024 11:04:48 +1000 Subject: [PATCH 05/57] CoreML: Add GridSample ML Program support (#21431) ### Description Add GridSample ML Program support One combination of inputs has diffs between the pytorch generated unit tests data and CoreML. Disabling until needed as investigation may take a while. ### Motivation and Context High priorities models --- .../builders/impl/gridsample_op_builder.cc | 132 ++++++++++++++++++ .../coreml/builders/op_builder_factory.cc | 2 + .../coreml/builders/op_builder_factory.h | 1 + .../providers/cpu/tensor/grid_sample_test.cc | 101 +++++++------- .../cpu/tensor/grid_sample_test_gen.py | 2 +- .../apple/coreml_supported_mlprogram_ops.md | 1 + 6 files changed, 190 insertions(+), 49 deletions(-) create mode 100644 onnxruntime/core/providers/coreml/builders/impl/gridsample_op_builder.cc diff --git a/onnxruntime/core/providers/coreml/builders/impl/gridsample_op_builder.cc b/onnxruntime/core/providers/coreml/builders/impl/gridsample_op_builder.cc new file mode 100644 index 0000000000000..bfc665e0ac716 --- /dev/null +++ b/onnxruntime/core/providers/coreml/builders/impl/gridsample_op_builder.cc @@ -0,0 +1,132 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. 
+ +#include "core/providers/common.h" +#include "core/providers/coreml/builders/helper.h" +#include "core/providers/coreml/builders/impl/base_op_builder.h" +#include "core/providers/coreml/builders/impl/builder_utils.h" +#include "core/providers/coreml/builders/model_builder.h" +#include "core/providers/coreml/builders/op_builder_factory.h" +#include "core/providers/coreml/shape_utils.h" +#include "core/providers/shared/utils/utils.h" + +namespace onnxruntime { +namespace coreml { + +namespace { +std::string_view GetMode(const NodeAttrHelper& helper) { + // opset 16 used bilinear, nearest, bicubic + // opset 20+ uses linear, nearest, cubic + // bilinear is what CoreML uses, so prefer that + // bicubic/cubic isn't supported + + const auto& mode = helper.Get("mode", "linear"); + if (mode == "linear") { + return "bilinear"; + } + + return mode; +} +} // namespace + +class GridSampleOpBuilder : public BaseOpBuilder { + Status AddToModelBuilderImpl(ModelBuilder& model_builder, const Node& node, + const logging::Logger& logger) const override; + + bool IsOpSupportedImpl(const Node& node, const OpBuilderInputParams& input_params, + const logging::Logger& logger) const override; + + bool SupportsMLProgram() const override { return true; } +}; + +Status GridSampleOpBuilder::AddToModelBuilderImpl([[maybe_unused]] ModelBuilder& model_builder, + [[maybe_unused]] const Node& node, + [[maybe_unused]] const logging::Logger& logger) const { +#if defined(COREML_ENABLE_MLPROGRAM) + using namespace CoreML::Specification::MILSpec; // NOLINT + // https://apple.github.io/coremltools/source/coremltools.converters.mil.mil.ops.defs.html#coremltools.converters.mil.mil.ops.defs.iOS15.image_resizing.resample + + const auto input_defs = node.InputDefs(); + const auto output_defs = node.OutputDefs(); + + NodeAttrHelper helper(node); + std::string mode{GetMode(helper)}; // need a std::string for use in AddScalarConstant + std::string padding_mode = helper.Get("padding_mode", "zeros"); + const bool align_corners = helper.Get("align_corners", 0); + const std::string coordinates_mode = "normalized_minus_one_to_one"; + + // adjust to coreml equivalents + if (padding_mode == "zeros") { + padding_mode = "constant"; + } + + auto op = model_builder.CreateOperation(node, "resample"); + AddOperationInput(*op, "x", input_defs[0]->Name()); + AddOperationInput(*op, "coordinates", input_defs[1]->Name()); + AddOperationInput(*op, "sampling_mode", model_builder.AddScalarConstant(op->type(), "sampling_mode", mode)); + AddOperationInput(*op, "padding_mode", model_builder.AddScalarConstant(op->type(), "padding_mode", padding_mode)); + AddOperationInput(*op, "padding_value", model_builder.AddScalarConstant(op->type(), "padding_value", 0.0f)); + AddOperationInput(*op, "coordinates_mode", + model_builder.AddScalarConstant(op->type(), "coordinates_mode", coordinates_mode)); + AddOperationInput(*op, "align_corners", model_builder.AddScalarConstant(op->type(), "align_corners", align_corners)); + + AddOperationOutput(*op, *output_defs[0]); + + model_builder.AddOperation(std::move(op)); +#endif + return Status::OK(); +} + +bool GridSampleOpBuilder::IsOpSupportedImpl(const Node& node, const OpBuilderInputParams& input_params, + const logging::Logger& logger) const { + if (!input_params.create_mlprogram) { + LOGS(logger, VERBOSE) << "GridSample is not supported."; + return false; + } + + const auto& input_defs = node.InputDefs(); + + std::vector input_shape; + if (!GetShape(*input_defs[0], input_shape, logger)) { + LOGS(logger, VERBOSE) << 
"GridSample: failed to get input shape"; + return false; + } + + const auto input_rank = input_shape.size(); + if (input_rank != 4) { + LOGS(logger, VERBOSE) << "GridSample only supports 4D input. Got:" << input_rank << "D"; + return false; + } + + NodeAttrHelper helper(node); + std::string_view mode = GetMode(helper); + + if (mode != "bilinear" && mode != "zeros") { + LOGS(logger, VERBOSE) << "GridSample does not support mode of " << mode; + return false; + } + + // there is one combination of settings where the unit test fails. + // The ORT unit test values are generated by pytorch so not clear if it's an issue with CoreML. + // CoreML output is consistent for CPU and non-CPU at least. + // Disabling until there's a use-case that requires this combination. + const auto& padding_mode = helper.Get("padding_mode", "zeros"); + const bool align_corners = helper.Get("align_corners", 0); + + if (mode == "bilinear" && padding_mode == "reflection" && align_corners == false) { + LOGS(logger, VERBOSE) << "GridSample does not support mode:" << mode << " padding_mode:" << padding_mode + << " align_corners:" << align_corners + << " currently due to output diffs that need to be investigated"; + return false; + } + + return true; +} + +void CreateGridSampleOpBuilder(const std::string& op_type, OpBuilderRegistrations& op_registrations) { + op_registrations.builders.push_back(std::make_unique()); + op_registrations.op_builder_map.emplace(op_type, op_registrations.builders.back().get()); +} + +} // namespace coreml +} // namespace onnxruntime diff --git a/onnxruntime/core/providers/coreml/builders/op_builder_factory.cc b/onnxruntime/core/providers/coreml/builders/op_builder_factory.cc index 2c06659852134..b17827f8e0532 100644 --- a/onnxruntime/core/providers/coreml/builders/op_builder_factory.cc +++ b/onnxruntime/core/providers/coreml/builders/op_builder_factory.cc @@ -130,6 +130,8 @@ static OpBuilderRegistrations CreateOpBuilderRegistrations() { CreateSplitOpBuilder("Split", op_registrations); } + CreateGridSampleOpBuilder("GridSample", op_registrations); + return op_registrations; } diff --git a/onnxruntime/core/providers/coreml/builders/op_builder_factory.h b/onnxruntime/core/providers/coreml/builders/op_builder_factory.h index 6469b4cefa5ea..a9a8ab90b0863 100644 --- a/onnxruntime/core/providers/coreml/builders/op_builder_factory.h +++ b/onnxruntime/core/providers/coreml/builders/op_builder_factory.h @@ -28,6 +28,7 @@ void CreateDepthToSpaceOpBuilder(const std::string& op_type, OpBuilderRegistrati void CreateFlattenOpBuilder(const std::string& op_type, OpBuilderRegistrations& op_registrations); void CreateGatherOpBuilder(const std::string& op_type, OpBuilderRegistrations& op_registrations); void CreateGemmOpBuilder(const std::string& op_type, OpBuilderRegistrations& op_registrations); +void CreateGridSampleOpBuilder(const std::string& op_type, OpBuilderRegistrations& op_registrations); void CreateLRNOpBuilder(const std::string& op_type, OpBuilderRegistrations& op_registrations); void CreatePadOpBuilder(const std::string& op_type, OpBuilderRegistrations& op_registrations); void CreatePoolOpBuilder(const std::string& op_type, OpBuilderRegistrations& op_registrations); diff --git a/onnxruntime/test/providers/cpu/tensor/grid_sample_test.cc b/onnxruntime/test/providers/cpu/tensor/grid_sample_test.cc index 5c89d6ea7bd75..540dc6dee68fb 100644 --- a/onnxruntime/test/providers/cpu/tensor/grid_sample_test.cc +++ b/onnxruntime/test/providers/cpu/tensor/grid_sample_test.cc @@ -13,6 +13,7 @@ std::vector> 
GetExecutionProviders(int opset std::vector> execution_providers; execution_providers.emplace_back(DefaultCpuExecutionProvider()); + #ifdef USE_CUDA if (opset_version < 20) { execution_providers.emplace_back(DefaultCudaExecutionProvider()); @@ -20,8 +21,12 @@ std::vector> GetExecutionProviders(int opset execution_providers.push_back(DefaultCudaNHWCExecutionProvider()); #endif } +#endif +#if defined(USE_COREML) + execution_providers.push_back(DefaultCoreMLExecutionProvider(/*use_mlprogram*/ true)); #endif + return execution_providers; } @@ -35,7 +40,7 @@ void RunTests(T& test, std::vector>&& execut // DO NOT edit following tests. They are generated by: // onnxruntime/test/providers/cpu/tensor/grid_sample_test_gen.py -TEST(GridsampleTest, test_grid_sample_16_4D_nearest_zeros_align_corners) { +TEST(GridSampleTest, test_grid_sample_16_4D_nearest_zeros_align_corners) { OpTester test("GridSample", 16); std::string mode = "nearest"; std::string padding_mode = "zeros"; @@ -55,7 +60,7 @@ TEST(GridsampleTest, test_grid_sample_16_4D_nearest_zeros_align_corners) { RunTests(test, GetExecutionProviders(16)); } -TEST(GridsampleTest, test_grid_sample_16_4D_nearest_zeros_no_align_corners) { +TEST(GridSampleTest, test_grid_sample_16_4D_nearest_zeros_no_align_corners) { OpTester test("GridSample", 16); std::string mode = "nearest"; std::string padding_mode = "zeros"; @@ -75,7 +80,7 @@ TEST(GridsampleTest, test_grid_sample_16_4D_nearest_zeros_no_align_corners) { RunTests(test, GetExecutionProviders(16)); } -TEST(GridsampleTest, test_grid_sample_16_4D_nearest_border_align_corners) { +TEST(GridSampleTest, test_grid_sample_16_4D_nearest_border_align_corners) { OpTester test("GridSample", 16); std::string mode = "nearest"; std::string padding_mode = "border"; @@ -95,7 +100,7 @@ TEST(GridsampleTest, test_grid_sample_16_4D_nearest_border_align_corners) { RunTests(test, GetExecutionProviders(16)); } -TEST(GridsampleTest, test_grid_sample_16_4D_nearest_border_no_align_corners) { +TEST(GridSampleTest, test_grid_sample_16_4D_nearest_border_no_align_corners) { OpTester test("GridSample", 16); std::string mode = "nearest"; std::string padding_mode = "border"; @@ -115,7 +120,7 @@ TEST(GridsampleTest, test_grid_sample_16_4D_nearest_border_no_align_corners) { RunTests(test, GetExecutionProviders(16)); } -TEST(GridsampleTest, test_grid_sample_16_4D_nearest_reflection_align_corners) { +TEST(GridSampleTest, test_grid_sample_16_4D_nearest_reflection_align_corners) { OpTester test("GridSample", 16); std::string mode = "nearest"; std::string padding_mode = "reflection"; @@ -135,7 +140,7 @@ TEST(GridsampleTest, test_grid_sample_16_4D_nearest_reflection_align_corners) { RunTests(test, GetExecutionProviders(16)); } -TEST(GridsampleTest, test_grid_sample_16_4D_nearest_reflection_no_align_corners) { +TEST(GridSampleTest, test_grid_sample_16_4D_nearest_reflection_no_align_corners) { OpTester test("GridSample", 16); std::string mode = "nearest"; std::string padding_mode = "reflection"; @@ -155,7 +160,7 @@ TEST(GridsampleTest, test_grid_sample_16_4D_nearest_reflection_no_align_corners) RunTests(test, GetExecutionProviders(16)); } -TEST(GridsampleTest, test_grid_sample_16_4D_bilinear_zeros_align_corners) { +TEST(GridSampleTest, test_grid_sample_16_4D_bilinear_zeros_align_corners) { OpTester test("GridSample", 16); std::string mode = "bilinear"; std::string padding_mode = "zeros"; @@ -175,7 +180,7 @@ TEST(GridsampleTest, test_grid_sample_16_4D_bilinear_zeros_align_corners) { RunTests(test, GetExecutionProviders(16)); } 
-TEST(GridsampleTest, test_grid_sample_16_4D_bilinear_zeros_no_align_corners) { +TEST(GridSampleTest, test_grid_sample_16_4D_bilinear_zeros_no_align_corners) { OpTester test("GridSample", 16); std::string mode = "bilinear"; std::string padding_mode = "zeros"; @@ -195,7 +200,7 @@ TEST(GridsampleTest, test_grid_sample_16_4D_bilinear_zeros_no_align_corners) { RunTests(test, GetExecutionProviders(16)); } -TEST(GridsampleTest, test_grid_sample_16_4D_bilinear_border_align_corners) { +TEST(GridSampleTest, test_grid_sample_16_4D_bilinear_border_align_corners) { OpTester test("GridSample", 16); std::string mode = "bilinear"; std::string padding_mode = "border"; @@ -215,7 +220,7 @@ TEST(GridsampleTest, test_grid_sample_16_4D_bilinear_border_align_corners) { RunTests(test, GetExecutionProviders(16)); } -TEST(GridsampleTest, test_grid_sample_16_4D_bilinear_border_no_align_corners) { +TEST(GridSampleTest, test_grid_sample_16_4D_bilinear_border_no_align_corners) { OpTester test("GridSample", 16); std::string mode = "bilinear"; std::string padding_mode = "border"; @@ -235,7 +240,7 @@ TEST(GridsampleTest, test_grid_sample_16_4D_bilinear_border_no_align_corners) { RunTests(test, GetExecutionProviders(16)); } -TEST(GridsampleTest, test_grid_sample_16_4D_bilinear_reflection_align_corners) { +TEST(GridSampleTest, test_grid_sample_16_4D_bilinear_reflection_align_corners) { OpTester test("GridSample", 16); std::string mode = "bilinear"; std::string padding_mode = "reflection"; @@ -255,7 +260,7 @@ TEST(GridsampleTest, test_grid_sample_16_4D_bilinear_reflection_align_corners) { RunTests(test, GetExecutionProviders(16)); } -TEST(GridsampleTest, test_grid_sample_16_4D_bilinear_reflection_no_align_corners) { +TEST(GridSampleTest, test_grid_sample_16_4D_bilinear_reflection_no_align_corners) { OpTester test("GridSample", 16); std::string mode = "bilinear"; std::string padding_mode = "reflection"; @@ -275,7 +280,7 @@ TEST(GridsampleTest, test_grid_sample_16_4D_bilinear_reflection_no_align_corners RunTests(test, GetExecutionProviders(16)); } -TEST(GridsampleTest, test_grid_sample_16_4D_bicubic_zeros_align_corners) { +TEST(GridSampleTest, test_grid_sample_16_4D_bicubic_zeros_align_corners) { OpTester test("GridSample", 16); std::string mode = "bicubic"; std::string padding_mode = "zeros"; @@ -295,7 +300,7 @@ TEST(GridsampleTest, test_grid_sample_16_4D_bicubic_zeros_align_corners) { RunTests(test, GetExecutionProviders(16)); } -TEST(GridsampleTest, test_grid_sample_16_4D_bicubic_zeros_no_align_corners) { +TEST(GridSampleTest, test_grid_sample_16_4D_bicubic_zeros_no_align_corners) { OpTester test("GridSample", 16); std::string mode = "bicubic"; std::string padding_mode = "zeros"; @@ -315,7 +320,7 @@ TEST(GridsampleTest, test_grid_sample_16_4D_bicubic_zeros_no_align_corners) { RunTests(test, GetExecutionProviders(16)); } -TEST(GridsampleTest, test_grid_sample_16_4D_bicubic_border_align_corners) { +TEST(GridSampleTest, test_grid_sample_16_4D_bicubic_border_align_corners) { OpTester test("GridSample", 16); std::string mode = "bicubic"; std::string padding_mode = "border"; @@ -335,7 +340,7 @@ TEST(GridsampleTest, test_grid_sample_16_4D_bicubic_border_align_corners) { RunTests(test, GetExecutionProviders(16)); } -TEST(GridsampleTest, test_grid_sample_16_4D_bicubic_border_no_align_corners) { +TEST(GridSampleTest, test_grid_sample_16_4D_bicubic_border_no_align_corners) { OpTester test("GridSample", 16); std::string mode = "bicubic"; std::string padding_mode = "border"; @@ -355,7 +360,7 @@ TEST(GridsampleTest, 
test_grid_sample_16_4D_bicubic_border_no_align_corners) { RunTests(test, GetExecutionProviders(16)); } -TEST(GridsampleTest, test_grid_sample_16_4D_bicubic_reflection_align_corners) { +TEST(GridSampleTest, test_grid_sample_16_4D_bicubic_reflection_align_corners) { OpTester test("GridSample", 16); std::string mode = "bicubic"; std::string padding_mode = "reflection"; @@ -375,7 +380,7 @@ TEST(GridsampleTest, test_grid_sample_16_4D_bicubic_reflection_align_corners) { RunTests(test, GetExecutionProviders(16)); } -TEST(GridsampleTest, test_grid_sample_16_4D_bicubic_reflection_no_align_corners) { +TEST(GridSampleTest, test_grid_sample_16_4D_bicubic_reflection_no_align_corners) { OpTester test("GridSample", 16); std::string mode = "bicubic"; std::string padding_mode = "reflection"; @@ -395,7 +400,7 @@ TEST(GridsampleTest, test_grid_sample_16_4D_bicubic_reflection_no_align_corners) RunTests(test, GetExecutionProviders(16)); } -TEST(GridsampleTest, test_grid_sample_20_4D_nearest_zeros_align_corners) { +TEST(GridSampleTest, test_grid_sample_20_4D_nearest_zeros_align_corners) { OpTester test("GridSample", 20); std::string mode = "nearest"; std::string padding_mode = "zeros"; @@ -415,7 +420,7 @@ TEST(GridsampleTest, test_grid_sample_20_4D_nearest_zeros_align_corners) { RunTests(test, GetExecutionProviders(20)); } -TEST(GridsampleTest, test_grid_sample_20_5D_nearest_zeros_align_corners) { +TEST(GridSampleTest, test_grid_sample_20_5D_nearest_zeros_align_corners) { OpTester test("GridSample", 20); std::string mode = "nearest"; std::string padding_mode = "zeros"; @@ -435,7 +440,7 @@ TEST(GridsampleTest, test_grid_sample_20_5D_nearest_zeros_align_corners) { RunTests(test, GetExecutionProviders(20)); } -TEST(GridsampleTest, test_grid_sample_20_4D_nearest_zeros_no_align_corners) { +TEST(GridSampleTest, test_grid_sample_20_4D_nearest_zeros_no_align_corners) { OpTester test("GridSample", 20); std::string mode = "nearest"; std::string padding_mode = "zeros"; @@ -455,7 +460,7 @@ TEST(GridsampleTest, test_grid_sample_20_4D_nearest_zeros_no_align_corners) { RunTests(test, GetExecutionProviders(20)); } -TEST(GridsampleTest, test_grid_sample_20_5D_nearest_zeros_no_align_corners) { +TEST(GridSampleTest, test_grid_sample_20_5D_nearest_zeros_no_align_corners) { OpTester test("GridSample", 20); std::string mode = "nearest"; std::string padding_mode = "zeros"; @@ -475,7 +480,7 @@ TEST(GridsampleTest, test_grid_sample_20_5D_nearest_zeros_no_align_corners) { RunTests(test, GetExecutionProviders(20)); } -TEST(GridsampleTest, test_grid_sample_20_4D_nearest_border_align_corners) { +TEST(GridSampleTest, test_grid_sample_20_4D_nearest_border_align_corners) { OpTester test("GridSample", 20); std::string mode = "nearest"; std::string padding_mode = "border"; @@ -495,7 +500,7 @@ TEST(GridsampleTest, test_grid_sample_20_4D_nearest_border_align_corners) { RunTests(test, GetExecutionProviders(20)); } -TEST(GridsampleTest, test_grid_sample_20_5D_nearest_border_align_corners) { +TEST(GridSampleTest, test_grid_sample_20_5D_nearest_border_align_corners) { OpTester test("GridSample", 20); std::string mode = "nearest"; std::string padding_mode = "border"; @@ -515,7 +520,7 @@ TEST(GridsampleTest, test_grid_sample_20_5D_nearest_border_align_corners) { RunTests(test, GetExecutionProviders(20)); } -TEST(GridsampleTest, test_grid_sample_20_4D_nearest_border_no_align_corners) { +TEST(GridSampleTest, test_grid_sample_20_4D_nearest_border_no_align_corners) { OpTester test("GridSample", 20); std::string mode = "nearest"; std::string padding_mode = 
"border"; @@ -535,7 +540,7 @@ TEST(GridsampleTest, test_grid_sample_20_4D_nearest_border_no_align_corners) { RunTests(test, GetExecutionProviders(20)); } -TEST(GridsampleTest, test_grid_sample_20_5D_nearest_border_no_align_corners) { +TEST(GridSampleTest, test_grid_sample_20_5D_nearest_border_no_align_corners) { OpTester test("GridSample", 20); std::string mode = "nearest"; std::string padding_mode = "border"; @@ -555,7 +560,7 @@ TEST(GridsampleTest, test_grid_sample_20_5D_nearest_border_no_align_corners) { RunTests(test, GetExecutionProviders(20)); } -TEST(GridsampleTest, test_grid_sample_20_4D_nearest_reflection_align_corners) { +TEST(GridSampleTest, test_grid_sample_20_4D_nearest_reflection_align_corners) { OpTester test("GridSample", 20); std::string mode = "nearest"; std::string padding_mode = "reflection"; @@ -575,7 +580,7 @@ TEST(GridsampleTest, test_grid_sample_20_4D_nearest_reflection_align_corners) { RunTests(test, GetExecutionProviders(20)); } -TEST(GridsampleTest, test_grid_sample_20_5D_nearest_reflection_align_corners) { +TEST(GridSampleTest, test_grid_sample_20_5D_nearest_reflection_align_corners) { OpTester test("GridSample", 20); std::string mode = "nearest"; std::string padding_mode = "reflection"; @@ -595,7 +600,7 @@ TEST(GridsampleTest, test_grid_sample_20_5D_nearest_reflection_align_corners) { RunTests(test, GetExecutionProviders(20)); } -TEST(GridsampleTest, test_grid_sample_20_4D_nearest_reflection_no_align_corners) { +TEST(GridSampleTest, test_grid_sample_20_4D_nearest_reflection_no_align_corners) { OpTester test("GridSample", 20); std::string mode = "nearest"; std::string padding_mode = "reflection"; @@ -615,7 +620,7 @@ TEST(GridsampleTest, test_grid_sample_20_4D_nearest_reflection_no_align_corners) RunTests(test, GetExecutionProviders(20)); } -TEST(GridsampleTest, test_grid_sample_20_5D_nearest_reflection_no_align_corners) { +TEST(GridSampleTest, test_grid_sample_20_5D_nearest_reflection_no_align_corners) { OpTester test("GridSample", 20); std::string mode = "nearest"; std::string padding_mode = "reflection"; @@ -635,7 +640,7 @@ TEST(GridsampleTest, test_grid_sample_20_5D_nearest_reflection_no_align_corners) RunTests(test, GetExecutionProviders(20)); } -TEST(GridsampleTest, test_grid_sample_20_4D_bilinear_zeros_align_corners) { +TEST(GridSampleTest, test_grid_sample_20_4D_bilinear_zeros_align_corners) { OpTester test("GridSample", 20); std::string mode = "linear"; std::string padding_mode = "zeros"; @@ -655,7 +660,7 @@ TEST(GridsampleTest, test_grid_sample_20_4D_bilinear_zeros_align_corners) { RunTests(test, GetExecutionProviders(20)); } -TEST(GridsampleTest, test_grid_sample_20_5D_bilinear_zeros_align_corners) { +TEST(GridSampleTest, test_grid_sample_20_5D_bilinear_zeros_align_corners) { OpTester test("GridSample", 20); std::string mode = "linear"; std::string padding_mode = "zeros"; @@ -675,7 +680,7 @@ TEST(GridsampleTest, test_grid_sample_20_5D_bilinear_zeros_align_corners) { RunTests(test, GetExecutionProviders(20)); } -TEST(GridsampleTest, test_grid_sample_20_4D_bilinear_zeros_no_align_corners) { +TEST(GridSampleTest, test_grid_sample_20_4D_bilinear_zeros_no_align_corners) { OpTester test("GridSample", 20); std::string mode = "linear"; std::string padding_mode = "zeros"; @@ -695,7 +700,7 @@ TEST(GridsampleTest, test_grid_sample_20_4D_bilinear_zeros_no_align_corners) { RunTests(test, GetExecutionProviders(20)); } -TEST(GridsampleTest, test_grid_sample_20_5D_bilinear_zeros_no_align_corners) { +TEST(GridSampleTest, 
test_grid_sample_20_5D_bilinear_zeros_no_align_corners) { OpTester test("GridSample", 20); std::string mode = "linear"; std::string padding_mode = "zeros"; @@ -715,7 +720,7 @@ TEST(GridsampleTest, test_grid_sample_20_5D_bilinear_zeros_no_align_corners) { RunTests(test, GetExecutionProviders(20)); } -TEST(GridsampleTest, test_grid_sample_20_4D_bilinear_border_align_corners) { +TEST(GridSampleTest, test_grid_sample_20_4D_bilinear_border_align_corners) { OpTester test("GridSample", 20); std::string mode = "linear"; std::string padding_mode = "border"; @@ -735,7 +740,7 @@ TEST(GridsampleTest, test_grid_sample_20_4D_bilinear_border_align_corners) { RunTests(test, GetExecutionProviders(20)); } -TEST(GridsampleTest, test_grid_sample_20_5D_bilinear_border_align_corners) { +TEST(GridSampleTest, test_grid_sample_20_5D_bilinear_border_align_corners) { OpTester test("GridSample", 20); std::string mode = "linear"; std::string padding_mode = "border"; @@ -755,7 +760,7 @@ TEST(GridsampleTest, test_grid_sample_20_5D_bilinear_border_align_corners) { RunTests(test, GetExecutionProviders(20)); } -TEST(GridsampleTest, test_grid_sample_20_4D_bilinear_border_no_align_corners) { +TEST(GridSampleTest, test_grid_sample_20_4D_bilinear_border_no_align_corners) { OpTester test("GridSample", 20); std::string mode = "linear"; std::string padding_mode = "border"; @@ -775,7 +780,7 @@ TEST(GridsampleTest, test_grid_sample_20_4D_bilinear_border_no_align_corners) { RunTests(test, GetExecutionProviders(20)); } -TEST(GridsampleTest, test_grid_sample_20_5D_bilinear_border_no_align_corners) { +TEST(GridSampleTest, test_grid_sample_20_5D_bilinear_border_no_align_corners) { OpTester test("GridSample", 20); std::string mode = "linear"; std::string padding_mode = "border"; @@ -795,7 +800,7 @@ TEST(GridsampleTest, test_grid_sample_20_5D_bilinear_border_no_align_corners) { RunTests(test, GetExecutionProviders(20)); } -TEST(GridsampleTest, test_grid_sample_20_4D_bilinear_reflection_align_corners) { +TEST(GridSampleTest, test_grid_sample_20_4D_bilinear_reflection_align_corners) { OpTester test("GridSample", 20); std::string mode = "linear"; std::string padding_mode = "reflection"; @@ -815,7 +820,7 @@ TEST(GridsampleTest, test_grid_sample_20_4D_bilinear_reflection_align_corners) { RunTests(test, GetExecutionProviders(20)); } -TEST(GridsampleTest, test_grid_sample_20_5D_bilinear_reflection_align_corners) { +TEST(GridSampleTest, test_grid_sample_20_5D_bilinear_reflection_align_corners) { OpTester test("GridSample", 20); std::string mode = "linear"; std::string padding_mode = "reflection"; @@ -835,7 +840,7 @@ TEST(GridsampleTest, test_grid_sample_20_5D_bilinear_reflection_align_corners) { RunTests(test, GetExecutionProviders(20)); } -TEST(GridsampleTest, test_grid_sample_20_4D_bilinear_reflection_no_align_corners) { +TEST(GridSampleTest, test_grid_sample_20_4D_bilinear_reflection_no_align_corners) { OpTester test("GridSample", 20); std::string mode = "linear"; std::string padding_mode = "reflection"; @@ -855,7 +860,7 @@ TEST(GridsampleTest, test_grid_sample_20_4D_bilinear_reflection_no_align_corners RunTests(test, GetExecutionProviders(20)); } -TEST(GridsampleTest, test_grid_sample_20_5D_bilinear_reflection_no_align_corners) { +TEST(GridSampleTest, test_grid_sample_20_5D_bilinear_reflection_no_align_corners) { OpTester test("GridSample", 20); std::string mode = "linear"; std::string padding_mode = "reflection"; @@ -875,7 +880,7 @@ TEST(GridsampleTest, test_grid_sample_20_5D_bilinear_reflection_no_align_corners RunTests(test, 
GetExecutionProviders(20)); } -TEST(GridsampleTest, test_grid_sample_20_4D_bicubic_zeros_align_corners) { +TEST(GridSampleTest, test_grid_sample_20_4D_bicubic_zeros_align_corners) { OpTester test("GridSample", 20); std::string mode = "cubic"; std::string padding_mode = "zeros"; @@ -895,7 +900,7 @@ TEST(GridsampleTest, test_grid_sample_20_4D_bicubic_zeros_align_corners) { RunTests(test, GetExecutionProviders(20)); } -TEST(GridsampleTest, test_grid_sample_20_4D_bicubic_zeros_no_align_corners) { +TEST(GridSampleTest, test_grid_sample_20_4D_bicubic_zeros_no_align_corners) { OpTester test("GridSample", 20); std::string mode = "cubic"; std::string padding_mode = "zeros"; @@ -915,7 +920,7 @@ TEST(GridsampleTest, test_grid_sample_20_4D_bicubic_zeros_no_align_corners) { RunTests(test, GetExecutionProviders(20)); } -TEST(GridsampleTest, test_grid_sample_20_4D_bicubic_border_align_corners) { +TEST(GridSampleTest, test_grid_sample_20_4D_bicubic_border_align_corners) { OpTester test("GridSample", 20); std::string mode = "cubic"; std::string padding_mode = "border"; @@ -935,7 +940,7 @@ TEST(GridsampleTest, test_grid_sample_20_4D_bicubic_border_align_corners) { RunTests(test, GetExecutionProviders(20)); } -TEST(GridsampleTest, test_grid_sample_20_4D_bicubic_border_no_align_corners) { +TEST(GridSampleTest, test_grid_sample_20_4D_bicubic_border_no_align_corners) { OpTester test("GridSample", 20); std::string mode = "cubic"; std::string padding_mode = "border"; @@ -955,7 +960,7 @@ TEST(GridsampleTest, test_grid_sample_20_4D_bicubic_border_no_align_corners) { RunTests(test, GetExecutionProviders(20)); } -TEST(GridsampleTest, test_grid_sample_20_4D_bicubic_reflection_align_corners) { +TEST(GridSampleTest, test_grid_sample_20_4D_bicubic_reflection_align_corners) { OpTester test("GridSample", 20); std::string mode = "cubic"; std::string padding_mode = "reflection"; @@ -975,7 +980,7 @@ TEST(GridsampleTest, test_grid_sample_20_4D_bicubic_reflection_align_corners) { RunTests(test, GetExecutionProviders(20)); } -TEST(GridsampleTest, test_grid_sample_20_4D_bicubic_reflection_no_align_corners) { +TEST(GridSampleTest, test_grid_sample_20_4D_bicubic_reflection_no_align_corners) { OpTester test("GridSample", 20); std::string mode = "cubic"; std::string padding_mode = "reflection"; diff --git a/onnxruntime/test/providers/cpu/tensor/grid_sample_test_gen.py b/onnxruntime/test/providers/cpu/tensor/grid_sample_test_gen.py index c60e55617774f..c7e263ca3f654 100644 --- a/onnxruntime/test/providers/cpu/tensor/grid_sample_test_gen.py +++ b/onnxruntime/test/providers/cpu/tensor/grid_sample_test_gen.py @@ -58,7 +58,7 @@ onnx_align_corners = 1 if align_corners else 0 test_name = f"test_grid_sample_{opset_version}_{ndim}D_{mode}_{padding_mode}_{'align_corners' if align_corners else 'no_align_corners'}" - print(f"TEST(GridsampleTest, {test_name}) {{") + print(f"TEST(GridSampleTest, {test_name}) {{") print(f'OpTester test("GridSample", {opset_version});') print(f'std::string mode = "{onnx_mode}";') print(f'std::string padding_mode = "{padding_mode}";') diff --git a/tools/ci_build/github/apple/coreml_supported_mlprogram_ops.md b/tools/ci_build/github/apple/coreml_supported_mlprogram_ops.md index c33184686c932..b65b0f64686a9 100644 --- a/tools/ci_build/github/apple/coreml_supported_mlprogram_ops.md +++ b/tools/ci_build/github/apple/coreml_supported_mlprogram_ops.md @@ -11,6 +11,7 @@ Keep in sync with doco generated from /docs/execution-providers/CoreML-Execution |ai.onnx:Gemm|Input B must be constant.| |ai.onnx:GlobalAveragePool|Only 2D 
Pool is supported currently. 3D and 5D support can be added if needed.| |ai.onnx:GlobalMaxPool|Only 2D Pool is supported currently. 3D and 5D support can be added if needed.| +|ai.onnx:GridSample|4D input.
'mode' of 'linear' or 'zeros'.
(mode==linear && padding_mode==reflection && align_corners==0) is not supported.| |ai.onnx:MatMul|Only support for transA == 0, alpha == 1.0 and beta == 1.0 is currently implemented.| |ai.onnx:MaxPool|Only 2D Pool is supported currently. 3D and 5D support can be added if needed.| |ai.onnx:Mul|| From 0274008b6baa89a907527169a888efcd58646f5b Mon Sep 17 00:00:00 2001 From: Wanming Lin Date: Wed, 24 Jul 2024 09:51:49 +0800 Subject: [PATCH 06/57] [WebNN EP] ConvTranspose should calculate the pads or output shape (#21292) This PR adds the missing pads and output shape calculation for ConvTranspose. Per ONNX spec: - If the output shape is explicitly provided, compute the pads. - Otherwise compute the output shape, as well as the pads if the auto_pad attribute is SAME_UPPER/SAME_LOWER. --- .../webnn/builders/impl/builder_utils.cc | 88 +++++++++++++++++++ .../webnn/builders/impl/builder_utils.h | 13 +++ .../webnn/builders/impl/conv_op_builder.cc | 81 ++++++----------- 3 files changed, 126 insertions(+), 56 deletions(-) diff --git a/onnxruntime/core/providers/webnn/builders/impl/builder_utils.cc b/onnxruntime/core/providers/webnn/builders/impl/builder_utils.cc index d147ffbbd181f..113cc3df5438d 100644 --- a/onnxruntime/core/providers/webnn/builders/impl/builder_utils.cc +++ b/onnxruntime/core/providers/webnn/builders/impl/builder_utils.cc @@ -68,5 +68,93 @@ common::Status HandleAutoPad(const std::vector input_shape, return Status::OK(); } +common::Status ComputeConvTransposePadAndOutputShape( + const int64_t in_size, + const int64_t stride, + const int64_t kernel, + const int64_t dilation, + const int64_t adj, + AutoPadType pad_type, + int64_t& pad_head, + int64_t& pad_tail, + int64_t& out_size) { + // Output shape is explicitly provided - pad values will have to be computed. + if (out_size != -1) { + // total pad + auto total_pad = ComputeTotalPad(in_size, stride, adj, kernel, dilation, out_size); + DistributePadding(pad_type, total_pad, pad_head, pad_tail); + return Status::OK(); + } + + // Output shape is not provided - it needs to be computed along with pad values (if applicable). + + // Compute padding if the auto_pad attribute is SAME_UPPER/SAME_LOWER. + if (pad_type == AutoPadType::SAME_UPPER || pad_type == AutoPadType::SAME_LOWER) { + // The ONNX spec says if `auto_pad` attribute is set, pad until the `out_size` + // is `in_size * stride`. + auto total_pad = ComputeTotalPad(in_size, stride, adj, + kernel, dilation, /*out_size = */ in_size * stride); + DistributePadding(pad_type, total_pad, pad_head, pad_tail); + } + + out_size = (in_size - 1) * stride + adj + (kernel - 1) * dilation + 1 - pad_head - pad_tail; + + return Status::OK(); +} + +common::Status ComputeConvTransposePadsAndOutputShape(const std::vector input_shape, + const int64_t weight_size_y, + const int64_t weight_size_x, + const std::vector& onnx_pads, + const std::vector& onnx_strides, + const std::vector& onnx_dilations, + const std::vector& onnx_output_padding, + AutoPadType auto_pad_type, + std::vector& pads_out, + std::vector& output_shape_out, + bool use_nchw) { + const int64_t input_size_y = use_nchw ? input_shape[2] : input_shape[1]; + const int64_t input_size_x = use_nchw ? 
input_shape[3] : input_shape[2]; + const int64_t stride_y = onnx_strides[0]; + const int64_t stride_x = onnx_strides[1]; + const int64_t dilation_y = onnx_dilations[0]; + const int64_t dilation_x = onnx_dilations[1]; + const int64_t output_padding_y = onnx_output_padding[0]; + const int64_t output_padding_x = onnx_output_padding[1]; + + int64_t padding_top = onnx_pads[0]; + int64_t padding_bottom = onnx_pads[2]; + int64_t padding_left = onnx_pads[1]; + int64_t padding_right = onnx_pads[3]; + int64_t output_shape_out_y = output_shape_out[0]; + int64_t output_shape_out_x = output_shape_out[1]; + ORT_RETURN_IF_ERROR(ComputeConvTransposePadAndOutputShape( + input_size_y, + stride_y, + weight_size_y, + dilation_y, + output_padding_y, + auto_pad_type, + padding_top, + padding_bottom, + output_shape_out_y)); + ORT_RETURN_IF_ERROR(ComputeConvTransposePadAndOutputShape( + input_size_x, + stride_x, + weight_size_x, + dilation_x, + output_padding_x, + auto_pad_type, + padding_left, + padding_right, + output_shape_out_x)); + + // WebNN only needs the height and width of the output shape. + output_shape_out = {output_shape_out_y, output_shape_out_x}; + pads_out = {padding_top, padding_left, padding_bottom, padding_right}; + + return Status::OK(); +} + } // namespace webnn } // namespace onnxruntime diff --git a/onnxruntime/core/providers/webnn/builders/impl/builder_utils.h b/onnxruntime/core/providers/webnn/builders/impl/builder_utils.h index cb7c3c6955664..5a156c96c4852 100644 --- a/onnxruntime/core/providers/webnn/builders/impl/builder_utils.h +++ b/onnxruntime/core/providers/webnn/builders/impl/builder_utils.h @@ -24,5 +24,18 @@ common::Status HandleAutoPad(const std::vector input_shape, std::vector& pads_out, bool use_nchw) ORT_MUST_USE_RESULT; +// Compute pads and output shape for ConvTranspose. +common::Status ComputeConvTransposePadsAndOutputShape(const std::vector input_shape, + const int64_t weight_size_y, + const int64_t weight_size_x, + const std::vector& onnx_pads, + const std::vector& onnx_strides, + const std::vector& onnx_dilations, + const std::vector& onnx_output_padding, + AutoPadType auto_pad_type, + std::vector& pads_out, + std::vector& output_shape_out, + bool use_nchw) ORT_MUST_USE_RESULT; + } // namespace webnn } // namespace onnxruntime diff --git a/onnxruntime/core/providers/webnn/builders/impl/conv_op_builder.cc b/onnxruntime/core/providers/webnn/builders/impl/conv_op_builder.cc index 847db6a9975c6..320aaa03930fd 100644 --- a/onnxruntime/core/providers/webnn/builders/impl/conv_op_builder.cc +++ b/onnxruntime/core/providers/webnn/builders/impl/conv_op_builder.cc @@ -56,72 +56,41 @@ common::Status SetConvBaseOptions(ModelBuilder& model_builder, // Add Padding. AutoPadType auto_pad_type = StringToAutoPadType(helper.Get("auto_pad", "NOTSET")); - if (node.OpType() == "Conv") { + std::vector pads_out; + if (node.OpType() == "Conv" || node.OpType() == "ConvInteger") { // Calculate explicit padding for autoPad. if (AutoPadType::SAME_UPPER == auto_pad_type || AutoPadType::SAME_LOWER == auto_pad_type) { - std::vector pads_out; ORT_RETURN_IF_ERROR(HandleAutoPad(input_shape, weight_shape[2], weight_shape[3], pads, strides, dilations, auto_pad_type, pads_out, !is_nhwc)); pads = pads_out; } } else if (node.OpType() == "ConvTranspose") { - // When the 'output_shape' is specificed, the 'output_padding' values - // in options.outputPadding are ignored. 
- std::vector dims; - std::vector output_padding{0, 0}; - if (helper.HasAttr("output_shape")) { - // Default value of 'output_shape' will be ignored as we already check if it existed. - dims = helper.Get("output_shape", std::vector{-1, -1}); - // Extract the height and width. - std::vector output_shape; - if (dims.size() == 1 && is_conv1d) { // ConvTranspose 1d - output_shape = {dims[0], 1}; - } else if (dims.size() == 2 && !is_conv1d) { - output_shape = dims; - } else { - return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "Invalid output shape"); - } - // Padding values are auto generated. - if (helper.HasAttr("kernel_shape")) { - std::vector kernel_shape = helper.Get("kernel_shape", std::vector{-1, -1}); - if (is_conv1d) { // ConvTranspose 1d - kernel_shape.push_back(1); - } - std::vector total_padding(2); - for (size_t i = 0; i < 2; i++) { - // Get the dimensions of H and W. - // For NHWC layout, the dimensions of H and W correspond to index 1 and 2. - // For NCHW layout, the dimensions of H and W correspond to index 2 and 3. - if (is_nhwc) { - total_padding[i] = strides[i] * (input_shape[i + 1] - 1) + output_padding[i] + - ((kernel_shape[i] - 1) * dilations[i] + 1) - output_shape[i]; - } else { - total_padding[i] = strides[i] * (input_shape[i + 2] - 1) + output_padding[i] + - ((kernel_shape[i] - 1) * dilations[i] + 1) - output_shape[i]; - } - } - AutoPadType auto_pad_type = StringToAutoPadType(helper.Get("auto_pad", "NOTSET")); - if (AutoPadType::SAME_UPPER == auto_pad_type || AutoPadType::SAME_LOWER == auto_pad_type) { - pads[0] = total_padding[0] / 2; - pads[1] = total_padding[0] - pads[0]; - pads[2] = total_padding[1] / 2; - pads[3] = total_padding[1] - pads[2]; - if (AutoPadType::SAME_LOWER == auto_pad_type) { - std::swap(pads[0], pads[1]); - std::swap(pads[2], pads[3]); - } - } - } + std::vector output_shape = helper.Get("output_shape", std::vector{-1, -1}); + // Appending 1's if it is ConvTranspose 1d and output shape is provided. + if (output_shape.size() == 1 && is_conv1d && output_shape[0] != -1) { + output_shape.push_back(1); + } + + std::vector output_padding = helper.Get("output_padding", std::vector{0, 0}); + // Appending 0's if it is ConvTranspose 1d. + if (output_padding.size() == 1 && is_conv1d) { + output_padding.push_back(0); + } + options.set("outputPadding", emscripten::val::array(GetVecUint32FromVecInt64(output_padding))); + + // If output shape is explicitly provided, compute the pads. + // Otherwise compute the output shape, as well as the pads if the auto_pad attribute is SAME_UPPER/SAME_LOWER. 
+ ORT_RETURN_IF_ERROR(ComputeConvTransposePadsAndOutputShape(input_shape, weight_shape[2], weight_shape[3], + pads, strides, dilations, output_padding, + auto_pad_type, pads_out, output_shape, !is_nhwc)); + + if (output_shape[0] != -1 && output_shape[1] != -1) { options.set("outputSizes", emscripten::val::array(GetVecUint32FromVecInt64(output_shape))); - } else { - output_padding = helper.Get("output_padding", std::vector{0, 0}); - if (output_padding.size() == 1 && is_conv1d) { // ConvTranspose 1d - output_padding.push_back(0); - } - options.set("outputPadding", emscripten::val::array(GetVecUint32FromVecInt64(output_padding))); } + pads = pads_out; } else { - return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "conv_op_builder only supports Op Conv and ConvTranspose."); + return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, + "conv_op_builder only supports Op Conv, ConvInteger and ConvTranspose."); } const auto group = helper.Get("group", static_cast(1)); From 6794dfd9412ec66977aa4ee01b7ea01a6dfa7296 Mon Sep 17 00:00:00 2001 From: Chester Liu <4710575+skyline75489@users.noreply.github.com> Date: Wed, 24 Jul 2024 13:41:09 +0800 Subject: [PATCH 07/57] [QNN EP] Improve QNN error reporting using the error message (#21458) ### Description Massively improve the QNN error reporting by invoking `QnnError_getMessage` and returning the error message. ### Motivation and Context Example error message before this change: ```text QNN SetupBackend failed Failed to create device. Error: 14001 ``` After: ```text QNN SetupBackend failed Failed to create device. Error: QNN_DEVICE_ERROR_INVALID_CONFIG: Invalid config values ``` --- .../qnn/builder/qnn_backend_manager.cc | 55 +++++++++++-------- .../qnn/builder/qnn_backend_manager.h | 1 + 2 files changed, 33 insertions(+), 23 deletions(-) diff --git a/onnxruntime/core/providers/qnn/builder/qnn_backend_manager.cc b/onnxruntime/core/providers/qnn/builder/qnn_backend_manager.cc index f44efb1eba6db..0005869f13f66 100644 --- a/onnxruntime/core/providers/qnn/builder/qnn_backend_manager.cc +++ b/onnxruntime/core/providers/qnn/builder/qnn_backend_manager.cc @@ -279,7 +279,7 @@ Status QnnBackendManager::InitializeQnnLog() { } } - ORT_RETURN_IF(QNN_BACKEND_NO_ERROR != result, "Failed to initialize logging in the QNN backend"); + ORT_RETURN_IF(QNN_BACKEND_NO_ERROR != result, "Failed to initialize logging in the QNN backend. Error: ", QnnErrorHandleToString(result)); return Status::OK(); } @@ -320,7 +320,7 @@ Status QnnBackendManager::UpdateQnnLogLevel(logging::Severity ort_log_level) { LOGS(*logger_, ERROR) << "Invalid log handle provided to QnnLog_setLogLevel."; } } - ORT_RETURN_IF(QNN_BACKEND_NO_ERROR != result, "Failed to set log level in Qnn backend"); + ORT_RETURN_IF(QNN_BACKEND_NO_ERROR != result, "Failed to set log level in Qnn backend. Error: ", QnnErrorHandleToString(result)); return Status::OK(); } @@ -330,8 +330,8 @@ Status QnnBackendManager::InitializeBackend() { return Status::OK(); } - auto result = qnn_interface_.backendCreate(log_handle_, (const QnnBackend_Config_t**)backend_config_, &backend_handle_); - ORT_RETURN_IF(QNN_BACKEND_NO_ERROR != result, "Failed to initialize backend"); + Qnn_ErrorHandle_t result = qnn_interface_.backendCreate(log_handle_, (const QnnBackend_Config_t**)backend_config_, &backend_handle_); + ORT_RETURN_IF(QNN_BACKEND_NO_ERROR != result, "Failed to initialize backend. 
Error: ", QnnErrorHandleToString(result)); backend_initialized_ = true; return Status::OK(); @@ -406,9 +406,9 @@ Status QnnBackendManager::CreateDevice() { LOGS_DEFAULT(INFO) << "Create device."; if (nullptr != qnn_interface_.deviceCreate) { - auto result = qnn_interface_.deviceCreate(log_handle_, device_configs_builder.GetQnnConfigs(), &device_handle_); + Qnn_ErrorHandle_t result = qnn_interface_.deviceCreate(log_handle_, device_configs_builder.GetQnnConfigs(), &device_handle_); if (QNN_SUCCESS != result) { - return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "Failed to create device. Error: ", result); + return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "Failed to create device. Error: ", QnnErrorHandleToString(result)); } } device_created_ = true; @@ -422,9 +422,9 @@ Status QnnBackendManager::ReleaseDevice() { } if (nullptr != qnn_interface_.deviceFree) { - auto result = qnn_interface_.deviceFree(device_handle_); + Qnn_ErrorHandle_t result = qnn_interface_.deviceFree(device_handle_); if (QNN_SUCCESS != result) { - return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "Failed to release device. Error: ", result); + return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "Failed to release device. Error: ", QnnErrorHandleToString(result)); } } @@ -451,8 +451,8 @@ Status QnnBackendManager::InitializeProfiling() { } else if (ProfilingLevel::DETAILED == profiling_level_merge_) { qnn_profile_level = QNN_PROFILE_LEVEL_DETAILED; } - auto result = qnn_interface_.profileCreate(backend_handle_, qnn_profile_level, &profile_backend_handle_); - ORT_RETURN_IF(QNN_PROFILE_NO_ERROR != result, "Failed to create QNN profile!"); + Qnn_ErrorHandle_t result = qnn_interface_.profileCreate(backend_handle_, qnn_profile_level, &profile_backend_handle_); + ORT_RETURN_IF(QNN_PROFILE_NO_ERROR != result, "Failed to create QNN profile! Error: ", QnnErrorHandleToString(result)); return Status::OK(); } @@ -525,13 +525,13 @@ Status QnnBackendManager::CreateContext() { const QnnContext_Config_t* context_configs[] = {&qnn_context_config, nullptr}; Qnn_ContextHandle_t context = nullptr; - auto result = qnn_interface_.contextCreate(backend_handle_, - device_handle_, - context_configs, - &context); + Qnn_ErrorHandle_t result = qnn_interface_.contextCreate(backend_handle_, + device_handle_, + context_configs, + &context); contexts_.push_back(context); - ORT_RETURN_IF(QNN_CONTEXT_NO_ERROR != result, "Failed to create context."); + ORT_RETURN_IF(QNN_CONTEXT_NO_ERROR != result, "Failed to create context. Error: ", QnnErrorHandleToString(result)); context_created_ = true; return Status::OK(); @@ -544,7 +544,7 @@ Status QnnBackendManager::ReleaseContext() { bool failed = false; for (auto context : contexts_) { - auto result = qnn_interface_.contextFree(context, nullptr); + Qnn_ErrorHandle_t result = qnn_interface_.contextFree(context, nullptr); if (QNN_CONTEXT_NO_ERROR != result) { failed = true; } @@ -566,7 +566,7 @@ std::unique_ptr QnnBackendManager::GetContextBinaryBuffer(uint6 // Generate all graphs in one single context Qnn_ErrorHandle_t rt = qnn_interface_.contextGetBinarySize(contexts_[0], &required_buffer_size); if (QNN_CONTEXT_NO_ERROR != rt) { - LOGS(*logger_, ERROR) << "Failed to get QNN context binary size. Error code: " << rt; + LOGS(*logger_, ERROR) << "Failed to get QNN context binary size. 
Error: " << QnnErrorHandleToString(rt); return nullptr; } @@ -581,7 +581,7 @@ std::unique_ptr QnnBackendManager::GetContextBinaryBuffer(uint6 required_buffer_size, &written_buffer_size); if (QNN_CONTEXT_NO_ERROR != rt) { - LOGS(*logger_, ERROR) << "Failed to get context binary."; + LOGS(*logger_, ERROR) << "Failed to get context binary. Error: " << QnnErrorHandleToString(rt); return nullptr; } @@ -1014,8 +1014,8 @@ Status QnnBackendManager::ExtractBackendProfilingInfo() { const QnnProfile_EventId_t* profile_events{nullptr}; uint32_t num_events{0}; - auto result = qnn_interface_.profileGetEvents(profile_backend_handle_, &profile_events, &num_events); - ORT_RETURN_IF(QNN_PROFILE_NO_ERROR != result, "Failed to get profile events."); + Qnn_ErrorHandle_t result = qnn_interface_.profileGetEvents(profile_backend_handle_, &profile_events, &num_events); + ORT_RETURN_IF(QNN_PROFILE_NO_ERROR != result, "Failed to get profile events. Error: ", QnnErrorHandleToString(result)); if (num_events > 0) { LOGS(*logger_, VERBOSE) << "profile_events: " << profile_events << " num_events: " << num_events; @@ -1073,8 +1073,8 @@ Status QnnBackendManager::ExtractProfilingSubEvents( bool tracelogging_provider_ep_enabled) { const QnnProfile_EventId_t* profile_sub_events{nullptr}; uint32_t num_sub_events{0}; - auto result = qnn_interface_.profileGetSubEvents(profile_event_id, &profile_sub_events, &num_sub_events); - ORT_RETURN_IF(QNN_PROFILE_NO_ERROR != result, "Failed to get profile sub events."); + Qnn_ErrorHandle_t result = qnn_interface_.profileGetSubEvents(profile_event_id, &profile_sub_events, &num_sub_events); + ORT_RETURN_IF(QNN_PROFILE_NO_ERROR != result, "Failed to get profile sub events. Error: ", QnnErrorHandleToString(result)); if (num_sub_events > 0) { LOGS(*logger_, VERBOSE) << "profile_sub_events: " << profile_sub_events << " num_sub_events: " << num_sub_events; @@ -1113,7 +1113,7 @@ Status QnnBackendManager::ExtractProfilingEventBasic( std::ofstream& outfile, bool tracelogging_provider_ep_enabled) { QnnProfile_EventData_t event_data; - auto result = qnn_interface_.profileGetEventData(profile_event_id, &event_data); + Qnn_ErrorHandle_t result = qnn_interface_.profileGetEventData(profile_event_id, &event_data); QnnProfile_Error_t errorCode = static_cast(result & 0xFFFF); ORT_RETURN_IF(QNN_PROFILE_NO_ERROR != result, "Failed to get profile event data: " + std::string(QnnProfileErrorToString(errorCode))); @@ -1293,6 +1293,15 @@ const char* QnnBackendManager::QnnProfileErrorToString(QnnProfile_Error_t error) } } +const char* QnnBackendManager::QnnErrorHandleToString(Qnn_ErrorHandle_t error) { + // From QNN SDK: The memory is statically owned and should not be freed by the caller. 
+ const char* error_msg = nullptr; + if (QNN_SUCCESS == qnn_interface_.errorGetMessage(error, &error_msg)) { + return error_msg; + } + return "Unknown"; +} + const std::string QnnBackendManager::ExtractQnnScalarValue(const Qnn_Scalar_t& scalar) { switch (scalar.dataType) { case QNN_DATATYPE_INT_8: diff --git a/onnxruntime/core/providers/qnn/builder/qnn_backend_manager.h b/onnxruntime/core/providers/qnn/builder/qnn_backend_manager.h index d51e547aeb2fb..a4811b2cb6db3 100644 --- a/onnxruntime/core/providers/qnn/builder/qnn_backend_manager.h +++ b/onnxruntime/core/providers/qnn/builder/qnn_backend_manager.h @@ -216,6 +216,7 @@ class QnnBackendManager { static const std::string GetEventTypeString(QnnProfile_EventType_t eventType); static const std::string ExtractQnnScalarValue(const Qnn_Scalar_t& scalar); const char* QnnProfileErrorToString(QnnProfile_Error_t error); + const char* QnnErrorHandleToString(Qnn_ErrorHandle_t error); QnnLog_Level_t MapOrtSeverityToQNNLogLevel(logging::Severity ort_log_level); #ifdef _WIN32 void LogQnnProfileEventAsTraceLogging( From 2580d935cbecd756cef435fb173a2f10237e9d2a Mon Sep 17 00:00:00 2001 From: Scott McKay Date: Wed, 24 Jul 2024 16:08:20 +1000 Subject: [PATCH 08/57] CoreML: Add ML Program ConvTranspose (#21416) ### Description Add ML Program ConvTranspose - some limitations to simplify the implementation for now - some limitations due to flaky CoreML output Added support for non-contiguous MLMultiArray output as we see that with some unit tests when the CPU-only flag is not set (e.g. innermost dim has min size of 16 but test output only has 8 values). - support only one non-contiguous dim to keep it simple - manually tested as we don't have a setup that can test objective-c code - test code is in model.mm and can be enabled via ifdef if we need to validate any future changes ### Motivation and Context Address operator gaps in high priority model. 
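For reference, here is a minimal standalone sketch (illustrative only; the helper name and float-only signature are assumptions, not the code added in this PR) of the block/stride copy pattern used for an MLMultiArray with one non-contiguous dimension: the data is treated as `num_blocks` contiguous blocks of `block_size` elements, with consecutive blocks `stride` elements apart in the source.

```cpp
#include <cstdint>
#include <cstring>

// Copy a source buffer with at most one non-contiguous dimension into a packed destination.
// A fully contiguous source is the special case num_blocks == 1 and block_size == element count.
void CopyBlocks(const float* src, float* dst,
                int64_t num_blocks, int64_t block_size, int64_t stride) {
  for (int64_t i = 0; i < num_blocks; ++i) {
    std::memcpy(dst, src, block_size * sizeof(float));
    src += stride;      // skip the gap between blocks in the non-contiguous source
    dst += block_size;  // destination is packed back-to-back
  }
}
```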
--------- Co-authored-by: Edward Chen <18449977+edgchen1@users.noreply.github.com> --- cmake/onnxruntime_unittests.cmake | 5 +- .../builders/impl/convtranspose_op_builder.cc | 218 ++++++++++++++++++ .../coreml/builders/impl/resize_op_builder.cc | 4 +- .../coreml/builders/op_builder_factory.cc | 164 ++++--------- .../coreml/builders/op_builder_factory.h | 1 + .../core/providers/coreml/model/model.h | 13 ++ .../core/providers/coreml/model/model.mm | 140 ++++++++--- .../builders/impl/resize_op_builder.cc | 4 +- onnxruntime/core/providers/utils.cc | 2 +- onnxruntime/core/providers/utils.h | 2 +- .../providers/xnnpack/nn/conv_transpose.cc | 2 +- .../core/providers/xnnpack/tensor/resize.cc | 4 +- .../test/providers/coreml/utils_test.mm | 108 +++++++++ .../cpu/nn/conv_transpose_op_test.cc | 8 +- .../apple/coreml_supported_mlprogram_ops.md | 1 + 15 files changed, 511 insertions(+), 165 deletions(-) create mode 100644 onnxruntime/core/providers/coreml/builders/impl/convtranspose_op_builder.cc create mode 100644 onnxruntime/test/providers/coreml/utils_test.mm diff --git a/cmake/onnxruntime_unittests.cmake b/cmake/onnxruntime_unittests.cmake index 38ed0b1640192..0c1e5e93c6844 100644 --- a/cmake/onnxruntime_unittests.cmake +++ b/cmake/onnxruntime_unittests.cmake @@ -679,7 +679,10 @@ if(onnxruntime_USE_RKNPU) endif() if(onnxruntime_USE_COREML) - list(APPEND onnxruntime_test_framework_src_patterns ${TEST_SRC_DIR}/providers/coreml/*) + list(APPEND onnxruntime_test_framework_src_patterns ${TEST_SRC_DIR}/providers/coreml/*.cc) + if(APPLE) + list(APPEND onnxruntime_test_framework_src_patterns ${TEST_SRC_DIR}/providers/coreml/*.mm) + endif() list(APPEND onnxruntime_test_framework_libs onnxruntime_providers_coreml coreml_proto) list(APPEND onnxruntime_test_providers_dependencies onnxruntime_providers_coreml coreml_proto) list(APPEND onnxruntime_test_providers_libs onnxruntime_providers_coreml coreml_proto) diff --git a/onnxruntime/core/providers/coreml/builders/impl/convtranspose_op_builder.cc b/onnxruntime/core/providers/coreml/builders/impl/convtranspose_op_builder.cc new file mode 100644 index 0000000000000..5b6d9d72ab3c9 --- /dev/null +++ b/onnxruntime/core/providers/coreml/builders/impl/convtranspose_op_builder.cc @@ -0,0 +1,218 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. 
+ +#include "core/providers/common.h" +#include "core/providers/coreml/builders/helper.h" +#include "core/providers/coreml/builders/impl/base_op_builder.h" +#include "core/providers/coreml/builders/impl/builder_utils.h" +#include "core/providers/coreml/builders/model_builder.h" +#include "core/providers/coreml/builders/op_builder_factory.h" +#include "core/providers/coreml/shape_utils.h" +#include "core/providers/shared/utils/utils.h" + +using namespace CoreML::Specification; + +namespace onnxruntime { +namespace coreml { + +class ConvTransposeOpBuilder : public BaseOpBuilder { + Status AddToModelBuilderImpl(ModelBuilder& model_builder, const Node& node, + const logging::Logger& logger) const override; + + bool IsOpSupportedImpl(const Node& /* node */, const OpBuilderInputParams& /* input_params */, + const logging::Logger& /* logger */) const override; + + bool SupportsMLProgram() const override { return true; } +}; + +Status ConvTransposeOpBuilder::AddToModelBuilderImpl([[maybe_unused]] ModelBuilder& model_builder, + [[maybe_unused]] const Node& node, + const logging::Logger& /*logger*/) const { +#if defined(COREML_ENABLE_MLPROGRAM) + using namespace CoreML::Specification::MILSpec; // NOLINT + const auto input_defs = node.InputDefs(); + const auto output_defs = node.OutputDefs(); + const auto& input_name = input_defs[0]->Name(); + + NodeAttrHelper helper(node); + + // https://apple.github.io/coremltools/source/coremltools.converters.mil.mil.ops.defs.html#coremltools.converters.mil.mil.ops.defs.iOS15.conv.conv_transpose + std::unique_ptr op = model_builder.CreateOperation(node, "conv_transpose"); + const auto& op_type = op->type(); + + AddOperationInput(*op, "x", input_name); + AddOperationInput(*op, "weight", input_defs[1]->Name()); + + if (input_defs.size() > 2) { + AddOperationInput(*op, "bias", input_defs[2]->Name()); + } + + // we know this input has a valid shape due to the check in IsOpSupportedImpl. ignore N and C dims. + const auto num_spatial_dims = input_defs[1]->Shape()->dim_size() - 2; + + // Spec says strides/dilations/pads are optional but reality is they're required for at least the iOS15 target + // which is CoreML5. Due to that we just add everything for simplicity. + const auto strides = helper.Get("strides", std::vector(num_spatial_dims, 1)); + const auto dilations = helper.Get("dilations", std::vector(num_spatial_dims, 1)); + + AddOperationInput(*op, "strides", model_builder.AddConstant(op_type, "strides", strides)); + AddOperationInput(*op, "dilations", model_builder.AddConstant(op_type, "dilations", dilations)); + + const std::optional groups = helper.GetInt64("group"); + if (groups) { + AddOperationInput(*op, "groups", model_builder.AddScalarConstant(op_type, "groups", *groups)); + } + + // if we can enable output_shape, this code works. see IsOpSupportedImpl for the reason it's disabled. 
+ // const auto output_shape = helper.GetInt64s("output_shape"); + // if (output_shape) { + // AddOperationInput(*op, "output_shape", model_builder.AddConstant(op_type, "output_shape", *output_shape)); + // // these are required despite the spec saying otherwise + // AddOperationInput(*op, "pad_type", model_builder.AddScalarConstant(op_type, "pad_type", std::string("valid"))); + // std::vector pads(num_spatial_dims * 2, 0); + // AddOperationInput(*op, "pad", model_builder.AddConstant(op_type, "pad", pads)); + //} else { + // AddPadTypeAndPads(*op, model_builder, op_type, helper, num_spatial_dims); + //} + + AddPadTypeAndPads(*op, model_builder, op_type, helper, num_spatial_dims); + + AddOperationOutput(*op, *output_defs[0]); + + model_builder.AddOperation(std::move(op)); +#endif // defined(COREML_ENABLE_MLPROGRAM) + + return Status::OK(); +} + +bool ConvTransposeOpBuilder::IsOpSupportedImpl(const Node& node, const OpBuilderInputParams& input_params, + const logging::Logger& logger) const { + if (!input_params.create_mlprogram) { + LOGS(logger, VERBOSE) << "ConvTranspose: ML Program required"; + return false; + } + + // ML Program + // - const weight until CoreML7 (iOS17) + // - require constant for now as non-const would be unusual and we rely on the shape of W to be known to validate + // the kernel_shape can be used + // - const bias + // - const pad + // - if auto_pad is same_upper or same_lower the output[i] - (input[i] * strides[i]) must be divisible by 2 + // as the pads must be equally split as there's no upper/lower option in CoreML + // - punting on supporting this for now + // - must be symmetric for CoreML to do the right thing + // - const strides/dilations/groups + // - output_shape CoreML output is inconsistent so disabled for now + // + // NOTE: need to test with/without the COREML_FLAG_USE_CPU_ONLY flag being set to get an idea of how flaky the CoreML + // behavior is. + // Update /onnxruntime/test/util/default_providers.cc:DefaultCoreMLExecutionProvider to do so + + const auto& input_defs = node.InputDefs(); + + std::vector input_shape; + if (!GetShape(*input_defs[0], input_shape, logger)) { + // requires the rank at least to be known + LOGS(logger, VERBOSE) << "ConvTranspose: failed to get input shape"; + return false; + } + + // for simplicity require weight to be constant + const auto& weight_arg = *input_defs[1]; + const auto& weight_name = input_defs[1]->Name(); + const auto* weight = input_params.graph_viewer.GetConstantInitializer(weight_name); + if (!weight) { + LOGS(logger, VERBOSE) << "ConvTranspose: weight must be constant"; + return false; + } + + if (input_defs.size() > 2 && !input_params.graph_viewer.GetConstantInitializer(input_defs[2]->Name())) { + LOGS(logger, VERBOSE) << "ConvTranspose: bias must be constant"; + return false; + } + + std::vector weight_shape; + if (!GetShape(weight_arg, weight_shape, logger)) { + // impossible as it's a constant initializer + LOGS(logger, VERBOSE) << "ConvTranspose: failed to get weight shape"; + return false; + } + + int64_t num_spatial_dims = narrow(weight_shape.size()) - 2; + + NodeAttrHelper helper(node); + + // Punt on SAME_UPPER/SAME_LOWER for now. + // We could infer that 'same' -> 'same_upper' based on the CoreML conv spec having 'same' and 'same_lower' but + // need to validate that assertion. + // Additionally, if the pads size is equal, there's no difference between same_upper and same_lower. + // To do that we'd need the 'output_shape' attribute to check against. + // Can add this handling if/when needed. 
+ auto autopad = StringToAutoPadType(helper.Get("auto_pad", "NOTSET")); + if (autopad == AutoPadType::SAME_LOWER || autopad == AutoPadType::SAME_UPPER) { + LOGS(logger, VERBOSE) << "ConvTranspose: support for SAME_LOWER/SAME_UPPER is not implemented yet"; + return false; + } else if (autopad == AutoPadType::NOTSET) { + // CoreML output is inconsistent between CPU_ONLY and ALL if the pads aren't all the same value. + // CPU matches the expected output, but other devices don't seem to (at least on macOS). + auto onnx_pads = *helper.GetInt64s("pads"); // 'pads' are required if auto_pad is NOTSET + const auto pad_value = onnx_pads[0]; + if (!std::all_of(onnx_pads.begin() + 1, onnx_pads.end(), + [pad_value](auto value) { return value == pad_value; })) { + LOGS(logger, VERBOSE) << "ConvTranspose: all pad values must be the same for CoreML to return " + "consistent results"; + return false; + } + } + + // there's no input to specify a kernel shape in CoreML. + // it's OK if a specified kernel_shape matches kH and kW dims of the weight input. + auto kernel_shape = helper.GetInt64s("kernel_shape"); + if (kernel_shape) { + bool valid = true; + + if (static_cast(kernel_shape->size()) == num_spatial_dims) { + for (int i = 0; i < num_spatial_dims; ++i) { + // check the specified kernel shape matches the weight shape. skip the initial N and C dims in the latter. + if ((*kernel_shape)[i] != weight_shape[i + 2]) { + valid = false; + break; + } + } + } else { + valid = false; + } + + if (!valid) { + LOGS(logger, VERBOSE) << "ConvTranspose: kernel_shape attribute does not match the weight shape"; + return false; + } + } + + // In theory this can be supported, but running with COREML_FLAG_USE_CPU_ONLY produces output that doesn't match + // ONNX. Running without that flag produces the expected output. Madness... + auto output_shape = helper.GetInt64s("output_shape"); + if (output_shape) { + LOGS(logger, VERBOSE) << "ConvTranspose: output_shape is not supported as the CoreML output is inconsistent"; + return false; + } + + // output_padding, if specified, must be the default value of all zeros as there's no equivalent in CoreML. 
+ auto output_padding = helper.GetInt64s("output_padding"); + if (output_padding && + std::any_of(output_padding->begin(), output_padding->end(), [](auto value) { return value != 0; })) { + LOGS(logger, VERBOSE) << "ConvTranspose: output_padding is not supported"; + return false; + } + + return true; +} + +void CreateConvTransposeOpBuilder(const std::string& op_type, OpBuilderRegistrations& op_registrations) { + op_registrations.builders.push_back(std::make_unique()); + op_registrations.op_builder_map.emplace(op_type, op_registrations.builders.back().get()); +} + +} // namespace coreml +} // namespace onnxruntime diff --git a/onnxruntime/core/providers/coreml/builders/impl/resize_op_builder.cc b/onnxruntime/core/providers/coreml/builders/impl/resize_op_builder.cc index 65b5c17f2c6a6..7ff66e4a79e37 100644 --- a/onnxruntime/core/providers/coreml/builders/impl/resize_op_builder.cc +++ b/onnxruntime/core/providers/coreml/builders/impl/resize_op_builder.cc @@ -427,13 +427,13 @@ bool ResizeOpBuilder::IsOpSupportedImpl(const Node& node, const OpBuilderInputPa auto h_in = input_shape[input_rank - 2]; auto w_in = input_shape[input_rank - 1]; - if (!utils::IsScalingByAFactorOfN(h_in, scale_h)) { + if (!utils::ReciprocalIsAFactorOfN(h_in, scale_h)) { LOGS(logger, VERBOSE) << "Resize: downsampling scale " << scale_h << " is not a factor of input height: " << h_in; return false; } - if (!utils::IsScalingByAFactorOfN(w_in, scale_w)) { + if (!utils::ReciprocalIsAFactorOfN(w_in, scale_w)) { LOGS(logger, VERBOSE) << "Resize: downsampling scale " << scale_w << " is not a factor of input width: " << w_in; return false; diff --git a/onnxruntime/core/providers/coreml/builders/op_builder_factory.cc b/onnxruntime/core/providers/coreml/builders/op_builder_factory.cc index b17827f8e0532..535712f096010 100644 --- a/onnxruntime/core/providers/coreml/builders/op_builder_factory.cc +++ b/onnxruntime/core/providers/coreml/builders/op_builder_factory.cc @@ -15,120 +15,56 @@ namespace coreml { static OpBuilderRegistrations CreateOpBuilderRegistrations() { OpBuilderRegistrations op_registrations; - { // Add/Mul/Pow/Sub/Div - CreateBinaryOpBuilder("Add", op_registrations); - CreateBinaryOpBuilder("Mul", op_registrations); - CreateBinaryOpBuilder("Pow", op_registrations); - CreateBinaryOpBuilder("Sub", op_registrations); - CreateBinaryOpBuilder("Div", op_registrations); - } - - { // Activations - CreateActivationOpBuilder("Sigmoid", op_registrations); - CreateActivationOpBuilder("Tanh", op_registrations); - CreateActivationOpBuilder("Relu", op_registrations); - CreateActivationOpBuilder("PRelu", op_registrations); - CreateActivationOpBuilder("LeakyRelu", op_registrations); - } - - { // Transpose - CreateTransposeOpBuilder("Transpose", op_registrations); - } - - { // Conv - CreateConvOpBuilder("Conv", op_registrations); - } - - { // Batch Normalization - CreateBatchNormalizationOpBuilder("BatchNormalization", op_registrations); - } - - { // Reshape - CreateReshapeOpBuilder("Reshape", op_registrations); - } - - { // DepthToSpace - CreateDepthToSpaceOpBuilder("DepthToSpace", op_registrations); - } - - { // Pool - CreatePoolOpBuilder("GlobalAveragePool", op_registrations); - CreatePoolOpBuilder("GlobalMaxPool", op_registrations); - CreatePoolOpBuilder("AveragePool", op_registrations); - CreatePoolOpBuilder("MaxPool", op_registrations); - } - - { // Concat - CreateConcatOpBuilder("Concat", op_registrations); - } - - { // Resize - CreateResizeOpBuilder("Resize", op_registrations); - } - - { // Gemm/MatMul - 
CreateGemmOpBuilder("Gemm", op_registrations); - CreateGemmOpBuilder("MatMul", op_registrations); - } - - { // Clip - CreateClipOpBuilder("Clip", op_registrations); - } - - { // Squeeze - CreateSqueezeOpBuilder("Squeeze", op_registrations); - } - - { // ArgMax - CreateArgMaxOpBuilder("ArgMax", op_registrations); - } - - { // Cast - CreateCastOpBuilder("Cast", op_registrations); - } - - { // Flatten - CreateFlattenOpBuilder("Flatten", op_registrations); - } - - { // LRN - CreateLRNOpBuilder("LRN", op_registrations); - } - - { // Pad - CreatePadOpBuilder("Pad", op_registrations); - } - - { // Unary - CreateUnaryOpBuilder("Sqrt", op_registrations); - CreateUnaryOpBuilder("Reciprocal", op_registrations); - } - - { // Reduction - // ReduceMean is used in layer normalization which seems to be problematic in Python tests. - CreateReductionOpBuilder("ReduceMean", op_registrations); - CreateReductionOpBuilder("ReduceSum", op_registrations); - } - - { // Shape - CreateShapeOpBuilder("Shape", op_registrations); - } - - { // Gather - CreateGatherOpBuilder("Gather", op_registrations); - } - - { // Slice - CreateSliceOpBuilder("Slice", op_registrations); - } - - { // Softmax - CreateSoftmaxOpBuilder("Softmax", op_registrations); - } - - { // Split - CreateSplitOpBuilder("Split", op_registrations); - } + // Unary ops + CreateUnaryOpBuilder("Sqrt", op_registrations); + CreateUnaryOpBuilder("Reciprocal", op_registrations); + + // Binary elementwise ops + CreateBinaryOpBuilder("Add", op_registrations); + CreateBinaryOpBuilder("Mul", op_registrations); + CreateBinaryOpBuilder("Pow", op_registrations); + CreateBinaryOpBuilder("Sub", op_registrations); + CreateBinaryOpBuilder("Div", op_registrations); + + // Activations + CreateActivationOpBuilder("Sigmoid", op_registrations); + CreateActivationOpBuilder("Tanh", op_registrations); + CreateActivationOpBuilder("Relu", op_registrations); + CreateActivationOpBuilder("PRelu", op_registrations); + CreateActivationOpBuilder("LeakyRelu", op_registrations); + + // Pooling ops + CreatePoolOpBuilder("GlobalAveragePool", op_registrations); + CreatePoolOpBuilder("GlobalMaxPool", op_registrations); + CreatePoolOpBuilder("AveragePool", op_registrations); + CreatePoolOpBuilder("MaxPool", op_registrations); + + // Reduction ops + CreateReductionOpBuilder("ReduceMean", op_registrations); + CreateReductionOpBuilder("ReduceSum", op_registrations); + + CreateArgMaxOpBuilder("ArgMax", op_registrations); + CreateBatchNormalizationOpBuilder("BatchNormalization", op_registrations); + CreateCastOpBuilder("Cast", op_registrations); + CreateClipOpBuilder("Clip", op_registrations); + CreateConcatOpBuilder("Concat", op_registrations); + CreateConvOpBuilder("Conv", op_registrations); + CreateConvTransposeOpBuilder("ConvTranspose", op_registrations); + CreateDepthToSpaceOpBuilder("DepthToSpace", op_registrations); + CreateFlattenOpBuilder("Flatten", op_registrations); + CreateGatherOpBuilder("Gather", op_registrations); + CreateGemmOpBuilder("Gemm", op_registrations); + CreateLRNOpBuilder("LRN", op_registrations); + CreateGemmOpBuilder("MatMul", op_registrations); + CreatePadOpBuilder("Pad", op_registrations); + CreateReshapeOpBuilder("Reshape", op_registrations); + CreateResizeOpBuilder("Resize", op_registrations); + CreateShapeOpBuilder("Shape", op_registrations); + CreateSliceOpBuilder("Slice", op_registrations); + CreateSplitOpBuilder("Split", op_registrations); + CreateSoftmaxOpBuilder("Softmax", op_registrations); + CreateSqueezeOpBuilder("Squeeze", op_registrations); + 
CreateTransposeOpBuilder("Transpose", op_registrations); CreateGridSampleOpBuilder("GridSample", op_registrations); diff --git a/onnxruntime/core/providers/coreml/builders/op_builder_factory.h b/onnxruntime/core/providers/coreml/builders/op_builder_factory.h index a9a8ab90b0863..1990fb6400ce1 100644 --- a/onnxruntime/core/providers/coreml/builders/op_builder_factory.h +++ b/onnxruntime/core/providers/coreml/builders/op_builder_factory.h @@ -24,6 +24,7 @@ void CreateCastOpBuilder(const std::string& op_type, OpBuilderRegistrations& op_ void CreateClipOpBuilder(const std::string& op_type, OpBuilderRegistrations& op_registrations); void CreateConcatOpBuilder(const std::string& op_type, OpBuilderRegistrations& op_registrations); void CreateConvOpBuilder(const std::string& op_type, OpBuilderRegistrations& op_registrations); +void CreateConvTransposeOpBuilder(const std::string& op_type, OpBuilderRegistrations& op_registrations); void CreateDepthToSpaceOpBuilder(const std::string& op_type, OpBuilderRegistrations& op_registrations); void CreateFlattenOpBuilder(const std::string& op_type, OpBuilderRegistrations& op_registrations); void CreateGatherOpBuilder(const std::string& op_type, OpBuilderRegistrations& op_registrations); diff --git a/onnxruntime/core/providers/coreml/model/model.h b/onnxruntime/core/providers/coreml/model/model.h index c4c3b38bba516..75b9aaf2185c9 100644 --- a/onnxruntime/core/providers/coreml/model/model.h +++ b/onnxruntime/core/providers/coreml/model/model.h @@ -13,6 +13,10 @@ #include "core/common/status.h" #include "core/platform/ort_mutex.h" +#if defined(__OBJC__) +@class MLMultiArray; +#endif + namespace onnxruntime { namespace coreml { @@ -32,6 +36,15 @@ using GetOutputTensorMutableRawDataFn = std::function static_shape)>; +#if defined(__OBJC__) +// helper function that we unit test. +// Handles an MLMultiArray that is contiguous, or has one non-contiguous dimension. +// The output values can be used to copy the array data to a contiguous buffer. +// Loop num_blocks times, copying block_size elements each time, moving stride elements between copies. +// A contiguous array will have num_blocks == 1, block_size == total_size (i.e. 
can be copied in a single operation) +Status GetMLMultiArrayCopyInfo(const MLMultiArray* array, int64_t& num_blocks, int64_t& block_size, int64_t& stride); +#endif + class Model { public: Model(const std::string& path, diff --git a/onnxruntime/core/providers/coreml/model/model.mm b/onnxruntime/core/providers/coreml/model/model.mm index 1d506099b4367..4fd822f0d0d15 100644 --- a/onnxruntime/core/providers/coreml/model/model.mm +++ b/onnxruntime/core/providers/coreml/model/model.mm @@ -174,51 +174,69 @@ Status CreateInputFeatureProvider(const std::unordered_map mlmultiarray_buffer_size) { + const MLMultiArray* array, + const int64_t num_blocks, const int64_t block_size, const int64_t stride, + const OnnxTensorInfo* tensor_info) { if (mlmultiarray_buffer == nullptr) { return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "mlmultiarray_buffer has no data"); } - const size_t num_elements = array_info.count; + // total including non-contiguous space + + int64_t array_total_elements = [array.strides[0] longLongValue] * [array.shape[0] longLongValue]; + const int64_t num_elements = array.count; + + ORT_RETURN_IF(array_total_elements != num_blocks * stride || + num_elements != num_blocks * block_size, + "MLMultiArray size does not match the copy info"); + const auto onnx_data_type = tensor_info->data_type; switch (onnx_data_type) { case ONNX_NAMESPACE::TensorProto_DataType_FLOAT: { - const auto output_data_byte_size = num_elements * sizeof(float); - ORT_RETURN_IF_NOT(!mlmultiarray_buffer_size || mlmultiarray_buffer_size == output_data_byte_size, - "CoreML output buffer size and expected output size differ"); - memcpy(tensor_buffer, mlmultiarray_buffer, output_data_byte_size); + const auto* src_buffer = static_cast(mlmultiarray_buffer); + auto* dst_buffer = static_cast(tensor_buffer); + const auto block_byte_size = block_size * sizeof(float); + + for (int64_t idx = 0; idx < num_blocks; ++idx) { + memcpy(dst_buffer, src_buffer, block_byte_size); + src_buffer += stride; + dst_buffer += block_size; + } break; } case ONNX_NAMESPACE::TensorProto_DataType_INT32: { - const auto output_data_byte_size = num_elements * sizeof(int32_t); - ORT_RETURN_IF_NOT(!mlmultiarray_buffer_size || mlmultiarray_buffer_size == output_data_byte_size, - "CoreML output buffer size and expected output size differ"); - memcpy(tensor_buffer, mlmultiarray_buffer, output_data_byte_size); + const auto* src_buffer = static_cast(mlmultiarray_buffer); + auto* dst_buffer = static_cast(tensor_buffer); + const auto block_byte_size = block_size * sizeof(int32_t); + + for (int64_t idx = 0; idx < num_blocks; ++idx) { + memcpy(dst_buffer, src_buffer, block_byte_size); + src_buffer += stride; + dst_buffer += block_size; + } + break; } // For this case, since Coreml Spec only uses int32 for model output while onnx provides // int64 for model output data type. 
We are doing a type casting (int32 -> int64) here // when copying the model to ORT case ONNX_NAMESPACE::TensorProto_DataType_INT64: { - ORT_RETURN_IF_NOT(array_info.dataType == MLMultiArrayDataTypeInt32, - "CoreML output data type is not MLMultiArrayDataTypeInt32"); - ORT_RETURN_IF_NOT(!mlmultiarray_buffer_size || mlmultiarray_buffer_size == num_elements * sizeof(int32_t), - "CoreML output buffer size and expected output size differ"); - const auto model_output_span = gsl::span{static_cast(mlmultiarray_buffer), num_elements}; - const auto output_span = gsl::span{static_cast(tensor_buffer), num_elements}; - std::transform(model_output_span.begin(), model_output_span.end(), output_span.begin(), - [](int32_t v) { return static_cast(v); }); + ORT_RETURN_IF(array.dataType != MLMultiArrayDataTypeInt32, + "CoreML output data type is not MLMultiArrayDataTypeInt32"); + + const int32_t* src_buffer = static_cast(mlmultiarray_buffer); + int64_t* dst_buffer = static_cast(tensor_buffer); + + for (int64_t idx = 0; idx < num_blocks; ++idx) { + auto input_span = gsl::span{src_buffer, static_cast(block_size)}; + auto output_span = gsl::span{dst_buffer, static_cast(block_size)}; + std::transform(input_span.begin(), input_span.end(), output_span.begin(), + [](int32_t v) { return static_cast(v); }); + + src_buffer += stride; + dst_buffer += block_size; + } break; } default: @@ -250,8 +268,7 @@ - (void)dealloc; - (Status)loadModel API_AVAILABLE_COREML3; - (Status)predict:(const std::unordered_map&)inputs outputs:(const std::unordered_map&)outputs - getOutputTensorDataFn:(const GetOutputTensorMutableRawDataFn&) - get_output_tensor_mutable_raw_data_fn + getOutputTensorDataFn:(const GetOutputTensorMutableRawDataFn&)get_output_tensor_mutable_raw_data_fn API_AVAILABLE_COREML3; @property(nullable) MLModel* model API_AVAILABLE_COREML3; @@ -397,21 +414,27 @@ - (Status)predict:(const std::unordered_map&)inputs ") do not match"); } - ORT_RETURN_IF_NOT(IsArrayContiguous(data), - "Non-contiguous output MLMultiArray is not currently supported"); + // support a non-contiguous array, provided only one dimension is not contiguous + int64_t num_blocks = 0; + int64_t block_size = 0; + int64_t stride = 0; + + ORT_RETURN_IF_ERROR(GetMLMultiArrayCopyInfo(data, num_blocks, block_size, stride)); + __block Status copy_status; const auto* tensor_info = &output_tensor_info; // `getBytesWithHandler` replaces deprecated `.dataPointer` on new versions if (@available(macOS 12.3, iOS 15.4, *)) { [data getBytesWithHandler:^(const void* bytes, NSInteger size) { - copy_status = CopyMLMultiArrayBuffer(bytes, output_buffer, data, tensor_info, size); + copy_status = CopyMLMultiArrayBuffer(bytes, output_buffer, data, + num_blocks, block_size, stride, tensor_info); }]; } else { - // disable size check as old API does not return buffer length - copy_status = CopyMLMultiArrayBuffer(data.dataPointer, output_buffer, data, tensor_info, std::nullopt); + copy_status = CopyMLMultiArrayBuffer(data.dataPointer, output_buffer, data, + num_blocks, block_size, stride, tensor_info); } - if (!copy_status.IsOK()) - return copy_status; + + ORT_RETURN_IF_ERROR(copy_status); } } } @@ -431,6 +454,49 @@ - (Status)predict:(const std::unordered_map&)inputs namespace onnxruntime { namespace coreml { +Status GetMLMultiArrayCopyInfo(const MLMultiArray* _Nonnull array, + int64_t& num_blocks, int64_t& block_size, int64_t& stride) { + const auto* shape = array.shape; + const auto rank = shape.count; + + int64_t array_total_elements = [array.strides[0] longLongValue] * [shape[0] 
longLongValue]; + + int64_t data_elems = 1; // actual values + int64_t total_elems = 1; // elems including empty slots if non-contiguous + for (unsigned long i = 1; i <= rank; i++) { + int64_t this_stride = [array.strides[rank - i] longLongValue]; + if (this_stride != total_elems) { + // non-contiguous + if (block_size != 0) { + return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, + "Multiple non-contiguous dimensions in MLMultiArray are not supported."); + } + + block_size = data_elems; + stride = this_stride; + } + + const auto elems_this_dim = [shape[rank - i] longLongValue]; + data_elems *= elems_this_dim; + total_elems = elems_this_dim * this_stride; + } + + if (block_size == 0) { + // all data is contiguous + block_size = data_elems; + stride = array_total_elements; + assert(block_size == stride); + } + + num_blocks = data_elems / block_size; + + ORT_ENFORCE(array_total_elements == total_elems, "Logic error calculating copy info"); + ORT_ENFORCE(stride >= block_size, "Logic error calculating copy info"); + ORT_ENFORCE(stride * num_blocks == total_elems, "Logic error calculating copy info"); + + return Status::OK(); +} + // Internal Execution class // This class will bridge Model (c++) with CoreMLExecution (objective c++) class Execution { diff --git a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/resize_op_builder.cc b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/resize_op_builder.cc index ef27f6c942f44..44403010c936c 100644 --- a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/resize_op_builder.cc +++ b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/resize_op_builder.cc @@ -274,8 +274,8 @@ bool ResizeOpBuilder::IsOpSupportedImpl(const GraphViewer& graph_viewer, const N return false; } - if (!utils::IsScalingByAFactorOfN(h_in, scale_h) || - !utils::IsScalingByAFactorOfN(w_in, scale_w)) { + if (!utils::ReciprocalIsAFactorOfN(h_in, scale_h) || + !utils::ReciprocalIsAFactorOfN(w_in, scale_w)) { LOGS_DEFAULT(VERBOSE) << "Input size must be evenly divisible by output size when downsampling"; return false; } diff --git a/onnxruntime/core/providers/utils.cc b/onnxruntime/core/providers/utils.cc index 747b09e42aa21..2725af95e0959 100644 --- a/onnxruntime/core/providers/utils.cc +++ b/onnxruntime/core/providers/utils.cc @@ -24,7 +24,7 @@ common::Status OutputOptionalWithoutDataHelper(const ONNX_NAMESPACE::TypeProto& } #endif -bool IsScalingByAFactorOfN(int64_t n, float scale) { +bool ReciprocalIsAFactorOfN(int64_t n, float scale) { bool is_factor = false; if (scale > 0.f && scale < 1.f) { const double factor = 1.0 / scale; diff --git a/onnxruntime/core/providers/utils.h b/onnxruntime/core/providers/utils.h index 9ea8496a02f85..cfd71d9b838b3 100644 --- a/onnxruntime/core/providers/utils.h +++ b/onnxruntime/core/providers/utils.h @@ -19,6 +19,6 @@ common::Status OutputOptionalWithoutDataHelper(const ONNX_NAMESPACE::TypeProto& /// Check if the reciprocal of 'scale' is a factor of 'n'. /// e.g. a scale of 0.5 is 1/2, the reciprocal is 2, and 2 is a factor of any even number. 
/// -bool IsScalingByAFactorOfN(int64_t n, float scale); +bool ReciprocalIsAFactorOfN(int64_t n, float scale); } // namespace utils } // namespace onnxruntime diff --git a/onnxruntime/core/providers/xnnpack/nn/conv_transpose.cc b/onnxruntime/core/providers/xnnpack/nn/conv_transpose.cc index c136385f12476..01c8119fea79d 100644 --- a/onnxruntime/core/providers/xnnpack/nn/conv_transpose.cc +++ b/onnxruntime/core/providers/xnnpack/nn/conv_transpose.cc @@ -24,7 +24,7 @@ Status ConvTranspose::PrePack(const Tensor& tensor, int input_idx, AllocatorPtr const auto rank = orig_shape.NumDimensions(); if (conv_transpose_attrs_.group > 1) { - // Xnnpack [G, Oc, H, W Ic/G] + // Xnnpack [G, Oc, H, W, Ic/G] // (ref: https://github.com/google/XNNPACK/blob/ecd8311c8fd3d9ab47edbc3df5f2b5de7dabe75f/test/deconvolution-operator-tester.h#L678) if (rank == 4) { // split C (dim 0) into {group, C/group} diff --git a/onnxruntime/core/providers/xnnpack/tensor/resize.cc b/onnxruntime/core/providers/xnnpack/tensor/resize.cc index c752b5f849808..cf874796ba169 100644 --- a/onnxruntime/core/providers/xnnpack/tensor/resize.cc +++ b/onnxruntime/core/providers/xnnpack/tensor/resize.cc @@ -85,8 +85,8 @@ bool Resize::IsOnnxNodeSupported(const NodeUnit& node_unit, float scale_h = scales[2]; float scale_w = scales[3]; - if (!utils::IsScalingByAFactorOfN(h_in, scale_h) || - !utils::IsScalingByAFactorOfN(w_in, scale_w)) { + if (!utils::ReciprocalIsAFactorOfN(h_in, scale_h) || + !utils::ReciprocalIsAFactorOfN(w_in, scale_w)) { break; } } diff --git a/onnxruntime/test/providers/coreml/utils_test.mm b/onnxruntime/test/providers/coreml/utils_test.mm new file mode 100644 index 0000000000000..f55f108494e3e --- /dev/null +++ b/onnxruntime/test/providers/coreml/utils_test.mm @@ -0,0 +1,108 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#import + +#include "gtest/gtest.h" +#include "gmock/gmock.h" + +#include "core/providers/coreml/model/model.h" +#include "test/util/include/asserts.h" + +namespace onnxruntime { +namespace test { +namespace { +auto ValidateGetInfo(MLMultiArray* array, + int64_t expected_num_blocks, int64_t expected_block_size, int64_t expected_stride, + bool expect_valid) { + int64_t num_blocks = 0; + int64_t block_size = 0; + int64_t stride = 0; + auto status = coreml::GetMLMultiArrayCopyInfo(array, num_blocks, block_size, stride); + + if (!expect_valid) { + ASSERT_STATUS_NOT_OK(status); + return; + } + + ASSERT_STATUS_OK(status); + ASSERT_EQ(num_blocks, expected_num_blocks); + ASSERT_EQ(block_size, expected_block_size); + ASSERT_EQ(stride, expected_stride); +} +} // namespace + +TEST(CoreMLUtils, GetMLMultiArrayReadInfo) { + // fake pointer. we don't read any data but initWithDataPointer requires a non-null address + void* data = reinterpret_cast(0xfeedf00d); + + // a dim is non-contiguous if the stride is > the total number of elements in its inner dimensions + + // dim -1 with non-contiguous data. 1 element (as it's the inner-most dimension) but the stride is 2. + { + NSArray* shape = @[ @1, @1, @8, @8 ]; + NSArray* strides = @[ @128, @128, @16, @2 ]; + + auto* array = [[MLMultiArray alloc] initWithDataPointer:data + shape:shape + dataType:MLMultiArrayDataTypeInt32 + strides:strides + deallocator:^(void* /* bytes */) { + } + error:nil]; + ValidateGetInfo(array, 64, 1, 2, true); + } + + // dim -2 with non-contiguous data. 8 elements in the inner dimension but the stride is 16. 
+ { + NSArray* shape = @[ @1, @1, @8, @8 ]; + NSArray* strides = @[ @128, @128, @16, @1 ]; + + auto* array = [[MLMultiArray alloc] initWithDataPointer:data + shape:shape + dataType:MLMultiArrayDataTypeInt32 + strides:strides + deallocator:^(void* /* bytes */) { + } + error:nil]; + ValidateGetInfo(array, 8, 8, 16, true); + } + + // dim -3 with non-contiguous data. 16 elements in the inner dimensions but stride is 24. + { + NSArray* shape = @[ @1, @2, @4, @4 ]; + NSArray* strides = @[ @48, @24, @4, @1 ]; + + auto* array = [[MLMultiArray alloc] initWithDataPointer:data + shape:shape + dataType:MLMultiArrayDataTypeInt32 + strides:strides + deallocator:^(void* /* bytes */) { + } + error:nil]; + + ValidateGetInfo(array, 2, 16, 24, true); + } + + // two non-contiguous dims (dim -2 and dim -3) + // dim -2 has 4 elements in the inner dimension and stride of 8 + // dim -3 has 32 elements in the inner dimensions (we need to include the empty elements from the non-contiguous data + // in dim -2) and stride of 48 + { + // dim + NSArray* shape = @[ @1, @2, @4, @4 ]; + NSArray* strides = @[ @96, @48, @8, @1 ]; + + auto* array = [[MLMultiArray alloc] initWithDataPointer:data + shape:shape + dataType:MLMultiArrayDataTypeInt32 + strides:strides + deallocator:^(void* /* bytes */) { + } + error:nil]; + + ValidateGetInfo(array, 0, 0, 0, false); + } +} +} // namespace test +} // namespace onnxruntime diff --git a/onnxruntime/test/providers/cpu/nn/conv_transpose_op_test.cc b/onnxruntime/test/providers/cpu/nn/conv_transpose_op_test.cc index 81191e9b48c3c..2bf53ce5b5986 100644 --- a/onnxruntime/test/providers/cpu/nn/conv_transpose_op_test.cc +++ b/onnxruntime/test/providers/cpu/nn/conv_transpose_op_test.cc @@ -27,7 +27,7 @@ void TestConvTransposeOpInitializer(const ConvTransposeOpAttributes& attributes, const vector>& input_shapes, const std::initializer_list& expected_output, const vector& expected_output_shape, - bool is_filter_initializer = false, + bool is_weight_and_bias_initializer = false, OpTester::ExpectResult expect_result = OpTester::ExpectResult::kExpectSuccess, const std::string& err_str = "", const std::unordered_set& excluded_provider_types = {kTensorrtExecutionProvider}) { @@ -58,10 +58,10 @@ void TestConvTransposeOpInitializer(const ConvTransposeOpAttributes& attributes, } ORT_ENFORCE(inputs.size() <= 3, "Our name array is only setup to handle 3 inputs"); - const char* szNames[] = {"X", "W", "B"}; - bool isInitializers[] = {false, is_filter_initializer, false}; + const char* input_names[] = {"X", "W", "B"}; + bool is_initializers[] = {false, is_weight_and_bias_initializer, is_weight_and_bias_initializer}; for (size_t i = 0; i < inputs.size(); i++) { - test.AddInput(szNames[i], input_shapes[i], inputs[i], isInitializers[i]); + test.AddInput(input_names[i], input_shapes[i], inputs[i], is_initializers[i]); } test.AddOutput("Y", expected_output_shape, expected_output); diff --git a/tools/ci_build/github/apple/coreml_supported_mlprogram_ops.md b/tools/ci_build/github/apple/coreml_supported_mlprogram_ops.md index b65b0f64686a9..5609033fc3e35 100644 --- a/tools/ci_build/github/apple/coreml_supported_mlprogram_ops.md +++ b/tools/ci_build/github/apple/coreml_supported_mlprogram_ops.md @@ -7,6 +7,7 @@ Keep in sync with doco generated from /docs/execution-providers/CoreML-Execution |ai.onnx:AveragePool|Only 2D Pool is supported currently. 3D and 5D support can be added if needed.| |ai.onnx:Clip|| |ai.onnx:Conv|Only 1D/2D Conv is supported.
Bias if provided must be constant.|
+|ai.onnx:ConvTranspose|Weight and bias must be constant.<br/>padding_type of SAME_UPPER/SAME_LOWER is not supported.<br/>kernel_shape must have default values.<br/>output_shape is not supported.<br/>
output_padding must have default values.| |ai.onnx:Div|| |ai.onnx:Gemm|Input B must be constant.| |ai.onnx:GlobalAveragePool|Only 2D Pool is supported currently. 3D and 5D support can be added if needed.| From b04adcc3816b898f27242aaf0cae1d847c0dc988 Mon Sep 17 00:00:00 2001 From: Changming Sun Date: Wed, 24 Jul 2024 10:02:00 -0700 Subject: [PATCH 09/57] Update copy_strip_binary.sh: use "make install" instead (#21464) ### Description Before this change, copy_strip_binary.sh manually copies each file from onnx runtime's build folder to an artifact folder. It can be hard when dealing with symbolic link for shared libraries. This PR will change the packaging pipelines to run "make install" first, before packaging shared libs . ### Motivation and Context Recently because of feature request #21281 , we changed libonnxruntime.so's SONAME. Now every package that contains this shared library must also contains libonnxruntime.so.1. Therefore we need to change the packaging scripts to include this file. Instead of manually construct the symlink layout, using `make install` is much easier and will make things more consistent because it is a standard way of making packages. **Breaking change:** After this change, our **inference** tarballs that are published to our Github release pages will be not contain ORT **training** headers. --- cmake/onnxruntime.cmake | 1 + cmake/onnxruntime_framework.cmake | 4 +- .../core/optimizer/graph_transformer_utils.cc | 4 +- .../templates/c-api-linux-cpu.yml | 2 +- .../templates/final-jar-testing.yml | 3 +- .../templates/mac-cpu-packaging-steps.yml | 3 ++ .../github/linux/build_cuda_c_api_package.sh | 9 +--- .../github/linux/build_rocm_c_api_package.sh | 9 +--- .../linux/build_tensorrt_c_api_package.sh | 4 +- .../github/linux/copy_strip_binary.sh | 42 ++++--------------- 10 files changed, 25 insertions(+), 56 deletions(-) diff --git a/cmake/onnxruntime.cmake b/cmake/onnxruntime.cmake index 0e89c2f14d34b..bdb4b00b02a35 100644 --- a/cmake/onnxruntime.cmake +++ b/cmake/onnxruntime.cmake @@ -27,6 +27,7 @@ function(get_c_cxx_api_headers HEADERS_VAR) "${REPO_ROOT}/include/onnxruntime/core/session/onnxruntime_float16.h" "${REPO_ROOT}/include/onnxruntime/core/session/onnxruntime_run_options_config_keys.h" "${REPO_ROOT}/include/onnxruntime/core/session/onnxruntime_session_options_config_keys.h" + "${REPO_ROOT}/include/onnxruntime/core/session/onnxruntime_lite_custom_op.h" ) if (onnxruntime_ENABLE_TRAINING_APIS) diff --git a/cmake/onnxruntime_framework.cmake b/cmake/onnxruntime_framework.cmake index 43d16abd8fbae..b85edbf37d447 100644 --- a/cmake/onnxruntime_framework.cmake +++ b/cmake/onnxruntime_framework.cmake @@ -123,7 +123,9 @@ if (WIN32) target_compile_definitions(onnxruntime_framework PRIVATE _SCL_SECURE_NO_WARNINGS) endif() -if (NOT onnxruntime_BUILD_SHARED_LIB) +if (onnxruntime_BUILD_SHARED_LIB) + install(FILES ${PROJECT_SOURCE_DIR}/../include/onnxruntime/core/framework/provider_options.h DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/onnxruntime/) +else() install(DIRECTORY ${PROJECT_SOURCE_DIR}/../include/onnxruntime/core/framework DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/onnxruntime/core) install(TARGETS onnxruntime_framework ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR} diff --git a/onnxruntime/core/optimizer/graph_transformer_utils.cc b/onnxruntime/core/optimizer/graph_transformer_utils.cc index 7da65f18ccacb..ab1dbaea7b7fd 100644 --- a/onnxruntime/core/optimizer/graph_transformer_utils.cc +++ b/onnxruntime/core/optimizer/graph_transformer_utils.cc @@ -189,7 +189,7 @@ 
InlinedVector> GenerateTransformers( const SessionOptions& session_options, const IExecutionProvider& cpu_execution_provider, /*required by constant folding*/ const InlinedHashSet& rules_and_transformers_to_disable, - concurrency::ThreadPool* intra_op_thread_pool) { + [[maybe_unused]] concurrency::ThreadPool* intra_op_thread_pool) { InlinedVector> transformers; const bool disable_quant_qdq = session_options.config_options.GetConfigOrDefault(kOrtSessionOptionsDisableQuantQDQ, "0") == "1"; @@ -419,7 +419,7 @@ InlinedVector> GenerateTransformersForMinimalB const SatApplyContextVariant& apply_context, const IExecutionProvider& cpu_execution_provider, const InlinedHashSet& rules_and_transformers_to_disable, - concurrency::ThreadPool* intra_op_thread_pool) { + [[maybe_unused]] concurrency::ThreadPool* intra_op_thread_pool) { InlinedVector> transformers; const bool saving = std::holds_alternative(apply_context); diff --git a/tools/ci_build/github/azure-pipelines/templates/c-api-linux-cpu.yml b/tools/ci_build/github/azure-pipelines/templates/c-api-linux-cpu.yml index 041ea623ecf61..e2b71c5c55fd2 100644 --- a/tools/ci_build/github/azure-pipelines/templates/c-api-linux-cpu.yml +++ b/tools/ci_build/github/azure-pipelines/templates/c-api-linux-cpu.yml @@ -69,7 +69,7 @@ jobs: docker run --rm --volume /data/onnx:/data/onnx:ro --volume $(Build.SourcesDirectory):/onnxruntime_src --volume $(Build.BinariesDirectory):/build \ --volume $HOME/.onnx:/home/onnxruntimedev/.onnx -e NIGHTLY_BUILD onnxruntimecpubuildcentos8${{parameters.OnnxruntimeArch}} /bin/bash -c "python3.9 \ /onnxruntime_src/tools/ci_build/build.py --enable_lto --build_java --build_nodejs --build_dir /build --config Release \ - --skip_submodule_sync --parallel --use_binskim_compliant_compile_flags --build_shared_lib ${{ parameters.AdditionalBuildFlags }} && cd /build/Release && make install DESTDIR=/build/linux-${{parameters.OnnxruntimeArch}}" + --skip_submodule_sync --parallel --use_binskim_compliant_compile_flags --build_shared_lib ${{ parameters.AdditionalBuildFlags }} && cd /build/Release && make install DESTDIR=/build/installed" workingDirectory: $(Build.SourcesDirectory) displayName: 'Build' diff --git a/tools/ci_build/github/azure-pipelines/templates/final-jar-testing.yml b/tools/ci_build/github/azure-pipelines/templates/final-jar-testing.yml index c9b7c01146981..abc96601ffb6c 100644 --- a/tools/ci_build/github/azure-pipelines/templates/final-jar-testing.yml +++ b/tools/ci_build/github/azure-pipelines/templates/final-jar-testing.yml @@ -68,8 +68,9 @@ stages: inputs: targetType: 'inline' script: | + set -e -x echo "Java Version" - java --version + java -version mkdir test pushd test jar xf '$(Build.BinariesDirectory)/final-jar/testing.jar' diff --git a/tools/ci_build/github/azure-pipelines/templates/mac-cpu-packaging-steps.yml b/tools/ci_build/github/azure-pipelines/templates/mac-cpu-packaging-steps.yml index 7672b604a5268..84f517a81686d 100644 --- a/tools/ci_build/github/azure-pipelines/templates/mac-cpu-packaging-steps.yml +++ b/tools/ci_build/github/azure-pipelines/templates/mac-cpu-packaging-steps.yml @@ -46,8 +46,11 @@ steps: ChangeEveryCommit: true BuildStep: - script: | + set -e -x rm -rf $(Build.BinariesDirectory)/Release python3 $(Build.SourcesDirectory)/tools/ci_build/build.py --update --build ${{ parameters.AdditionalBuildFlags }} --build_dir $(Build.BinariesDirectory) --skip_submodule_sync --parallel --use_binskim_compliant_compile_flags --build_shared_lib --config Release + cd $(Build.BinariesDirectory)/Release + make install 
DESTDIR=$(Build.BinariesDirectory)/installed displayName: 'Build ${{ parameters.MacosArch }}' env: CCACHE_DIR: ${{ parameters.CacheDir }} diff --git a/tools/ci_build/github/linux/build_cuda_c_api_package.sh b/tools/ci_build/github/linux/build_cuda_c_api_package.sh index 04968aacdb255..57a3bedc1e8e4 100755 --- a/tools/ci_build/github/linux/build_cuda_c_api_package.sh +++ b/tools/ci_build/github/linux/build_cuda_c_api_package.sh @@ -1,10 +1,5 @@ #!/bin/bash set -e -x docker run --rm --volume \ -$BUILD_SOURCESDIRECTORY:/onnxruntime_src --volume $BUILD_BINARIESDIRECTORY:/build \ ---volume /data/models:/build/models:ro --volume /data/onnx:/data/onnx:ro -e NIGHTLY_BUILD onnxruntimecuda${CUDA_VERSION_MAJOR}build \ -/usr/bin/python3.9 /onnxruntime_src/tools/ci_build/build.py --enable_lto --build_java --build_nodejs --build_dir /build --config Release \ ---skip_submodule_sync --parallel --use_binskim_compliant_compile_flags --build_shared_lib --use_cuda --cuda_version=$CUDA_VERSION \ ---cuda_home=/usr/local/cuda-$CUDA_VERSION --cudnn_home=/usr/local/cuda-$CUDA_VERSION \ ---skip_tests \ ---cmake_extra_defines 'CMAKE_CUDA_ARCHITECTURES=60;61;70;75;80' +$BUILD_SOURCESDIRECTORY:/onnxruntime_src --volume $BUILD_BINARIESDIRECTORY:/build -e NIGHTLY_BUILD onnxruntimecuda${CUDA_VERSION_MAJOR}build \ +/bin/bash -c "/usr/bin/python3.9 /onnxruntime_src/tools/ci_build/build.py --enable_lto --build_java --build_nodejs --build_dir /build --config Release --skip_submodule_sync --parallel --use_binskim_compliant_compile_flags --build_shared_lib --use_cuda --cuda_version=$CUDA_VERSION --cuda_home=/usr/local/cuda-$CUDA_VERSION --cudnn_home=/usr/local/cuda-$CUDA_VERSION --skip_tests --cmake_extra_defines 'CMAKE_CUDA_ARCHITECTURES=60;61;70;75;80' && cd /build/Release && make install DESTDIR=/build/installed" diff --git a/tools/ci_build/github/linux/build_rocm_c_api_package.sh b/tools/ci_build/github/linux/build_rocm_c_api_package.sh index d70442ad2cae8..9fee565170a1b 100755 --- a/tools/ci_build/github/linux/build_rocm_c_api_package.sh +++ b/tools/ci_build/github/linux/build_rocm_c_api_package.sh @@ -31,14 +31,7 @@ docker run --rm \ --volume /data/onnx:/data/onnx:ro \ --workdir /onnxruntime_src \ $IMAGE \ - ${PYTHON_BIN:-python} /onnxruntime_src/tools/ci_build/build.py \ - --config Release \ - --build_dir /build \ - --parallel \ - --use_rocm --rocm_version=$ROCM_VERSION --rocm_home $ROCM_HOME --nccl_home $ROCM_HOME \ - --build_shared_lib \ - --skip_submodule_sync \ - --skip_tests --cmake_extra_defines FETCHCONTENT_TRY_FIND_PACKAGE_MODE=NEVER + /bin/bash -c "${PYTHON_BIN:-python} /onnxruntime_src/tools/ci_build/build.py --config Release --build_dir /build --parallel --use_rocm --use_binskim_compliant_compile_flags --rocm_version=$ROCM_VERSION --rocm_home $ROCM_HOME --nccl_home $ROCM_HOME --build_shared_lib --skip_submodule_sync --skip_tests --cmake_extra_defines FETCHCONTENT_TRY_FIND_PACKAGE_MODE=NEVER && cd /build/Release && make install DESTDIR=/build/installed" EXIT_CODE=$? 
diff --git a/tools/ci_build/github/linux/build_tensorrt_c_api_package.sh b/tools/ci_build/github/linux/build_tensorrt_c_api_package.sh index cc63b68d441d7..f0c9d51a53448 100755 --- a/tools/ci_build/github/linux/build_tensorrt_c_api_package.sh +++ b/tools/ci_build/github/linux/build_tensorrt_c_api_package.sh @@ -3,6 +3,4 @@ set -e -x mkdir -p $HOME/.onnx docker run --rm --volume /data/onnx:/data/onnx:ro --volume $BUILD_SOURCESDIRECTORY:/onnxruntime_src --volume $BUILD_BINARIESDIRECTORY:/build \ --volume /data/models:/build/models:ro --volume $HOME/.onnx:/home/onnxruntimedev/.onnx -e NIGHTLY_BUILD onnxruntimecuda${CUDA_VERSION_MAJOR}xtrt86build \ -/usr/bin/python3.9 /onnxruntime_src/tools/ci_build/build.py --build_dir /build --config Release \ ---skip_tests \ ---skip_submodule_sync --parallel --use_binskim_compliant_compile_flags --build_shared_lib --build_java --build_nodejs --use_tensorrt --cuda_version=$CUDA_VERSION --cuda_home=/usr/local/cuda-$CUDA_VERSION --cudnn_home=/usr --tensorrt_home=/usr --cmake_extra_defines 'CMAKE_CUDA_ARCHITECTURES=60;61;70;75;80' +/bin/bash -c "/usr/bin/python3.9 /onnxruntime_src/tools/ci_build/build.py --build_dir /build --config Release --skip_tests --skip_submodule_sync --parallel --use_binskim_compliant_compile_flags --build_shared_lib --build_java --build_nodejs --use_tensorrt --cuda_version=$CUDA_VERSION --cuda_home=/usr/local/cuda-$CUDA_VERSION --cudnn_home=/usr --tensorrt_home=/usr --cmake_extra_defines 'CMAKE_CUDA_ARCHITECTURES=60;61;70;75;80' && cd /build/Release && make install DESTDIR=/build/installed" diff --git a/tools/ci_build/github/linux/copy_strip_binary.sh b/tools/ci_build/github/linux/copy_strip_binary.sh index 65d6d97ebf0a8..f5b4c38c85d4c 100755 --- a/tools/ci_build/github/linux/copy_strip_binary.sh +++ b/tools/ci_build/github/linux/copy_strip_binary.sh @@ -16,46 +16,22 @@ done EXIT_CODE=1 uname -a -mkdir $BINARY_DIR/$ARTIFACT_NAME -mkdir $BINARY_DIR/$ARTIFACT_NAME/lib -mkdir $BINARY_DIR/$ARTIFACT_NAME/include -echo "Directories created" -cp $BINARY_DIR/$BUILD_CONFIG/$LIB_NAME $BINARY_DIR/$ARTIFACT_NAME/lib -if [[ -f "$BINARY_DIR/$BUILD_CONFIG/libonnxruntime_providers_cuda.so" ]]; then - cp $BINARY_DIR/$BUILD_CONFIG/libonnxruntime_providers_shared.so $BINARY_DIR/$ARTIFACT_NAME/lib - cp $BINARY_DIR/$BUILD_CONFIG/libonnxruntime_providers_cuda.so $BINARY_DIR/$ARTIFACT_NAME/lib -fi -if [[ -f "$BINARY_DIR/$BUILD_CONFIG/libonnxruntime_providers_tensorrt.so" ]]; then - cp $BINARY_DIR/$BUILD_CONFIG/libonnxruntime_providers_tensorrt.so $BINARY_DIR/$ARTIFACT_NAME/lib -fi -if [[ -f "$BINARY_DIR/$BUILD_CONFIG/libonnxruntime_providers_rocm.so" ]]; then - cp $BINARY_DIR/$BUILD_CONFIG/libonnxruntime_providers_shared.so $BINARY_DIR/$ARTIFACT_NAME/lib - cp $BINARY_DIR/$BUILD_CONFIG/libonnxruntime_providers_rocm.so $BINARY_DIR/$ARTIFACT_NAME/lib -fi +cd "$BINARY_DIR" +mv installed/usr/local $ARTIFACT_NAME +mv $ARTIFACT_NAME/include/onnxruntime/* $ARTIFACT_NAME/include +rmdir $ARTIFACT_NAME/include/onnxruntime +# Do not ship onnx_test_runner +rm -rf $ARTIFACT_NAME/bin echo "Copy debug symbols in a separate file and strip the original binary." 
if [[ $LIB_NAME == *.dylib ]] then dsymutil $BINARY_DIR/$ARTIFACT_NAME/lib/$LIB_NAME -o $BINARY_DIR/$ARTIFACT_NAME/lib/$LIB_NAME.dSYM strip -S $BINARY_DIR/$ARTIFACT_NAME/lib/$LIB_NAME - ln -s $LIB_NAME $BINARY_DIR/$ARTIFACT_NAME/lib/libonnxruntime.dylib # copy the CoreML EP header for macOS build (libs with .dylib ext) cp $SOURCE_DIR/include/onnxruntime/core/providers/coreml/coreml_provider_factory.h $BINARY_DIR/$ARTIFACT_NAME/include -elif [[ $LIB_NAME == *.so.* ]] -then - ln -s $LIB_NAME $BINARY_DIR/$ARTIFACT_NAME/lib/libonnxruntime.so -fi -cp $SOURCE_DIR/include/onnxruntime/core/session/onnxruntime_*.h $BINARY_DIR/$ARTIFACT_NAME/include -cp $SOURCE_DIR/include/onnxruntime/core/framework/provider_options.h $BINARY_DIR/$ARTIFACT_NAME/include -cp $SOURCE_DIR/include/onnxruntime/core/providers/cpu/cpu_provider_factory.h $BINARY_DIR/$ARTIFACT_NAME/include -cp $SOURCE_DIR/orttraining/orttraining/training_api/include/onnxruntime_training_*.h $BINARY_DIR/$ARTIFACT_NAME/include - -if [[ -f "$BINARY_DIR/$BUILD_CONFIG/libonnxruntime_providers_cuda.so" ]]; then -# copy headers for context context used in custom ops -mkdir -p $BINARY_DIR/$ARTIFACT_NAME/include/core/providers/cuda -cp $SOURCE_DIR/include/onnxruntime/core/providers/custom_op_context.h $BINARY_DIR/$ARTIFACT_NAME/include/core/providers/custom_op_context.h -cp $SOURCE_DIR/include/onnxruntime/core/providers/resource.h $BINARY_DIR/$ARTIFACT_NAME/include/core/providers/resource.h -cp $SOURCE_DIR/include/onnxruntime/core/providers/cuda/cuda_context.h $BINARY_DIR/$ARTIFACT_NAME/include/core/providers/cuda/cuda_context.h -cp $SOURCE_DIR/include/onnxruntime/core/providers/cuda/cuda_resource.h $BINARY_DIR/$ARTIFACT_NAME/include/core/providers/cuda/cuda_resource.h +else + # Linux + mv $ARTIFACT_NAME/lib64 $ARTIFACT_NAME/lib fi # copy the README, licence and TPN

From eb9b377306c941b01d2823b7655f372a20b82197 Mon Sep 17 00:00:00 2001
From: Adrian Lizarraga
Date: Wed, 24 Jul 2024 10:17:12 -0700
Subject: [PATCH 10/57] [QNN EP] Update to QNN SDK 2.24.0 (#21463)

### Description
- Update pipelines to use QNN SDK 2.24 by default.
- Update QNN_Nuget_Windows pipeline to build the csharp solution without mobile projects (fixes errors).
- Implement workaround for QNN 2.24 validation bug for LayerNorm ops without an explicit bias input.
- Enable Relu unit test, which now passes because Relu is no longer fused into QuantizeLinear for QNN EP.
- Fix bug where a negative quantization axis is not properly normalized for per-channel int4 Conv.

### Motivation and Context
Update QNN SDK.
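The two quantization fixes above are easier to follow in isolation. Below is a minimal standalone sketch, not the EP's actual QnnQuantParamsWrapper/TensorInfo code: it uses plain STL containers and hypothetical helper names (NormalizeAxis, MakeImplicitZeroBias) and only mirrors the arithmetic the patch applies, namely shifting a negative per-channel axis by the tensor rank, and building an all-zero int32 bias whose per-channel scale is the product of the input scale and each scale-input channel scale, with zero offsets.

```
// Standalone sketch (not the QNN EP sources). Helper names are illustrative only.
#include <cassert>
#include <cstdint>
#include <iostream>
#include <vector>

// Normalize a possibly negative per-channel quantization axis against the tensor rank,
// mirroring the per-channel int4 Conv fix described above.
int64_t NormalizeAxis(int64_t axis, size_t rank) {
  assert(rank > 0);
  if (axis < 0) {
    axis += static_cast<int64_t>(rank);
  }
  assert(axis >= 0 && axis < static_cast<int64_t>(rank));
  return axis;
}

// Build the explicit all-zero bias used to work around the LayerNorm validation bug:
// one element per channel of the scale input, bias_scale[i] = input_scale * scale_scale[i],
// and all zero-points set to 0.
struct ZeroBias {
  std::vector<int32_t> data;     // quantized bias values (all zero)
  std::vector<float> scales;     // per-channel bias scales
  std::vector<int32_t> offsets;  // per-channel zero points (all zero)
};

ZeroBias MakeImplicitZeroBias(float input_scale, const std::vector<float>& scale_input_scales) {
  ZeroBias bias;
  bias.data.assign(scale_input_scales.size(), 0);
  bias.offsets.assign(scale_input_scales.size(), 0);
  bias.scales.reserve(scale_input_scales.size());
  for (float s : scale_input_scales) {
    bias.scales.push_back(input_scale * s);
  }
  return bias;
}

int main() {
  // A conv weight of shape {3, 2, 2, 2} quantized on axis -4 is really quantized on axis 0.
  std::cout << "normalized axis: " << NormalizeAxis(-4, 4) << "\n";  // prints 0

  // Per-tensor input scale 0.1, per-channel scale-input scales {0.5, 0.25}.
  ZeroBias bias = MakeImplicitZeroBias(0.1f, {0.5f, 0.25f});
  for (float s : bias.scales) std::cout << s << " ";  // prints 0.05 0.025
  std::cout << "\n";
  return 0;
}
```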
--- .../opbuilder/layer_norm_op_builder.cc | 92 +++++++++++++++ .../qnn/builder/qnn_model_wrapper.cc | 10 ++ .../qnn/builder/qnn_quant_params_wrapper.cc | 107 +++++++++++++++++- .../qnn/builder/qnn_quant_params_wrapper.h | 7 ++ onnxruntime/test/providers/qnn/conv_test.cc | 38 ++++++- .../test/providers/qnn/layer_norm_test.cc | 79 +++++++++++-- .../test/providers/qnn/qnn_test_utils.h | 6 + .../test/providers/qnn/simple_op_htp_test.cc | 2 +- ...arm64-v8a-QNN-crosscompile-ci-pipeline.yml | 2 +- .../c-api-noopenmp-packaging-pipelines.yml | 2 +- .../azure-pipelines/linux-qnn-ci-pipeline.yml | 2 +- .../azure-pipelines/py-packaging-pipeline.yml | 2 +- .../qnn-ep-nuget-packaging-pipeline.yml | 2 +- .../templates/jobs/download_linux_qnn_sdk.yml | 2 +- .../templates/jobs/download_win_qnn_sdk.yml | 2 +- .../templates/py-packaging-stage.yml | 2 +- .../templates/py-win-arm64-qnn.yml | 2 +- .../templates/py-win-x64-qnn.yml | 2 +- .../azure-pipelines/templates/qnn-ep-win.yml | 6 +- .../win-qnn-arm64-ci-pipeline.yml | 2 +- .../azure-pipelines/win-qnn-ci-pipeline.yml | 2 +- 21 files changed, 339 insertions(+), 32 deletions(-) diff --git a/onnxruntime/core/providers/qnn/builder/opbuilder/layer_norm_op_builder.cc b/onnxruntime/core/providers/qnn/builder/opbuilder/layer_norm_op_builder.cc index a2dca669c24f6..c667aeeaa61f0 100644 --- a/onnxruntime/core/providers/qnn/builder/opbuilder/layer_norm_op_builder.cc +++ b/onnxruntime/core/providers/qnn/builder/opbuilder/layer_norm_op_builder.cc @@ -1,9 +1,11 @@ // Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. +#include #include "core/providers/common.h" #include "core/providers/shared/utils/utils.h" #include "core/framework/tensorprotoutils.h" +#include "core/providers/qnn/builder/qnn_utils.h" #include "core/providers/qnn/builder/qnn_model_wrapper.h" #include "core/providers/qnn/builder/op_builder_factory.h" #include "core/common/safeint.h" @@ -24,6 +26,11 @@ class LayerNormOpBuilder : public BaseOpBuilder { const logging::Logger& logger) const override final ORT_MUST_USE_RESULT; protected: + Status ProcessInputs(QnnModelWrapper& qnn_model_wrapper, + const NodeUnit& node_unit, + const logging::Logger& logger, + std::vector& input_names, + bool do_op_validation) const override ORT_MUST_USE_RESULT; Status ProcessAttributesAndOutputs(QnnModelWrapper& qnn_model_wrapper, const NodeUnit& node_unit, std::vector&& input_names, @@ -55,6 +62,91 @@ Status LayerNormOpBuilder::IsOpSupported(QnnModelWrapper& qnn_model_wrapper, return AddToModelBuilder(qnn_model_wrapper, node_unit, logger, true); } +Status LayerNormOpBuilder::ProcessInputs(QnnModelWrapper& qnn_model_wrapper, + const NodeUnit& node_unit, + const logging::Logger& logger, + std::vector& input_names, + bool do_op_validation) const { + ORT_UNUSED_PARAMETER(do_op_validation); + + const auto& inputs = node_unit.Inputs(); + const auto input_count = inputs.size(); + constexpr size_t X_IDX = 0; + constexpr size_t SCALE_IDX = 1; + constexpr size_t BIAS_IDX = 2; + + // Input[0] (X, required) + ORT_RETURN_IF_ERROR(ProcessInput(qnn_model_wrapper, inputs[X_IDX], logger, input_names)); + + // Input[1] (scale, required) + ORT_RETURN_IF_ERROR(ProcessInput(qnn_model_wrapper, inputs[SCALE_IDX], logger, input_names)); + + // Input[2] (bias, optional) + const bool has_bias_input = input_count > BIAS_IDX && inputs[BIAS_IDX].node_arg.Exists(); + if (has_bias_input) { + ORT_RETURN_IF_ERROR(ProcessInput(qnn_model_wrapper, inputs[BIAS_IDX], logger, input_names)); + } + +#if QNN_API_VERSION_MAJOR 
== 2 && QNN_API_VERSION_MINOR == 17 + if (!has_bias_input && IsNpuBackend(qnn_model_wrapper.GetQnnBackendType())) { + // Bias is implicit. QNN SDK 2.24 (QNN API version 2.17) has a validation bug for implicit bias inputs, so provide + // an explicit bias of all 0 (quantized int32). + TensorInfo x_input_info = {}; + ORT_RETURN_IF_ERROR(qnn_model_wrapper.GetTensorInfo(inputs[X_IDX], x_input_info)); + + TensorInfo scale_input_info = {}; + ORT_RETURN_IF_ERROR(qnn_model_wrapper.GetTensorInfo(inputs[SCALE_IDX], scale_input_info)); + + if (x_input_info.quant_param.IsPerTensor(/*include_bw*/ true) && scale_input_info.quant_param.IsQuantized()) { + const std::string bias_name = qnn::utils::GetNodeName(node_unit) + "_implicit_bias_ort_qnn_ep"; + + // Make dummy bias input have the same shape as the scale input. + std::vector bias_shape = scale_input_info.shape; + size_t num_bias_elems = 1; + for (size_t i = 0; i < bias_shape.size(); i++) { + num_bias_elems *= static_cast(bias_shape[i]); + } + + // Bias static input should be all zeros. + std::vector bias_bytes(num_bias_elems * sizeof(int32_t), 0); + + // Bias's quantization scale should be the product of the other inputs' quantization scales. + std::vector input0_quant_scales; + std::vector input1_quant_scales; + ORT_RETURN_IF_ERROR(x_input_info.quant_param.GetScales(input0_quant_scales)); + ORT_RETURN_IF_ERROR(scale_input_info.quant_param.GetScales(input1_quant_scales)); + + const size_t num_bias_scales_offsets = input1_quant_scales.size(); + assert(input0_quant_scales.size() == 1); // Expected for per-tensor. + ORT_RETURN_IF_NOT(num_bias_scales_offsets >= input0_quant_scales.size(), + "Input[1] should have >= 1 quantization scale values"); + + std::vector bias_scales(num_bias_scales_offsets); + for (size_t i = 0; i < num_bias_scales_offsets; i++) { + bias_scales[i] = input0_quant_scales[0] * input1_quant_scales[i]; + } + + std::vector bias_offsets(num_bias_scales_offsets, 0); // Bias's zero-points should be all zeros. + QnnQuantParamsWrapper bias_qparams; + + if (scale_input_info.quant_param.IsPerChannel()) { + bias_qparams = QnnQuantParamsWrapper(bias_scales, bias_offsets, /*axis*/ 0, /*is_int4*/ false); + } else { + bias_qparams = QnnQuantParamsWrapper(bias_scales[0], bias_offsets[0]); + } + + auto tensor_wrapper = QnnTensorWrapper(bias_name, QNN_TENSOR_TYPE_STATIC, QNN_DATATYPE_SFIXED_POINT_32, + std::move(bias_qparams), std::move(bias_shape), std::move(bias_bytes)); + + qnn_model_wrapper.AddTensorWrapper(std::move(tensor_wrapper)); + input_names.push_back(bias_name); + } + } +#endif + + return Status::OK(); +} + Status LayerNormOpBuilder::ProcessAttributesAndOutputs(QnnModelWrapper& qnn_model_wrapper, const NodeUnit& node_unit, std::vector&& input_names, diff --git a/onnxruntime/core/providers/qnn/builder/qnn_model_wrapper.cc b/onnxruntime/core/providers/qnn/builder/qnn_model_wrapper.cc index f85cdc401a152..c8537307ef3ba 100644 --- a/onnxruntime/core/providers/qnn/builder/qnn_model_wrapper.cc +++ b/onnxruntime/core/providers/qnn/builder/qnn_model_wrapper.cc @@ -442,6 +442,16 @@ Status QnnModelWrapper::IsPerChannelQuantized(const onnxruntime::NodeUnitIODef& if (is_per_channel) { axis = io_def.quant_param->axis.value_or(1); // 1 is default axis for Q/DQ ops. + if (axis < 0) { + // Normalize negative axis by adding rank. 
+ const auto* tensor_shape_proto = io_def.node_arg.Shape(); + ORT_RETURN_IF_NOT(tensor_shape_proto != nullptr, "NULL tensor shape proto"); + + const int rank = tensor_shape_proto->dim_size(); + ORT_RETURN_IF_NOT(rank > 0, "Per-channel quantized tensor should be of rank > 0"); + + axis += rank; + } } return Status::OK(); diff --git a/onnxruntime/core/providers/qnn/builder/qnn_quant_params_wrapper.cc b/onnxruntime/core/providers/qnn/builder/qnn_quant_params_wrapper.cc index 2d22c3c1b8226..da2d517f65697 100644 --- a/onnxruntime/core/providers/qnn/builder/qnn_quant_params_wrapper.cc +++ b/onnxruntime/core/providers/qnn/builder/qnn_quant_params_wrapper.cc @@ -30,6 +30,7 @@ QnnQuantParamsWrapper& QnnQuantParamsWrapper::operator=(const QnnQuantParamsWrap return *this; } +// Construct per-tensor quantization params. QnnQuantParamsWrapper::QnnQuantParamsWrapper(float scale, int32_t offset) { params_.encodingDefinition = QNN_DEFINITION_DEFINED; params_.quantizationEncoding = QNN_QUANTIZATION_ENCODING_SCALE_OFFSET; @@ -37,6 +38,110 @@ QnnQuantParamsWrapper::QnnQuantParamsWrapper(float scale, int32_t offset) { params_.scaleOffsetEncoding.offset = offset; } +// Construct a per-channel quantization param. +QnnQuantParamsWrapper::QnnQuantParamsWrapper(gsl::span scales, gsl::span offsets, + int32_t axis, bool is_int4) { + assert(scales.size() == offsets.size()); // Logic error if sizes don't match. + const uint32_t num_elems = static_cast(scales.size()); + params_.encodingDefinition = QNN_DEFINITION_DEFINED; + + if (is_int4) { + params_.quantizationEncoding = QNN_QUANTIZATION_ENCODING_BW_AXIS_SCALE_OFFSET; + params_.bwAxisScaleOffsetEncoding.numElements = num_elems; + params_.bwAxisScaleOffsetEncoding.axis = axis; + params_.bwAxisScaleOffsetEncoding.bitwidth = 4; + + // Deep copy to the scales[] and offsets[] arrays + if (num_elems > 0) { + const size_t num_scale_bytes = num_elems * sizeof(float); + const size_t num_zp_bytes = num_elems * sizeof(int32_t); + const size_t num_bytes = num_scale_bytes + num_zp_bytes; + constexpr std::uintptr_t align = alignof(float); + static_assert(alignof(float) == alignof(int32_t)); + + per_channel_data_ = std::make_unique(num_bytes + align); + char* scales_begin = ALIGN_PTR_UP(per_channel_data_.get(), align, char*); + char* zps_begin = scales_begin + num_scale_bytes; + + std::memcpy(scales_begin, scales.data(), num_scale_bytes); + std::memcpy(zps_begin, offsets.data(), num_zp_bytes); + params_.bwAxisScaleOffsetEncoding.scales = reinterpret_cast(scales_begin); + params_.bwAxisScaleOffsetEncoding.offsets = reinterpret_cast(zps_begin); + } else { + params_.bwAxisScaleOffsetEncoding.scales = nullptr; + params_.bwAxisScaleOffsetEncoding.offsets = nullptr; + } + } else { + params_.quantizationEncoding = QNN_QUANTIZATION_ENCODING_AXIS_SCALE_OFFSET; + params_.axisScaleOffsetEncoding.numScaleOffsets = num_elems; + params_.axisScaleOffsetEncoding.axis = axis; + + // Deep copy to the scaleOffset data. 
+ if (num_elems > 0) { + const size_t num_bytes = num_elems * sizeof(Qnn_ScaleOffset_t); + constexpr std::uintptr_t align = alignof(Qnn_ScaleOffset_t); + per_channel_data_ = std::make_unique(num_bytes + align); + Qnn_ScaleOffset_t* aligned_dst = ALIGN_PTR_UP(per_channel_data_.get(), align, Qnn_ScaleOffset_t*); + + for (size_t i = 0; i < static_cast(num_elems); i++) { + aligned_dst[i].offset = offsets[i]; + aligned_dst[i].scale = scales[i]; + } + + params_.axisScaleOffsetEncoding.scaleOffset = aligned_dst; + } else { + params_.axisScaleOffsetEncoding.scaleOffset = nullptr; + } + } +} + +// Get a copy of scales. Works for both per-tensor and per-channel. +Status QnnQuantParamsWrapper::GetScales(/*out*/ std::vector& scales) const { + ORT_RETURN_IF_NOT(params_.encodingDefinition == QNN_DEFINITION_DEFINED, "Unquantized qparams does not have scales"); + + switch (params_.quantizationEncoding) { + case QNN_QUANTIZATION_ENCODING_SCALE_OFFSET: + scales.resize(1); + scales[0] = params_.scaleOffsetEncoding.scale; + break; + case QNN_QUANTIZATION_ENCODING_BW_SCALE_OFFSET: + scales.resize(1); + scales[0] = params_.bwScaleOffsetEncoding.scale; + break; + case QNN_QUANTIZATION_ENCODING_AXIS_SCALE_OFFSET: { + const uint32_t num_elems = params_.axisScaleOffsetEncoding.numScaleOffsets; + scales.resize(num_elems); + + if (num_elems > 0) { + gsl::span scale_offsets(params_.axisScaleOffsetEncoding.scaleOffset, num_elems); + + for (size_t i = 0; i < num_elems; i++) { + scales[i] = scale_offsets[i].scale; + } + } + break; + } + case QNN_QUANTIZATION_ENCODING_BW_AXIS_SCALE_OFFSET: { + const uint32_t num_elems = params_.bwAxisScaleOffsetEncoding.numElements; + scales.resize(num_elems); + + // Deep copy the scales[] and offsets[] arrays + if (num_elems > 0) { + gsl::span src_scales(params_.bwAxisScaleOffsetEncoding.scales, num_elems); + for (size_t i = 0; i < num_elems; i++) { + scales[i] = src_scales[i]; + } + } + break; + } + default: + return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "Unsupported QNN quantization encoding: ", + params_.quantizationEncoding); + } + + return Status::OK(); +} + QnnQuantParamsWrapper QnnQuantParamsWrapper::Copy() const { return QnnQuantParamsWrapper(*this); } @@ -199,7 +304,7 @@ Status QnnQuantParamsWrapper::Init(const QnnModelWrapper& qnn_model_wrapper, con params_.encodingDefinition = QNN_DEFINITION_DEFINED; params_.quantizationEncoding = QNN_QUANTIZATION_ENCODING_BW_AXIS_SCALE_OFFSET; - params_.bwAxisScaleOffsetEncoding.axis = static_cast(*(ort_quant_params->axis)); + params_.bwAxisScaleOffsetEncoding.axis = static_cast(axis); params_.bwAxisScaleOffsetEncoding.bitwidth = 4; params_.bwAxisScaleOffsetEncoding.numElements = static_cast(num_elems); diff --git a/onnxruntime/core/providers/qnn/builder/qnn_quant_params_wrapper.h b/onnxruntime/core/providers/qnn/builder/qnn_quant_params_wrapper.h index d1f93e5a692bc..23330f5616d73 100644 --- a/onnxruntime/core/providers/qnn/builder/qnn_quant_params_wrapper.h +++ b/onnxruntime/core/providers/qnn/builder/qnn_quant_params_wrapper.h @@ -3,6 +3,7 @@ #pragma once #include +#include #include "QnnTypes.h" #include "core/common/common.h" #include @@ -26,6 +27,9 @@ class QnnQuantParamsWrapper { // Construct a per-tensor quantization param (SCALE_OFFSET) QnnQuantParamsWrapper(float scale, int32_t offset); + // Construct a per-channel quantization param. 
+ QnnQuantParamsWrapper(gsl::span scales, gsl::span offsets, int32_t axis, bool is_int4); + Qnn_QuantizeParams_t& Get() { return params_; } const Qnn_QuantizeParams_t& Get() const { return params_; } @@ -54,6 +58,9 @@ class QnnQuantParamsWrapper { (params_.quantizationEncoding == QNN_QUANTIZATION_ENCODING_BW_AXIS_SCALE_OFFSET)); } + // Get a copy of scales. Works for both per-tensor and per-channel. + Status GetScales(/*out*/ std::vector& scales) const; + // Handle transposing of a per-channel quantized tensor. The quantization parameter's axis // must be transposed using the inverse permutation of the Transpose. template diff --git a/onnxruntime/test/providers/qnn/conv_test.cc b/onnxruntime/test/providers/qnn/conv_test.cc index b07951d2a2e6d..99636976b9c05 100644 --- a/onnxruntime/test/providers/qnn/conv_test.cc +++ b/onnxruntime/test/providers/qnn/conv_test.cc @@ -178,10 +178,14 @@ static GetTestQDQModelFn BuildQDQPerChannelConvTestCase(const s ORT_ENFORCE(weights_def.IsInitializer() && weights_def.IsRawData()); std::vector weight_scales; std::vector weight_zero_points; + TensorShape weights_shape = weights_def.GetTensorShape(); + int64_t pos_weight_quant_axis = weight_quant_axis; + if (pos_weight_quant_axis < 0) { + pos_weight_quant_axis += static_cast(weights_shape.NumDimensions()); + } GetTestInputQuantParamsPerChannel(weights_def, weight_scales, weight_zero_points, - static_cast(weight_quant_axis), true); + static_cast(pos_weight_quant_axis), true); - TensorShape weights_shape = weights_def.GetTensorShape(); std::vector quantized_weights; size_t num_weight_storage_elems = weights_shape.Size(); if constexpr (std::is_same_v || std::is_same_v) { @@ -189,7 +193,7 @@ static GetTestQDQModelFn BuildQDQPerChannelConvTestCase(const s } quantized_weights.resize(num_weight_storage_elems); QuantizeValues(weights_def.GetRawData(), quantized_weights, weights_shape, - weight_scales, weight_zero_points, weight_quant_axis); + weight_scales, weight_zero_points, pos_weight_quant_axis); NodeArg* weights_initializer = builder.MakeInitializer(weights_def.GetShape(), quantized_weights); NodeArg* weights_dq = builder.MakeIntermediate(); @@ -760,6 +764,34 @@ TEST_F(QnnHTPBackendTests, ConvU16S4S32_PerChannel) { 21); // opset } +// Test per-channel QDQ Conv with INT4 weights and a negative weight quantization axis that still points to dimension 0. +TEST_F(QnnHTPBackendTests, ConvU16S4S32_PerChannel_NegativeWeightQuantAxis) { + std::vector input_shape = {1, 2, 4, 4}; + std::vector weight_shape = {3, 2, 2, 2}; + std::vector bias_shape = {3}; + + TestInputDef input_def(input_shape, false, + GetFloatDataInRange(0.0f, 1.0f, TensorShape(input_shape).Size())); + TestInputDef weight_def(weight_shape, true, + GetFloatDataInRange(-1.0f, 5.0f, TensorShape(weight_shape).Size())); + TestInputDef bias_def(bias_shape, true, + GetFloatDataInRange(-1.0f, 1.0f, TensorShape(bias_shape).Size())); + + RunHTPConvOpPerChannelTest("Conv", + input_def, + weight_def, + bias_def, + -4, // negative weight quant axis (same as 0) + {1, 1}, // Strides + {0, 0, 0, 0}, // Pads + {1, 1}, // Dilations + 1, // default group + "NOTSET", + ExpectedEPNodeAssignment::All, + false, // use_qdq_contrib_ops + 21); // opset +} + // Test per-channel QDQ Conv with INT4 weights. in0: u16, in1 (weight): s4, in2 (bias): s32, out: u8 // TODO(adrianlizarraga): Investigate inaccuracy for QNN EP. 
// diff --git a/onnxruntime/test/providers/qnn/layer_norm_test.cc b/onnxruntime/test/providers/qnn/layer_norm_test.cc index 7d129dceca582..2af49a5e500d2 100644 --- a/onnxruntime/test/providers/qnn/layer_norm_test.cc +++ b/onnxruntime/test/providers/qnn/layer_norm_test.cc @@ -79,25 +79,53 @@ TEST_F(QnnCPUBackendTests, LayerNorm3D) { template GetTestQDQModelFn BuildQDQLayerNormTestCase(const TestInputDef& input_def, const TestInputDef& scale_def, + const TestInputDef& bias_def, const std::vector& attrs, bool use_contrib_qdq_ops) { - return [input_def, scale_def, attrs, use_contrib_qdq_ops](ModelTestBuilder& builder, - std::vector>& output_qparams) { + return [input_def, scale_def, bias_def, attrs, + use_contrib_qdq_ops](ModelTestBuilder& builder, + std::vector>& output_qparams) { + std::vector layer_norm_inputs; + // input -> Q -> DQ -> NodeArg* input = MakeTestInput(builder, input_def); QuantParams input_qparams = GetTestInputQuantParams(input_def); NodeArg* input_qdq = AddQDQNodePair(builder, input, input_qparams.scale, input_qparams.zero_point, use_contrib_qdq_ops); + layer_norm_inputs.push_back(input_qdq); - // scale input -> Q -> DQ -> - NodeArg* scale = MakeTestInput(builder, scale_def); + NodeArg* scale_qdq = nullptr; QuantParams scale_qparams = GetTestInputQuantParams(scale_def); - NodeArg* scale_qdq = AddQDQNodePair(builder, scale, scale_qparams.scale, scale_qparams.zero_point, - use_contrib_qdq_ops); + + if (scale_def.IsInitializer() && scale_def.IsRawData()) { + // Quantized(scale weights) -> DQ -> + std::vector scale_scales = {scale_qparams.scale}; + std::vector scale_zps = {scale_qparams.zero_point}; + TensorShape scale_shape = scale_def.GetTensorShape(); + std::vector quantized_scales(scale_shape.Size()); + QuantizeValues(scale_def.GetRawData(), quantized_scales, scale_shape, + scale_scales, scale_zps, std::nullopt); + + NodeArg* scale_initzer = builder.MakeInitializer(scale_def.GetShape(), quantized_scales); + scale_qdq = builder.MakeIntermediate(); + builder.AddDequantizeLinearNode(scale_initzer, scale_scales, scale_zps, scale_qdq, + nullptr, use_contrib_qdq_ops); + } else { + // scale input -> Q -> DQ -> + NodeArg* scale = MakeTestInput(builder, scale_def); + scale_qdq = AddQDQNodePair(builder, scale, scale_qparams.scale, scale_qparams.zero_point, + use_contrib_qdq_ops); + } + layer_norm_inputs.push_back(scale_qdq); + + if (!bias_def.GetShape().empty()) { + const float bias_scale = input_qparams.scale * scale_qparams.scale; + layer_norm_inputs.push_back(MakeTestQDQBiasInput(builder, bias_def, bias_scale, use_contrib_qdq_ops)); + } // LayerNormalization NodeArg* layer_norm_output = builder.MakeIntermediate(); - Node& layer_norm_node = builder.AddNode("LayerNormalization", {input_qdq, scale_qdq}, {layer_norm_output}); + Node& layer_norm_node = builder.AddNode("LayerNormalization", layer_norm_inputs, {layer_norm_output}); for (const auto& attr : attrs) { layer_norm_node.AddAttributeProto(attr); @@ -114,6 +142,7 @@ GetTestQDQModelFn BuildQDQLayerNormTestCase(const TestInputDef static void RunLayerNormQDQTest(const TestInputDef& input_def, const TestInputDef& scale_def, + const TestInputDef& bias_def, const std::vector& attrs, ExpectedEPNodeAssignment expected_ep_assignment, bool use_contrib_qdq_ops = false) { @@ -125,7 +154,7 @@ static void RunLayerNormQDQTest(const TestInputDef& input_def, #endif TestQDQModelAccuracy(BuildOpTestCase("LayerNormalization", {input_def, scale_def}, {}, attrs), - BuildQDQLayerNormTestCase(input_def, scale_def, attrs, + 
BuildQDQLayerNormTestCase(input_def, scale_def, bias_def, attrs, use_contrib_qdq_ops), provider_options, 17, // opset @@ -136,6 +165,7 @@ static void RunLayerNormQDQTest(const TestInputDef& input_def, TEST_F(QnnHTPBackendTests, LayerNorm1D_Axis0_Unsupported) { RunLayerNormQDQTest(TestInputDef({1, 2, 3}, false, 0.0f, 10.0f), TestInputDef({1, 2, 3}, true, 0.0f, 10.0f), + TestInputDef(), {utils::MakeAttribute("axis", static_cast(0))}, // Unsupported axis ExpectedEPNodeAssignment::None); } @@ -143,16 +173,40 @@ TEST_F(QnnHTPBackendTests, LayerNorm1D_Axis0_Unsupported) { // Test accuracy of 8-bit QDQ LayerNorm with a static scale input. TEST_F(QnnHTPBackendTests, LayerNorm1D_LastAxis_StaticScale_AU8_WU8) { RunLayerNormQDQTest(TestInputDef({1, 2, 3}, false, GetFloatDataInRange(0.0f, 10.0f, 6)), - TestInputDef({3}, true, GetFloatDataInRange(0.0f, 1.0f, 3)), // Static - {utils::MakeAttribute("axis", static_cast(-1))}, // Last axis + TestInputDef({3}, true, GetFloatDataInRange(0.0f, 1.0f, 3)), + TestInputDef(), // Implicit bias input + {utils::MakeAttribute("axis", static_cast(-1))}, ExpectedEPNodeAssignment::All); } +// Test accuracy of 8-bit QDQ LayerNorm with a static scale input and an explicit bias input (static). +TEST_F(QnnHTPBackendTests, LayerNorm1D_LastAxis_StaticScale_StaticBias_AU8_WU8_BU8) { + RunLayerNormQDQTest(TestInputDef({1, 2, 3}, false, GetFloatDataInRange(0.0f, 10.0f, 6)), + TestInputDef({3}, true, GetFloatDataInRange(0.0f, 1.0f, 3)), + TestInputDef({3}, true, GetFloatDataInRange(0.0f, 1.0f, 3)), + {utils::MakeAttribute("axis", static_cast(-1))}, + ExpectedEPNodeAssignment::All); +} + +TEST_F(QnnHTPBackendTests, LayerNorm1D_QNN2_24_ImplicitBias_ValidationBug) { + // QNN 2.24 LayerNorm fails validation (intermittent) if the bias input is not provided. QNN EP will provide an + // explicit bias of all zeros to get around this bug. + for (size_t i = 0; i < 15; i++) { // Run it multiple times since this is an intermittent bug. + RunLayerNormQDQTest(TestInputDef({1, 2, 3}, false, GetFloatDataInRange(0.0f, 1.0f, 6)), + TestInputDef({3}, true, GetFloatDataInRange(0.0f, 1.0f, 3)), + TestInputDef(), // Implicit bias input + {utils::MakeAttribute("axis", static_cast(-1))}, + ExpectedEPNodeAssignment::All, + true); + } +} + // Test accuracy of 16-bit QDQ LayerNorm with a static scale input. 
TEST_F(QnnHTPBackendTests, LayerNorm1D_LastAxis_StaticScale_AU16_WU8) { RunLayerNormQDQTest(TestInputDef({1, 2, 3}, false, GetFloatDataInRange(0.0f, 10.0f, 6)), TestInputDef({3}, true, GetFloatDataInRange(0.0f, 1.0f, 3)), // Static - {utils::MakeAttribute("axis", static_cast(-1))}, // Last axis + TestInputDef(), + {utils::MakeAttribute("axis", static_cast(-1))}, // Last axis ExpectedEPNodeAssignment::All, true); // Use 'com.microsoft' Q/DQ ops } @@ -174,7 +228,8 @@ TEST_F(QnnHTPBackendTests, LayerNorm1D_LastAxis_StaticScale_AU16_WU8) { TEST_F(QnnHTPBackendTests, DISABLED_LayerNorm1D_LastAxis_DynamicScale) { RunLayerNormQDQTest(TestInputDef({1, 2, 3}, false, GetFloatDataInRange(0.0f, 10.0f, 6)), TestInputDef({3}, false, GetFloatDataInRange(0.0f, 1.0f, 3)), // Dynamic - {utils::MakeAttribute("axis", static_cast(-1))}, // Last axis + TestInputDef(), + {utils::MakeAttribute("axis", static_cast(-1))}, // Last axis ExpectedEPNodeAssignment::All); } diff --git a/onnxruntime/test/providers/qnn/qnn_test_utils.h b/onnxruntime/test/providers/qnn/qnn_test_utils.h index ad54e644af3f7..eb03270dc8461 100644 --- a/onnxruntime/test/providers/qnn/qnn_test_utils.h +++ b/onnxruntime/test/providers/qnn/qnn_test_utils.h @@ -517,6 +517,9 @@ inline void TestQDQModelAccuracy(const GetTestModelFn& f32_model_fn, const GetTe ASSERT_STATUS_OK(f32_model.MainGraph().Resolve()); f32_model.ToProto().SerializeToString(&f32_model_data); + // Uncomment to save f32 model to disk for debugging. + // ASSERT_STATUS_OK(onnxruntime::Model::Save(f32_model, ToPathString("cmp_accuracy.f32.onnx"))); + // Run f32 model on CPU EP and collect outputs. std::vector cpu_f32_outputs; InferenceModel(f32_model_data, "f32_model_logger", {}, ExpectedEPNodeAssignment::All, @@ -556,6 +559,9 @@ inline void TestQDQModelAccuracy(const GetTestModelFn& f32_model_fn, const GetTe ASSERT_STATUS_OK(qdq_model.MainGraph().Resolve()); qdq_model.ToProto().SerializeToString(&qdq_model_data); + // Uncomment to save QDQ model to disk for debugging. 
+ // ASSERT_STATUS_OK(onnxruntime::Model::Save(qdq_model, ToPathString("cmp_accuracy.qdq.onnx"))); + bool is_qnn_ep = true; TryEnableQNNSaver(qnn_options); std::vector qnn_qdq_outputs; diff --git a/onnxruntime/test/providers/qnn/simple_op_htp_test.cc b/onnxruntime/test/providers/qnn/simple_op_htp_test.cc index f7dc5779ec5d9..2ebc2c6251b44 100644 --- a/onnxruntime/test/providers/qnn/simple_op_htp_test.cc +++ b/onnxruntime/test/providers/qnn/simple_op_htp_test.cc @@ -304,7 +304,7 @@ TEST_F(QnnHTPBackendTests, DISABLE_UnaryOp_Elu_U16) { // Expected val: 0 // QNN QDQ val: -10 (err 10) // CPU QDQ val: 0 (err 0) -TEST_F(QnnHTPBackendTests, DISABLED_UnaryOp_Relu) { +TEST_F(QnnHTPBackendTests, UnaryOp_Relu) { RunQDQOpTest("Relu", {TestInputDef({1, 2, 3}, false, GetFloatDataInRange(-10.0f, 10.0f, 6))}, {}, diff --git a/tools/ci_build/github/azure-pipelines/android-arm64-v8a-QNN-crosscompile-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/android-arm64-v8a-QNN-crosscompile-ci-pipeline.yml index a4a3d0e6b334b..6649206c0d79c 100644 --- a/tools/ci_build/github/azure-pipelines/android-arm64-v8a-QNN-crosscompile-ci-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/android-arm64-v8a-QNN-crosscompile-ci-pipeline.yml @@ -31,7 +31,7 @@ parameters: - name: QnnSdk displayName: QNN SDK version type: string - default: 2.23.0.240531 + default: 2.24.0.240626 jobs: - job: Build_QNN_EP diff --git a/tools/ci_build/github/azure-pipelines/c-api-noopenmp-packaging-pipelines.yml b/tools/ci_build/github/azure-pipelines/c-api-noopenmp-packaging-pipelines.yml index 700326fe9173c..2eb7046d80e7a 100644 --- a/tools/ci_build/github/azure-pipelines/c-api-noopenmp-packaging-pipelines.yml +++ b/tools/ci_build/github/azure-pipelines/c-api-noopenmp-packaging-pipelines.yml @@ -62,7 +62,7 @@ parameters: - name: QnnSdk displayName: QNN SDK Version type: string - default: 2.23.0.240531 + default: 2.24.0.240626 resources: repositories: diff --git a/tools/ci_build/github/azure-pipelines/linux-qnn-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/linux-qnn-ci-pipeline.yml index 29ebf67dd3f91..0d67b0947be53 100644 --- a/tools/ci_build/github/azure-pipelines/linux-qnn-ci-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/linux-qnn-ci-pipeline.yml @@ -32,7 +32,7 @@ parameters: - name: QnnSdk displayName: QNN SDK version type: string - default: 2.23.0.240531 + default: 2.24.0.240626 jobs: - job: Build_QNN_EP diff --git a/tools/ci_build/github/azure-pipelines/py-packaging-pipeline.yml b/tools/ci_build/github/azure-pipelines/py-packaging-pipeline.yml index 8d1b6b7854e50..cd3966633d742 100644 --- a/tools/ci_build/github/azure-pipelines/py-packaging-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/py-packaging-pipeline.yml @@ -59,7 +59,7 @@ parameters: - name: qnn_sdk_version type: string displayName: 'QNN SDK version. Only for QNN packages.' 
- default: 2.23.0.240531 + default: 2.24.0.240626 trigger: none diff --git a/tools/ci_build/github/azure-pipelines/qnn-ep-nuget-packaging-pipeline.yml b/tools/ci_build/github/azure-pipelines/qnn-ep-nuget-packaging-pipeline.yml index a8b12637b70f3..7229bc5dbd114 100644 --- a/tools/ci_build/github/azure-pipelines/qnn-ep-nuget-packaging-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/qnn-ep-nuget-packaging-pipeline.yml @@ -2,7 +2,7 @@ parameters: - name: QnnSdk displayName: QNN SDK Version type: string - default: 2.23.0.240531 + default: 2.24.0.240626 - name: build_config displayName: Build Configuration diff --git a/tools/ci_build/github/azure-pipelines/templates/jobs/download_linux_qnn_sdk.yml b/tools/ci_build/github/azure-pipelines/templates/jobs/download_linux_qnn_sdk.yml index ada3603ae8476..734ad43e0066d 100644 --- a/tools/ci_build/github/azure-pipelines/templates/jobs/download_linux_qnn_sdk.yml +++ b/tools/ci_build/github/azure-pipelines/templates/jobs/download_linux_qnn_sdk.yml @@ -1,7 +1,7 @@ parameters: - name: QnnSDKVersion type: string - default: '2.23.0.240531' + default: '2.24.0.240626' steps: - script: | diff --git a/tools/ci_build/github/azure-pipelines/templates/jobs/download_win_qnn_sdk.yml b/tools/ci_build/github/azure-pipelines/templates/jobs/download_win_qnn_sdk.yml index 3a68803896ab3..900adc9690255 100644 --- a/tools/ci_build/github/azure-pipelines/templates/jobs/download_win_qnn_sdk.yml +++ b/tools/ci_build/github/azure-pipelines/templates/jobs/download_win_qnn_sdk.yml @@ -1,7 +1,7 @@ parameters: - name: QnnSDKVersion type: string - default: '2.23.0.240531' + default: '2.24.0.240626' steps: - powershell: | diff --git a/tools/ci_build/github/azure-pipelines/templates/py-packaging-stage.yml b/tools/ci_build/github/azure-pipelines/templates/py-packaging-stage.yml index 17e64a207be2f..447e35244eb66 100644 --- a/tools/ci_build/github/azure-pipelines/templates/py-packaging-stage.yml +++ b/tools/ci_build/github/azure-pipelines/templates/py-packaging-stage.yml @@ -63,7 +63,7 @@ parameters: - name: qnn_sdk_version type: string displayName: 'QNN SDK version. Only for QNN packages.' 
- default: 2.23.0.240531 + default: 2.24.0.240626 stages: - ${{ if eq(parameters.enable_windows_cpu, true) }}: diff --git a/tools/ci_build/github/azure-pipelines/templates/py-win-arm64-qnn.yml b/tools/ci_build/github/azure-pipelines/templates/py-win-arm64-qnn.yml index 70221976d978f..40e8583141df8 100644 --- a/tools/ci_build/github/azure-pipelines/templates/py-win-arm64-qnn.yml +++ b/tools/ci_build/github/azure-pipelines/templates/py-win-arm64-qnn.yml @@ -7,7 +7,7 @@ parameters: - name: QNN_SDK displayName: QNN SDK Version type: string - default: 2.23.0.240531 + default: 2.24.0.240626 - name: PYTHON_VERSION type: string diff --git a/tools/ci_build/github/azure-pipelines/templates/py-win-x64-qnn.yml b/tools/ci_build/github/azure-pipelines/templates/py-win-x64-qnn.yml index 1bf5db5ae6d9a..33335bb2be2dd 100644 --- a/tools/ci_build/github/azure-pipelines/templates/py-win-x64-qnn.yml +++ b/tools/ci_build/github/azure-pipelines/templates/py-win-x64-qnn.yml @@ -7,7 +7,7 @@ parameters: - name: QNN_SDK displayName: QNN SDK Version type: string - default: 2.23.0.240531 + default: 2.24.0.240626 - name: ENV_SETUP_SCRIPT type: string diff --git a/tools/ci_build/github/azure-pipelines/templates/qnn-ep-win.yml b/tools/ci_build/github/azure-pipelines/templates/qnn-ep-win.yml index b4c4f36c5dcc6..944745b69ca63 100644 --- a/tools/ci_build/github/azure-pipelines/templates/qnn-ep-win.yml +++ b/tools/ci_build/github/azure-pipelines/templates/qnn-ep-win.yml @@ -1,5 +1,5 @@ parameters: - QnnSdk: '2.23.0.240531' + QnnSdk: '2.24.0.240626' build_config: 'RelWithDebInfo' IsReleaseBuild: false DoEsrp: false @@ -103,7 +103,7 @@ stages: - task: MSBuild@1 displayName: 'Restore NuGet Packages and create project.assets.json' inputs: - solution: '$(Build.SourcesDirectory)\csharp\OnnxRuntime.CSharp.sln' + solution: '$(Build.SourcesDirectory)\csharp\OnnxRuntime.DesktopOnly.CSharp.sln' platform: 'Any CPU' configuration: ${{ parameters.build_config }} msbuildArguments: '-t:restore -p:OrtPackageId=$(OrtPackageId)' @@ -112,7 +112,7 @@ stages: - task: MSBuild@1 displayName: 'Build C# bindings' inputs: - solution: '$(Build.SourcesDirectory)\csharp\OnnxRuntime.CSharp.sln' + solution: '$(Build.SourcesDirectory)\csharp\OnnxRuntime.DesktopOnly.CSharp.sln' platform: 'Any CPU' configuration: ${{ parameters.build_config }} msbuildArguments: '-p:OnnxRuntimeBuildDirectory="$(Build.BinariesDirectory)" -p:OrtPackageId=$(OrtPackageId) -p:IsReleaseBuild=${{ parameters.IsReleaseBuild }}' diff --git a/tools/ci_build/github/azure-pipelines/win-qnn-arm64-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/win-qnn-arm64-ci-pipeline.yml index 97745fd09fbf7..e1b8b718e9928 100644 --- a/tools/ci_build/github/azure-pipelines/win-qnn-arm64-ci-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/win-qnn-arm64-ci-pipeline.yml @@ -32,7 +32,7 @@ parameters: - name: QnnSdk displayName: QNN SDK version type: string - default: 2.23.0.240531 + default: 2.24.0.240626 jobs: - job: 'build' diff --git a/tools/ci_build/github/azure-pipelines/win-qnn-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/win-qnn-ci-pipeline.yml index 2ab81e16cd57e..97c4ab15095c9 100644 --- a/tools/ci_build/github/azure-pipelines/win-qnn-ci-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/win-qnn-ci-pipeline.yml @@ -32,7 +32,7 @@ parameters: - name: QnnSdk displayName: QNN SDK version type: string - default: 2.23.0.240531 + default: 2.24.0.240626 jobs: - job: 'build' From c203d89958b4b0b23ce6967decc48afdd06b7ddb Mon Sep 17 00:00:00 2001 From: Justin Chu Date: Wed, 24 
Jul 2024 11:50:11 -0700 Subject: [PATCH 11/57] Update ruff and clang-format versions (#21479) ruff -> 0.5.4 clang-format -> 18 --- cgmanifests/generate_cgmanifest.py | 2 +- .../tools/ValidateNativeDelegateAttributes.py | 2 +- include/onnxruntime/core/common/exceptions.h | 6 +- .../core/framework/stream_handles.h | 2 +- include/onnxruntime/core/platform/Barrier.h | 2 +- .../platform/EigenNonBlockingThreadPool.h | 10 +- .../core/providers/custom_op_context.h | 2 +- .../experimental_onnxruntime_cxx_api.h | 6 +- .../core/session/onnxruntime_cxx_api.h | 4 +- .../core/session/onnxruntime_lite_custom_op.h | 2 +- .../contrib_ops/cpu/cpu_contrib_kernels.cc | 134 +- .../contrib_ops/cpu/crop_and_resize.cc | 2 +- .../cuda/bert/flash_attention/alibi.h | 2 +- .../cuda/bert/flash_attention/mask.h | 2 +- .../cuda/bert/flash_attention/softmax.h | 2 +- .../contrib_ops/cuda/cuda_contrib_kernels.cc | 364 +- onnxruntime/core/framework/ex_lib_loader.h | 2 +- .../core/graph/contrib_ops/contrib_defs.cc | 4 +- .../transpose_optimization/optimizer_api.h | 8 +- onnxruntime/core/platform/path_lib.h | 6 +- .../core/providers/coreml/model/model.mm | 2 +- .../providers/cpu/cpu_execution_provider.cc | 3092 ++++++++--------- .../cpu/ml/tree_ensemble_classifier.cc | 18 +- .../core/providers/cpu/ml/treeregressor.cc | 18 +- .../object_detection/non_max_suppression.cc | 4 +- .../cpu/object_detection/roialign.cc | 6 +- .../core/providers/cpu/tensor/expand.cc | 10 +- .../providers/cuda/cuda_execution_provider.cc | 1788 +++++----- onnxruntime/core/providers/cuda/cuda_graph.h | 4 +- .../core/providers/cuda/cuda_profiler.h | 2 +- .../core/providers/cuda/nn/conv_transpose.h | 2 +- .../core/providers/cuda/nvtx_profile.h | 8 +- .../providers/cuda/shared_inc/cuda_utils.h | 2 +- .../core/providers/cuda/tensor/cast_op.cc | 32 +- .../providers/dnnl/dnnl_node_capability.h | 2 +- .../providers/dnnl/subgraph/dnnl_subgraph.h | 2 +- onnxruntime/core/providers/js/allocator.h | 2 +- onnxruntime/core/providers/js/data_transfer.h | 4 +- .../builder/opbuilder/expand_op_builder.cc | 2 +- .../qnn/builder/opbuilder/pad_op_builder.cc | 2 +- .../qnn/builder/qnn_quant_params_wrapper.cc | 2 +- .../core/providers/rocm/rocm_profiler.h | 2 +- .../shared_library/provider_host_api.h | 4 +- .../tensorrt_execution_provider_custom_ops.h | 4 +- .../vitisai/vitisai_provider_factory.cc | 2 +- .../builders/impl/elementwise_op_builder.h | 2 +- .../vsinpu/builders/op_builder_factory.h | 7 +- .../python/onnxruntime_pybind_iobinding.cc | 9 +- .../python/onnxruntime_pybind_ortvalue.cc | 136 +- .../onnxruntime_pybind_sparse_tensor.cc | 3 +- .../python/onnxruntime_pybind_state.cc | 88 +- onnxruntime/python/onnxruntime_validation.py | 17 +- .../tools/pytorch_export_contrib_ops.py | 2 +- .../python/tools/quantization/calibrate.py | 2 +- .../python/tools/symbolic_shape_infer.py | 44 +- .../python/tools/tensorrt/perf/benchmark.py | 12 +- .../python/tools/tensorrt/perf/perf_utils.py | 2 +- .../perf/setup_scripts/setup_onnx_zoo.py | 2 +- .../python/tools/transformers/benchmark.py | 2 +- .../tools/transformers/bert_test_data.py | 6 +- .../tools/transformers/fusion_attention.py | 2 +- .../python/tools/transformers/fusion_utils.py | 2 +- .../bart/utils/export_summarization_edinit.py | 2 +- .../export_summarization_enc_dec_past.py | 2 +- .../models/bart/utils/onnx_inference.py | 4 +- .../models/stable_diffusion/engine_builder.py | 2 +- .../pipeline_stable_diffusion.py | 4 +- .../test/framework/allocation_planner_test.cc | 2 +- onnxruntime/test/onnx/OrtValueList.h | 2 +- 
.../test/onnx/microbenchmark/activation.cc | 2 +- .../qdq_transformer_fastmath_test.cc | 2 +- .../test/optimizer/qdq_transformer_test.cc | 2 +- .../reduction_test_cases_generator.py | 8 +- .../test/providers/cpu/tensor/pad_test.cc | 12 +- .../test/providers/qnn/qnn_basic_test.cc | 10 +- .../test/python/onnx_backend_test_series.py | 2 +- .../test/python/transformers/rotary_flash.py | 3 - .../generate_tiny_keras2onnx_bert_models.py | 4 +- .../generate_tiny_gpt2_model.py | 4 +- onnxruntime/test/shared_lib/custom_op_utils.h | 20 +- onnxruntime/test/testdata/CNTK/gen.py | 4 +- .../core/framework/adasum/adasum_mpi.cc | 3 +- .../orttraining/core/framework/pipeline.h | 2 +- .../torch/custom_function_register.h | 2 +- .../orttraining/core/framework/torch/gil.h | 2 +- .../core/framework/torch/torch_proxy.h | 4 +- .../orttraining/core/graph/graph_augmenter.h | 6 +- .../core/graph/loss_func/loss_func_common.h | 2 +- .../core/graph/pipeline_transformer.cc | 2 +- .../core/optimizer/megatron_transformer.cc | 4 +- .../core/session/training_session.h | 6 +- orttraining/orttraining/lazy_tensor/flags.h | 2 +- orttraining/orttraining/models/bert/main.cc | 3 +- .../orttraining/models/pipeline_poc/main.cc | 54 +- .../orttraining/models/runner/training_util.h | 4 +- .../python/orttraining_pybind_state.cc | 24 +- .../python/training/ort_triton/kernel/_mm.py | 2 +- .../python/training/ortmodule/_utils.py | 2 +- .../cpu/torch_interop_utils/ctx_pool.h | 4 +- .../test/distributed/partition_utils.h | 2 +- ...orttraining_test_hierarchical_ortmodule.py | 2 +- .../orttraining_test_model_transform.py | 2 +- .../python/orttraining_test_ortmodule_api.py | 12 +- ...training_test_ortmodule_bert_classifier.py | 2 +- ...test_ortmodule_bert_classifier_autocast.py | 2 +- ...g_test_ortmodule_deepspeed_zero_stage_1.py | 2 +- .../orttraining_test_ortmodule_onnx_ops.py | 12 +- .../python/orttraining_test_ortmodule_poc.py | 2 +- .../test/python/orttraining_test_utilities.py | 4 +- .../training_ops/function_op_test_utils.cc | 2 +- .../cpu/torch/torch_custom_function_kernel.h | 2 +- .../cuda/cuda_training_kernels.cc | 466 +-- .../rocm/rocm_training_kernels.cc | 374 +- .../tools/scripts/gpt2_model_transform.py | 2 +- orttraining/tools/scripts/model_transform.py | 2 +- pyproject.toml | 1 + requirements-lintrunner.txt | 8 +- tools/ci_build/build.py | 22 +- tools/ci_build/gen_def.py | 10 +- tools/ci_build/reduce_op_kernels.py | 2 +- tools/ci_build/replace_urls_in_deps.py | 6 +- .../upload_python_package_to_azure_storage.py | 2 +- tools/doc/rename_folders.py | 14 +- .../nuget/generate_nuspec_for_native_nuget.py | 6 +- tools/python/onnx_test_data_utils.py | 2 +- .../util/mobile_helpers/usability_checker.py | 2 +- .../util/reduced_build_config_parser.py | 2 +- winml/lib/Api.Image/CpuDetensorizer.h | 9 +- winml/lib/Api.Image/CpuTensorizer.h | 12 +- winml/lib/Api.Image/D3DDeviceCache.cpp | 24 +- winml/lib/Api.Image/EventTimer.h | 4 +- .../lib/Api.Image/ImageConversionHelpers.cpp | 11 +- winml/lib/Api.Image/ImageConverter.cpp | 3 +- .../Api.Image/TensorToVideoFrameConverter.cpp | 13 +- .../Api.Image/VideoFrameToTensorConverter.cpp | 38 +- .../Api.Image/inc/ConverterResourceStore.h | 2 +- winml/lib/Api/FeatureValues.h | 82 +- winml/lib/Api/ImageFeatureValue.cpp | 14 +- winml/lib/Api/LearningModel.cpp | 2 +- winml/lib/Api/LearningModelSession.cpp | 4 +- winml/lib/Api/NumericData.cpp | 12 +- winml/lib/Api/impl/FeatureCompatibility.h | 6 +- winml/lib/Common/CommonDeviceHelpers.cpp | 6 +- ...er_backed_random_access_stream_reference.h | 5 +- 
winml/test/api/raw/winml_microsoft.h | 108 +- winml/test/api/raw/winml_windows.h | 112 +- winml/test/image/imagetests.cpp | 21 +- winml/test/model/compare_feature_value.cpp | 3 +- winml/test/model/model_tests.cpp | 10 +- winml/test/model/skip_model_tests.h | 6 +- winml/test/scenario/cppwinrt/NoisyReluCpu.h | 6 +- winml/test/scenario/cppwinrt/ReluCpu.h | 6 +- 152 files changed, 3781 insertions(+), 3842 deletions(-) diff --git a/cgmanifests/generate_cgmanifest.py b/cgmanifests/generate_cgmanifest.py index 3cecbb0cc977f..52bd3f58645f2 100644 --- a/cgmanifests/generate_cgmanifest.py +++ b/cgmanifests/generate_cgmanifest.py @@ -73,7 +73,7 @@ def add_github_dep(name, parsed_url): return # Make a REST call to convert to tag to a git commit url = f"https://api.github.com/repos/{org_name}/{repo_name}/git/refs/tags/{tag}" - print("requesting %s ..." % url) + print("requesting {url} ...") res = requests.get(url, auth=(args.username, args.token)) response_json = res.json() tag_object = response_json["object"] diff --git a/csharp/tools/ValidateNativeDelegateAttributes.py b/csharp/tools/ValidateNativeDelegateAttributes.py index acd6c173bfeb0..7431cc8d9d288 100644 --- a/csharp/tools/ValidateNativeDelegateAttributes.py +++ b/csharp/tools/ValidateNativeDelegateAttributes.py @@ -19,7 +19,7 @@ def check_all_delegates_have_unmanaged_function_pointer_attribute(file: pathlib. line_num = 0 with open(str(file.resolve(strict=True))) as f: prev_line = "" - for line in f.readlines(): + for line in f: line_num += 1 # strip so it's easier to deal with commented out lines. diff --git a/include/onnxruntime/core/common/exceptions.h b/include/onnxruntime/core/common/exceptions.h index 18c117f12ad7d..494a770b8db98 100644 --- a/include/onnxruntime/core/common/exceptions.h +++ b/include/onnxruntime/core/common/exceptions.h @@ -17,13 +17,13 @@ namespace onnxruntime { class NotImplementedException : public std::logic_error { public: - explicit NotImplementedException(const char* _Message = "Function not yet implemented") noexcept : std::logic_error(_Message){}; - explicit NotImplementedException(const std::string& _Message = "Function not yet implemented") noexcept : std::logic_error(_Message){}; + explicit NotImplementedException(const char* _Message = "Function not yet implemented") noexcept : std::logic_error(_Message) {}; + explicit NotImplementedException(const std::string& _Message = "Function not yet implemented") noexcept : std::logic_error(_Message) {}; }; class TypeMismatchException : public std::logic_error { public: - TypeMismatchException() noexcept : logic_error("Type mismatch"){}; + TypeMismatchException() noexcept : logic_error("Type mismatch") {}; }; class OnnxRuntimeException : public std::exception { diff --git a/include/onnxruntime/core/framework/stream_handles.h b/include/onnxruntime/core/framework/stream_handles.h index 9c987f10ccadb..01631e1fb2aa6 100644 --- a/include/onnxruntime/core/framework/stream_handles.h +++ b/include/onnxruntime/core/framework/stream_handles.h @@ -32,7 +32,7 @@ class Stream { return {}; }; // block the host thread until all the tasks in the stream finished. - virtual void Flush(){}; + virtual void Flush() {}; // The framework may reuse the stream instance for multiple iterations. // This is the API that provide a chance to let the device stream cleanup // resource at the end of a iteration. 
diff --git a/include/onnxruntime/core/platform/Barrier.h b/include/onnxruntime/core/platform/Barrier.h index 915cfc50953ed..1148b052bd9af 100644 --- a/include/onnxruntime/core/platform/Barrier.h +++ b/include/onnxruntime/core/platform/Barrier.h @@ -76,6 +76,6 @@ class Barrier { // Multiple threads can wait on the same Notification object, // but only one caller must call Notify() on the object. struct Notification : Barrier { - Notification() : Barrier(1){}; + Notification() : Barrier(1) {}; }; } // namespace onnxruntime diff --git a/include/onnxruntime/core/platform/EigenNonBlockingThreadPool.h b/include/onnxruntime/core/platform/EigenNonBlockingThreadPool.h index e33007102e198..d4411a6d72356 100644 --- a/include/onnxruntime/core/platform/EigenNonBlockingThreadPool.h +++ b/include/onnxruntime/core/platform/EigenNonBlockingThreadPool.h @@ -219,18 +219,18 @@ class ThreadPoolProfiler { WAIT_REVOKE, MAX_EVENT }; - ThreadPoolProfiler(int, const CHAR_TYPE*){}; + ThreadPoolProfiler(int, const CHAR_TYPE*) {}; ~ThreadPoolProfiler() = default; ORT_DISALLOW_COPY_ASSIGNMENT_AND_MOVE(ThreadPoolProfiler); - void Start(){}; + void Start() {}; std::string Stop() { return "not available for minimal build"; } - void LogStart(){}; + void LogStart() {}; void LogEnd(ThreadPoolEvent){}; void LogEndAndStart(ThreadPoolEvent){}; void LogStartAndCoreAndBlock(std::ptrdiff_t){}; void LogCoreAndBlock(std::ptrdiff_t){}; - void LogThreadId(int){}; - void LogRun(int){}; + void LogThreadId(int) {}; + void LogRun(int) {}; std::string DumpChildThreadStat() { return {}; } }; #else diff --git a/include/onnxruntime/core/providers/custom_op_context.h b/include/onnxruntime/core/providers/custom_op_context.h index 8f3d2476d4fdb..b10126da8e0fb 100644 --- a/include/onnxruntime/core/providers/custom_op_context.h +++ b/include/onnxruntime/core/providers/custom_op_context.h @@ -6,5 +6,5 @@ // CustomOpContext defines an interface allowing a custom op to access ep-specific resources. 
struct CustomOpContext { CustomOpContext() = default; - virtual ~CustomOpContext(){}; + virtual ~CustomOpContext() {}; }; \ No newline at end of file diff --git a/include/onnxruntime/core/session/experimental_onnxruntime_cxx_api.h b/include/onnxruntime/core/session/experimental_onnxruntime_cxx_api.h index 9e4ceffc44bfd..c1a7839ff22fa 100644 --- a/include/onnxruntime/core/session/experimental_onnxruntime_cxx_api.h +++ b/include/onnxruntime/core/session/experimental_onnxruntime_cxx_api.h @@ -24,9 +24,9 @@ namespace Experimental { struct Session : Ort::Session { Session(Env& env, std::basic_string& model_path, SessionOptions& options) - : Ort::Session(env, model_path.data(), options){}; + : Ort::Session(env, model_path.data(), options) {}; Session(Env& env, void* model_data, size_t model_data_length, SessionOptions& options) - : Ort::Session(env, model_data, model_data_length, options){}; + : Ort::Session(env, model_data, model_data_length, options) {}; // overloaded Run() with sensible defaults std::vector Run(const std::vector& input_names, @@ -52,7 +52,7 @@ struct Session : Ort::Session { struct Value : Ort::Value { Value(OrtValue* p) - : Ort::Value(p){}; + : Ort::Value(p) {}; template static Ort::Value CreateTensor(T* p_data, size_t p_data_element_count, const std::vector& shape); diff --git a/include/onnxruntime/core/session/onnxruntime_cxx_api.h b/include/onnxruntime/core/session/onnxruntime_cxx_api.h index 8091fd4cfc2a3..5d974e1ff5185 100644 --- a/include/onnxruntime/core/session/onnxruntime_cxx_api.h +++ b/include/onnxruntime/core/session/onnxruntime_cxx_api.h @@ -2175,8 +2175,8 @@ struct Op : detail::Base { /// struct ShapeInferContext { struct SymbolicInteger { - SymbolicInteger(int64_t i) : i_(i), is_int_(true){}; - SymbolicInteger(const char* s) : s_(s), is_int_(false){}; + SymbolicInteger(int64_t i) : i_(i), is_int_(true) {}; + SymbolicInteger(const char* s) : s_(s), is_int_(false) {}; SymbolicInteger(const SymbolicInteger&) = default; SymbolicInteger(SymbolicInteger&&) = default; diff --git a/include/onnxruntime/core/session/onnxruntime_lite_custom_op.h b/include/onnxruntime/core/session/onnxruntime_lite_custom_op.h index 57a64380faeb0..ce87d8c56d3fe 100644 --- a/include/onnxruntime/core/session/onnxruntime_lite_custom_op.h +++ b/include/onnxruntime/core/session/onnxruntime_lite_custom_op.h @@ -29,7 +29,7 @@ class ArgBase { ArgBase(OrtKernelContext* ctx, size_t indice, bool is_input) : ctx_(ctx), indice_(indice), is_input_(is_input) {} - virtual ~ArgBase(){}; + virtual ~ArgBase() {}; protected: struct KernelContext ctx_; diff --git a/onnxruntime/contrib_ops/cpu/cpu_contrib_kernels.cc b/onnxruntime/contrib_ops/cpu/cpu_contrib_kernels.cc index 90a51fda0b188..84f9ca88ecf55 100644 --- a/onnxruntime/contrib_ops/cpu/cpu_contrib_kernels.cc +++ b/onnxruntime/contrib_ops/cpu/cpu_contrib_kernels.cc @@ -267,83 +267,83 @@ Status RegisterQuantizationKernels(KernelRegistry& kernel_registry) { Status RegisterCpuContribKernels(KernelRegistry& kernel_registry) { static const BuildKernelCreateInfoFn function_table[] = { - BuildKernelCreateInfo, // default entry to avoid the list become empty after ops-reducing - BuildKernelCreateInfo, + BuildKernelCreateInfo, // default entry to avoid the list become empty after ops-reducing + BuildKernelCreateInfo, - // add more kernels here - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, 
- BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, + // add more kernels here + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, #if !defined(DISABLE_SPARSE_TENSORS) - BuildKernelCreateInfo, + BuildKernelCreateInfo, #endif - BuildKernelCreateInfo, - BuildKernelCreateInfo, // backward compatibility - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, // backward compatibility + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, #ifndef ORT_MINIMAL_BUILD - BuildKernelCreateInfo, + BuildKernelCreateInfo, #endif - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - // These ops were experimental ops in onnx domain which have been removed now. We add them here as - // contrib ops to main backward compatibility - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + // These ops were experimental ops in onnx domain which have been removed now. We add them here as + // contrib ops to main backward compatibility + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, #ifdef ENABLE_ATEN - BuildKernelCreateInfo, + BuildKernelCreateInfo, #endif #ifdef ENABLE_TRAINING_OPS - // Should remove the shrunken_gather include from ENABLE_TRAINING_OPS once 1). compute optimizer is enabled for inference or - // 2). this is needed by inference for other purpose. - BuildKernelCreateInfo, + // Should remove the shrunken_gather include from ENABLE_TRAINING_OPS once 1). 
compute optimizer is enabled for inference or + // 2). this is needed by inference for other purpose. + BuildKernelCreateInfo, #endif }; diff --git a/onnxruntime/contrib_ops/cpu/crop_and_resize.cc b/onnxruntime/contrib_ops/cpu/crop_and_resize.cc index 1863522c1643c..533d62f5e7486 100644 --- a/onnxruntime/contrib_ops/cpu/crop_and_resize.cc +++ b/onnxruntime/contrib_ops/cpu/crop_and_resize.cc @@ -173,7 +173,7 @@ void CropAndResizeForward(const TensorShape& output_shape, } } } // for pw - } // for ph + } // for ph }, 0); // for n } diff --git a/onnxruntime/contrib_ops/cuda/bert/flash_attention/alibi.h b/onnxruntime/contrib_ops/cuda/bert/flash_attention/alibi.h index 5d94190ecbeb9..18d36cfd88d60 100644 --- a/onnxruntime/contrib_ops/cuda/bert/flash_attention/alibi.h +++ b/onnxruntime/contrib_ops/cuda/bert/flash_attention/alibi.h @@ -17,7 +17,7 @@ struct Alibi { const int max_seqlen_k, max_seqlen_q; __forceinline__ __device__ Alibi(const float alibi_slope, const int max_seqlen_k, const int max_seqlen_q) - : alibi_slope(alibi_slope), max_seqlen_k(max_seqlen_k), max_seqlen_q(max_seqlen_q){}; + : alibi_slope(alibi_slope), max_seqlen_k(max_seqlen_k), max_seqlen_q(max_seqlen_q) {}; template __forceinline__ __device__ void apply_alibi(Tensor& tensor, diff --git a/onnxruntime/contrib_ops/cuda/bert/flash_attention/mask.h b/onnxruntime/contrib_ops/cuda/bert/flash_attention/mask.h index b225e5e3be559..0998155eba635 100644 --- a/onnxruntime/contrib_ops/cuda/bert/flash_attention/mask.h +++ b/onnxruntime/contrib_ops/cuda/bert/flash_attention/mask.h @@ -116,7 +116,7 @@ struct Mask { __forceinline__ __device__ Mask(const int max_seqlen_k, const int max_seqlen_q, const int window_size_left, const int window_size_right, const float alibi_slope = 0.f) - : max_seqlen_k(max_seqlen_k), max_seqlen_q(max_seqlen_q), window_size_left(window_size_left), window_size_right(window_size_right), alibi_slope(!Has_alibi ? 0.0 : alibi_slope){}; + : max_seqlen_k(max_seqlen_k), max_seqlen_q(max_seqlen_q), window_size_left(window_size_left), window_size_right(window_size_right), alibi_slope(!Has_alibi ? 
0.0 : alibi_slope) {}; // Causal_mask: whether this particular iteration needs causal masking template diff --git a/onnxruntime/contrib_ops/cuda/bert/flash_attention/softmax.h b/onnxruntime/contrib_ops/cuda/bert/flash_attention/softmax.h index 3c205378f0177..ba678b740d376 100644 --- a/onnxruntime/contrib_ops/cuda/bert/flash_attention/softmax.h +++ b/onnxruntime/contrib_ops/cuda/bert/flash_attention/softmax.h @@ -121,7 +121,7 @@ struct Softmax { using TensorT = decltype(make_tensor(Shape>{})); TensorT row_max, row_sum; - __forceinline__ __device__ Softmax(){}; + __forceinline__ __device__ Softmax() {}; template __forceinline__ __device__ void softmax_rescale_o(Tensor0& acc_s, Tensor1& acc_o, float softmax_scale_log2) { diff --git a/onnxruntime/contrib_ops/cuda/cuda_contrib_kernels.cc b/onnxruntime/contrib_ops/cuda/cuda_contrib_kernels.cc index b237e5c24bbef..21bd5eb91c20f 100644 --- a/onnxruntime/contrib_ops/cuda/cuda_contrib_kernels.cc +++ b/onnxruntime/contrib_ops/cuda/cuda_contrib_kernels.cc @@ -231,206 +231,206 @@ KernelCreateInfo BuildKernelCreateInfo() { Status RegisterCudaContribKernels(KernelRegistry& kernel_registry) { static const BuildKernelCreateInfoFn function_table[] = { - BuildKernelCreateInfo, // default entry to avoid the list become empty after ops-reducing - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, // backward compatibility - BuildKernelCreateInfo, // backward compatibility - BuildKernelCreateInfo, // backward compatibility - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - - // These ops were experimental ops in onnx domain which have been removed now. 
We add them here as - // contrib ops to maintain backward compatibility - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - // TransposedMatMul is still here for backward compatibility - BuildKernelCreateInfo, // backward compatibility - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, + BuildKernelCreateInfo, // default entry to avoid the list become empty after ops-reducing + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, // backward compatibility + BuildKernelCreateInfo, // backward compatibility + BuildKernelCreateInfo, // backward compatibility + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + 
BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + + // These ops were experimental ops in onnx domain which have been removed now. We add them here as + // contrib ops to maintain backward compatibility + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + // TransposedMatMul is still here for backward compatibility + BuildKernelCreateInfo, // backward compatibility + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, #ifdef ENABLE_ATEN - BuildKernelCreateInfo, + BuildKernelCreateInfo, #endif #ifdef ENABLE_TRAINING_OPS - // Should remove the shrunken_gather include from ENABLE_TRAINING_OPS once - // 1). compute optimizer is enabled for inference or - // 2). 
this is needed by inference for other purpose. - BuildKernelCreateInfo, + // Should remove the shrunken_gather include from ENABLE_TRAINING_OPS once + // 1). compute optimizer is enabled for inference or + // 2). this is needed by inference for other purpose. + BuildKernelCreateInfo, #endif #if defined(ORT_USE_NCCL) - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, #endif }; diff --git a/onnxruntime/core/framework/ex_lib_loader.h b/onnxruntime/core/framework/ex_lib_loader.h index cc353a7521786..d7ea5db3e5a26 100644 --- a/onnxruntime/core/framework/ex_lib_loader.h +++ b/onnxruntime/core/framework/ex_lib_loader.h @@ -20,7 +20,7 @@ class ExLibLoader { virtual ~ExLibLoader(); protected: - virtual void PreUnloadLibrary(void* /*handle*/){}; + virtual void PreUnloadLibrary(void* /*handle*/) {}; std::map dso_name_data_map_; diff --git a/onnxruntime/core/graph/contrib_ops/contrib_defs.cc b/onnxruntime/core/graph/contrib_ops/contrib_defs.cc index dea8775c89a30..2d51658953282 100644 --- a/onnxruntime/core/graph/contrib_ops/contrib_defs.cc +++ b/onnxruntime/core/graph/contrib_ops/contrib_defs.cc @@ -2665,10 +2665,10 @@ ONNX_MS_OPERATOR_SET_SCHEMA(CropAndResize, 1, #if !defined(DISABLE_FLOAT8_TYPES) #define GEMM_FLOAT8_TYPES \ - { "tensor(float8e4m3fn)", "tensor(float8e5m2)", "tensor(float16)", "tensor(bfloat16)", "tensor(float)" } + {"tensor(float8e4m3fn)", "tensor(float8e5m2)", "tensor(float16)", "tensor(bfloat16)", "tensor(float)"} #else #define GEMM_FLOAT8_TYPES \ - { "tensor(float16)", "tensor(bfloat16)", "tensor(float)" } + {"tensor(float16)", "tensor(bfloat16)", "tensor(float)"} #endif ONNX_MS_OPERATOR_SET_SCHEMA(GemmFloat8, 1, diff --git a/onnxruntime/core/optimizer/transpose_optimization/optimizer_api.h b/onnxruntime/core/optimizer/transpose_optimization/optimizer_api.h index c042bb0059ac2..e7d2d32809fc5 100644 --- a/onnxruntime/core/optimizer/transpose_optimization/optimizer_api.h +++ b/onnxruntime/core/optimizer/transpose_optimization/optimizer_api.h @@ -86,7 +86,7 @@ class TensorRef { /// Flattened tensor data in bytes virtual std::vector Data() const = 0; - virtual ~TensorRef(){}; + virtual ~TensorRef() {}; }; /// @@ -131,7 +131,7 @@ class ValueInfoRef { /// Indices of dimensions to add. 
Indices are relative to final shape. virtual void UnsqueezeDims(const std::vector& axes) = 0; - virtual ~ValueInfoRef(){}; + virtual ~ValueInfoRef() {}; }; /// @@ -248,7 +248,7 @@ class NodeRef { /// Id virtual int64_t Id() const = 0; - virtual ~NodeRef(){}; + virtual ~NodeRef() {}; }; /// @@ -449,7 +449,7 @@ class GraphRef { /// True if output of the Graph. virtual bool IsGraphOutput(std::string_view name) const = 0; - virtual ~GraphRef(){}; + virtual ~GraphRef() {}; }; } // namespace api diff --git a/onnxruntime/core/platform/path_lib.h b/onnxruntime/core/platform/path_lib.h index fca8990f14821..94425a3999d42 100644 --- a/onnxruntime/core/platform/path_lib.h +++ b/onnxruntime/core/platform/path_lib.h @@ -228,11 +228,9 @@ inline std::basic_string GetLastComponent(const std::basic_strin typename std::basic_string::size_type pos = input.length(); PATH_CHAR_TYPE sep = GetPathSep(); // remove trailing backslash - for (; pos > 1 && input[pos - 1] == sep; --pos) - ; + for (; pos > 1 && input[pos - 1] == sep; --pos); input.resize(pos); - for (; pos != 0 && input[pos - 1] != sep; --pos) - ; + for (; pos != 0 && input[pos - 1] != sep; --pos); return input.substr(pos); } diff --git a/onnxruntime/core/providers/coreml/model/model.mm b/onnxruntime/core/providers/coreml/model/model.mm index 4fd822f0d0d15..4d20061820e71 100644 --- a/onnxruntime/core/providers/coreml/model/model.mm +++ b/onnxruntime/core/providers/coreml/model/model.mm @@ -502,7 +502,7 @@ Status GetMLMultiArrayCopyInfo(const MLMultiArray* _Nonnull array, class Execution { public: Execution(const std::string& path, const logging::Logger& logger, uint32_t coreml_flags); - ~Execution(){}; + ~Execution() {}; Status LoadModel(); Status Predict(const std::unordered_map& inputs, diff --git a/onnxruntime/core/providers/cpu/cpu_execution_provider.cc b/onnxruntime/core/providers/cpu/cpu_execution_provider.cc index 9147107ac518a..7ac68e3a9a69d 100644 --- a/onnxruntime/core/providers/cpu/cpu_execution_provider.cc +++ b/onnxruntime/core/providers/cpu/cpu_execution_provider.cc @@ -1133,1568 +1133,1568 @@ KernelCreateInfo BuildKernelCreateInfo() { Status RegisterOnnxOperatorKernels(KernelRegistry& kernel_registry) { static const BuildKernelCreateInfoFn function_table[] = { - BuildKernelCreateInfo, // default entry to avoid the list become empty after ops-reducing - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - 
BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - 
BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - - // Opset 9 - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - - // Opset 10 - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - // opset 11 - BuildKernelCreateInfo, - - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - 
BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - - // OpSet 12 - BuildKernelCreateInfo, - - BuildKernelCreateInfo, - - BuildKernelCreateInfo, - - BuildKernelCreateInfo, - - BuildKernelCreateInfo, - - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - // REVIEW(codemzs): ConstEigenVectorArrayMap.cast, - // BuildKernelCreateInfo, BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - - // opset 13 - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - 
BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - 
BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - - // OpSet 14 - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - - // Opset 15 - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, + BuildKernelCreateInfo, // default entry to avoid the list become empty after ops-reducing + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + 
BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + + // Opset 9 + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + 
BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + + // Opset 10 + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + // opset 11 + BuildKernelCreateInfo, + + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, 
+ BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + + // OpSet 12 + BuildKernelCreateInfo, + + BuildKernelCreateInfo, + + BuildKernelCreateInfo, + + BuildKernelCreateInfo, + + BuildKernelCreateInfo, + + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + // REVIEW(codemzs): ConstEigenVectorArrayMap.cast, + // BuildKernelCreateInfo, BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + + // opset 13 + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + 
BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + 
BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + + // OpSet 14 + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + + // Opset 15 + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, #if !defined(DISABLE_OPTIONAL_TYPE) - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, #endif - // Opset 16 - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - - // Opset 17 - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - - // Opset 18 - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - 
BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, + // Opset 16 + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + + // Opset 17 + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + + // Opset 18 + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, #if !defined(DISABLE_OPTIONAL_TYPE) - BuildKernelCreateInfo, - BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, #endif - // Opset 19 - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, + 
// Opset 19 + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, #if !defined(DISABLE_FLOAT8_TYPES) - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, #endif - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, #if !defined(DISABLE_FLOAT8_TYPES) - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, #endif - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - - // Opset 20 - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + + // Opset 20 + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, #if !defined(DISABLE_FLOAT8_TYPES) - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, #endif - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - - // Opset 21 - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - 
BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + + // Opset 21 + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, #if !defined(DISABLE_FLOAT8_TYPES) - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, #endif - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, #if !defined(DISABLE_FLOAT8_TYPES) - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, #endif }; diff --git a/onnxruntime/core/providers/cpu/ml/tree_ensemble_classifier.cc b/onnxruntime/core/providers/cpu/ml/tree_ensemble_classifier.cc index 0c45b315f0280..758066d8a84e0 100644 --- a/onnxruntime/core/providers/cpu/ml/tree_ensemble_classifier.cc +++ b/onnxruntime/core/providers/cpu/ml/tree_ensemble_classifier.cc @@ -41,16 +41,16 @@ TreeEnsembleClassifier::TreeEnsembleClassifier(const OpKernelInfo& info) : Op template Status TreeEnsembleClassifier::GetRemovableAttributes(InlinedVector& removable_attributes) const { - InlinedVector names { - "base_values", "nodes_falsenodeids", "nodes_featureids", "nodes_hitrates", - "nodes_missing_value_tracks_true", "nodes_modes", "nodes_nodeids", "nodes_treeids", - "nodes_truenodeids", "nodes_values", "class_ids", "class_treeids", "class_nodeids", - "class_weights", "classlabels_strings", - "classlabels_int64s" + InlinedVector names{ + "base_values", "nodes_falsenodeids", "nodes_featureids", "nodes_hitrates", + "nodes_missing_value_tracks_true", "nodes_modes", "nodes_nodeids", "nodes_treeids", + "nodes_truenodeids", "nodes_values", "class_ids", "class_treeids", "class_nodeids", + "class_weights", "classlabels_strings", + "classlabels_int64s" #if !defined(ORT_MINIMAL_BUILD) - "base_values_as_tensor", - "nodes_hitrates_as_tensor", "nodes_values_as_tensor", - "class_weights_as_tensor" + "base_values_as_tensor", + "nodes_hitrates_as_tensor", "nodes_values_as_tensor", + "class_weights_as_tensor" #endif }; removable_attributes.swap(names); diff --git a/onnxruntime/core/providers/cpu/ml/treeregressor.cc b/onnxruntime/core/providers/cpu/ml/treeregressor.cc index 17f5cf32960da..6b5b972d3c929 100644 --- a/onnxruntime/core/providers/cpu/ml/treeregressor.cc +++ b/onnxruntime/core/providers/cpu/ml/treeregressor.cc @@ -48,16 +48,16 @@ TreeEnsembleRegressor::TreeEnsembleRegressor(const OpKernelInfo& info) : OpKe template Status 
TreeEnsembleRegressor::GetRemovableAttributes(InlinedVector& removable_attributes) const { - InlinedVector names { - "base_values", "nodes_falsenodeids", "nodes_featureids", "nodes_hitrates", - "nodes_missing_value_tracks_true", "nodes_modes", "nodes_nodeids", "nodes_treeids", - "nodes_truenodeids", "nodes_values", - "target_ids", "target_treeids", "target_nodeids", - "target_weights" + InlinedVector names{ + "base_values", "nodes_falsenodeids", "nodes_featureids", "nodes_hitrates", + "nodes_missing_value_tracks_true", "nodes_modes", "nodes_nodeids", "nodes_treeids", + "nodes_truenodeids", "nodes_values", + "target_ids", "target_treeids", "target_nodeids", + "target_weights" #if !defined(ORT_MINIMAL_BUILD) - "base_values_as_tensor", - "nodes_hitrates_as_tensor", "nodes_values_as_tensor", - "class_weights_as_tensor" + "base_values_as_tensor", + "nodes_hitrates_as_tensor", "nodes_values_as_tensor", + "class_weights_as_tensor" #endif }; removable_attributes.swap(names); diff --git a/onnxruntime/core/providers/cpu/object_detection/non_max_suppression.cc b/onnxruntime/core/providers/cpu/object_detection/non_max_suppression.cc index 4a176b0726a18..721c2064fae03 100644 --- a/onnxruntime/core/providers/cpu/object_detection/non_max_suppression.cc +++ b/onnxruntime/core/providers/cpu/object_detection/non_max_suppression.cc @@ -195,8 +195,8 @@ Status NonMaxSuppression::Compute(OpKernelContext* ctx) const { } sorted_boxes.pop(); } // while - } // for class_index - } // for batch_index + } // for class_index + } // for batch_index constexpr auto last_dim = 3; const auto num_selected = selected_indices.size(); diff --git a/onnxruntime/core/providers/cpu/object_detection/roialign.cc b/onnxruntime/core/providers/cpu/object_detection/roialign.cc index ead2ccaef002e..d8c81e5cb63e5 100644 --- a/onnxruntime/core/providers/cpu/object_detection/roialign.cc +++ b/onnxruntime/core/providers/cpu/object_detection/roialign.cc @@ -251,9 +251,9 @@ void RoiAlignForward(const TensorShape& output_shape, const T* bottom_data, floa top_data[index] = output_val; } // for pw - } // for ph - } // for c - } // for n + } // for ph + } // for c + } // for n }); } } // namespace diff --git a/onnxruntime/core/providers/cpu/tensor/expand.cc b/onnxruntime/core/providers/cpu/tensor/expand.cc index 6ead2ea73460b..b0c636281bc7a 100644 --- a/onnxruntime/core/providers/cpu/tensor/expand.cc +++ b/onnxruntime/core/providers/cpu/tensor/expand.cc @@ -128,7 +128,7 @@ Status Expand::Compute(OpKernelContext* context) const { memcpy(output_data + output_offset, input_data + input_offset, onnxruntime::narrow(copy_byte)); output_offsets[onnxruntime::narrow(i)] = output_offset; } // for i - }; // distribute_fn + }; // distribute_fn auto per_thread_tasks = distribute_count / concurrency::ThreadPool::DegreeOfParallelism(context->GetOperatorThreadPool()); @@ -169,9 +169,9 @@ Status Expand::Compute(OpKernelContext* context) const { copy_byte >>= 1; } } // while - } // if - } // for - }; // copy_fn + } // if + } // for + }; // copy_fn if (per_thread_tasks > 20) { concurrency::ThreadPool::TryParallelFor( context->GetOperatorThreadPool(), @@ -181,7 +181,7 @@ Status Expand::Compute(OpKernelContext* context) const { } else { copy_fn(0, onnxruntime::narrow(distribute_count)); } // else - } // for + } // for return Status::OK(); } // Expand::compute diff --git a/onnxruntime/core/providers/cuda/cuda_execution_provider.cc b/onnxruntime/core/providers/cuda/cuda_execution_provider.cc index 8c03e489d298d..5771380433b35 100644 --- 
a/onnxruntime/core/providers/cuda/cuda_execution_provider.cc
+++ b/onnxruntime/core/providers/cuda/cuda_execution_provider.cc
@@ -1394,916 +1394,916 @@ KernelCreateInfo BuildKernelCreateInfo() {
 static Status RegisterCudaKernels(KernelRegistry& kernel_registry) {
   static const BuildKernelCreateInfoFn function_table[] = {
[... bulk of this 916-line hunk: the CUDA execution provider's BuildKernelCreateInfo<...> registration entries (the default entry plus the base-opset kernels and the sections commented "// opset 10" through "// Opset 19", most of them inside the #ifndef USE_CUDA_MINIMAL block), removed and re-added in place as a formatting-only change; the template arguments are not shown here. The float8-guarded Opset 19 entries and the Opset 20 section follow below. ...]
#if
!defined(DISABLE_FLOAT8_TYPES) - BuildKernelCreateInfo, - BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, #endif - BuildKernelCreateInfo, - BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, #if !defined(DISABLE_FLOAT8_TYPES) - BuildKernelCreateInfo, - BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, #endif - BuildKernelCreateInfo, - BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, #if !defined(DISABLE_FLOAT8_TYPES) - BuildKernelCreateInfo, - BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, #endif - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, #if !defined(DISABLE_FLOAT8_TYPES) - BuildKernelCreateInfo, - BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, #endif - BuildKernelCreateInfo, - BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, #if !defined(DISABLE_FLOAT8_TYPES) - BuildKernelCreateInfo, - BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, #endif - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, - // Opset 20 - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, + // Opset 20 + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, #endif }; diff --git a/onnxruntime/core/providers/cuda/cuda_graph.h b/onnxruntime/core/providers/cuda/cuda_graph.h index 064994c1f14ae..dd03db94b631c 100644 --- a/onnxruntime/core/providers/cuda/cuda_graph.h +++ b/onnxruntime/core/providers/cuda/cuda_graph.h @@ -18,7 +18,7 @@ constexpr CudaGraphAnnotation_t kCudaGraphAnnotationSkip = -1; constexpr CudaGraphAnnotation_t kCudaGraphAnnotationDefault = 0; struct CudaGraphSet { - CudaGraphSet(){}; + CudaGraphSet() {}; ~CudaGraphSet(); void Clear(); @@ -31,7 +31,7 @@ struct CudaGraphSet { }; struct CUDAGraphManager { - CUDAGraphManager(){}; + CUDAGraphManager() {}; CUDAGraphManager(cudaStream_t stream); ~CUDAGraphManager(); diff --git a/onnxruntime/core/providers/cuda/cuda_profiler.h b/onnxruntime/core/providers/cuda/cuda_profiler.h index 88c9adc5e17b3..4930e55351615 100644 --- a/onnxruntime/core/providers/cuda/cuda_profiler.h +++ b/onnxruntime/core/providers/cuda/cuda_profiler.h @@ -33,7 +33,7 @@ class CudaProfiler final : public EpProfiler { ORT_DISALLOW_COPY_ASSIGNMENT_AND_MOVE(CudaProfiler); ~CudaProfiler() {} bool StartProfiling(TimePoint) override { return true; } - void EndProfiling(TimePoint, Events&) override{}; + void EndProfiling(TimePoint, Events&) override {}; void Start(uint64_t) override{}; void Stop(uint64_t) override{}; }; diff --git a/onnxruntime/core/providers/cuda/nn/conv_transpose.h b/onnxruntime/core/providers/cuda/nn/conv_transpose.h index 77c9d94162b6b..71ad3ee6e2147 100644 --- a/onnxruntime/core/providers/cuda/nn/conv_transpose.h +++ b/onnxruntime/core/providers/cuda/nn/conv_transpose.h @@ -18,7 +18,7 @@ namespace cuda { template class ConvTranspose : public CudaKernel { public: - ConvTranspose(const OpKernelInfo& info) : CudaKernel(info), conv_transpose_attrs_(info){}; + ConvTranspose(const OpKernelInfo& info) : CudaKernel(info), conv_transpose_attrs_(info) {}; Status 
PrePack(const Tensor& tensor, int input_idx, AllocatorPtr alloc, bool& is_packed, [[maybe_unused]] PrePackedWeights* prepacked_weights) override; Status ComputeInternal(OpKernelContext* context) const override; diff --git a/onnxruntime/core/providers/cuda/nvtx_profile.h b/onnxruntime/core/providers/cuda/nvtx_profile.h index f98745cbfc5c2..e545578a72fc4 100644 --- a/onnxruntime/core/providers/cuda/nvtx_profile.h +++ b/onnxruntime/core/providers/cuda/nvtx_profile.h @@ -45,7 +45,7 @@ enum class Color : uint32_t { class RangeCreatorBase { public: RangeCreatorBase(const std::string message, const Color color) - : message_(message), color_(color), is_begin_called_(false), is_end_called_(false){}; + : message_(message), color_(color), is_begin_called_(false), is_end_called_(false) {}; // Check if Begin and End are both called. // It's pointless if not all of them are called. @@ -100,7 +100,7 @@ class RangeCreatorBase { class NvtxRangeCreator final : public RangeCreatorBase { public: NvtxRangeCreator(const std::string message, const Color color) - : RangeCreatorBase(message, color){}; + : RangeCreatorBase(message, color) {}; void BeginImpl() override; void EndImpl() override; @@ -114,7 +114,7 @@ class NvtxRangeCreator final : public RangeCreatorBase { class NvtxNestedRangeCreator final : public RangeCreatorBase { public: NvtxNestedRangeCreator(const std::string message, const Color color) - : RangeCreatorBase(message, color){}; + : RangeCreatorBase(message, color) {}; void BeginImpl() override; void EndImpl() override; @@ -123,7 +123,7 @@ class NvtxNestedRangeCreator final : public RangeCreatorBase { class NvtxMarkerCreator final { public: NvtxMarkerCreator(const std::string message, const Color color) - : message_(message), color_(color){}; + : message_(message), color_(color) {}; void Mark(); private: diff --git a/onnxruntime/core/providers/cuda/shared_inc/cuda_utils.h b/onnxruntime/core/providers/cuda/shared_inc/cuda_utils.h index 1f7df9b6fc2e3..ed642754af3ba 100644 --- a/onnxruntime/core/providers/cuda/shared_inc/cuda_utils.h +++ b/onnxruntime/core/providers/cuda/shared_inc/cuda_utils.h @@ -35,7 +35,7 @@ enum class BroadcastIndexType : int32_t { template class IConstantBuffer { public: - virtual ~IConstantBuffer(){}; + virtual ~IConstantBuffer() {}; virtual const T* GetBuffer(cudaStream_t stream, size_t count) = 0; }; diff --git a/onnxruntime/core/providers/cuda/tensor/cast_op.cc b/onnxruntime/core/providers/cuda/tensor/cast_op.cc index 8e5a68e2a278e..821695bbbd42f 100644 --- a/onnxruntime/core/providers/cuda/tensor/cast_op.cc +++ b/onnxruntime/core/providers/cuda/tensor/cast_op.cc @@ -13,23 +13,23 @@ const std::vector& CastOpTypeConstraints() { // Must be done as a local static for a shared provider, to avoid the prefast warning: // Global initializer calls a non-constexpr function 'onnxruntime::DataTypeImpl::GetTensorType' // In a shared provider, GetTensorType is a function call into Onnxruntime and isn't constexpr - static std::vector types { - DataTypeImpl::GetTensorType(), - DataTypeImpl::GetTensorType(), - DataTypeImpl::GetTensorType(), - DataTypeImpl::GetTensorType(), - DataTypeImpl::GetTensorType(), - DataTypeImpl::GetTensorType(), - DataTypeImpl::GetTensorType(), - DataTypeImpl::GetTensorType(), - DataTypeImpl::GetTensorType(), - DataTypeImpl::GetTensorType(), - DataTypeImpl::GetTensorType(), - DataTypeImpl::GetTensorType(), - DataTypeImpl::GetTensorType() + static std::vector types{ + DataTypeImpl::GetTensorType(), + DataTypeImpl::GetTensorType(), + DataTypeImpl::GetTensorType(), + 
DataTypeImpl::GetTensorType(), + DataTypeImpl::GetTensorType(), + DataTypeImpl::GetTensorType(), + DataTypeImpl::GetTensorType(), + DataTypeImpl::GetTensorType(), + DataTypeImpl::GetTensorType(), + DataTypeImpl::GetTensorType(), + DataTypeImpl::GetTensorType(), + DataTypeImpl::GetTensorType(), + DataTypeImpl::GetTensorType() #if !defined(DISABLE_FLOAT8_TYPES) - , - DataTypeImpl::GetTensorType(), DataTypeImpl::GetTensorType() + , + DataTypeImpl::GetTensorType(), DataTypeImpl::GetTensorType() #endif }; return types; diff --git a/onnxruntime/core/providers/dnnl/dnnl_node_capability.h b/onnxruntime/core/providers/dnnl/dnnl_node_capability.h index 3ed3705f6d81b..f67b70616547c 100644 --- a/onnxruntime/core/providers/dnnl/dnnl_node_capability.h +++ b/onnxruntime/core/providers/dnnl/dnnl_node_capability.h @@ -42,7 +42,7 @@ enum ORT_DataType : int { */ class DnnlNodeCapability { public: - virtual ~DnnlNodeCapability(){}; + virtual ~DnnlNodeCapability() {}; /** * virtual function expected to be implemented for different node * types. diff --git a/onnxruntime/core/providers/dnnl/subgraph/dnnl_subgraph.h b/onnxruntime/core/providers/dnnl/subgraph/dnnl_subgraph.h index ceac2a6f58b32..add9f440df91f 100644 --- a/onnxruntime/core/providers/dnnl/subgraph/dnnl_subgraph.h +++ b/onnxruntime/core/providers/dnnl/subgraph/dnnl_subgraph.h @@ -18,7 +18,7 @@ class DnnlNode; class DnnlNodeArg { public: DnnlNodeArg(DnnlNode* node, size_t index, bool is_output) - : node_(node), index_(index), is_output_(is_output){}; + : node_(node), index_(index), is_output_(is_output) {}; DnnlNodeArg() = default; DnnlNode* GetNode() { return node_; }; size_t GetIndex() { return index_; }; diff --git a/onnxruntime/core/providers/js/allocator.h b/onnxruntime/core/providers/js/allocator.h index 6aa8313c01f38..267015b2ea58d 100644 --- a/onnxruntime/core/providers/js/allocator.h +++ b/onnxruntime/core/providers/js/allocator.h @@ -15,7 +15,7 @@ class JsCPUAllocator : public CPUAllocator { : CPUAllocator( OrtMemoryInfo("JsCPUAllocator", OrtAllocatorType::OrtDeviceAllocator, OrtDevice(OrtDevice::CPU, OrtDevice::MemType::DEFAULT, 0), - 0, OrtMemTypeCPU)){}; + 0, OrtMemTypeCPU)) {}; }; class JsCustomAllocator : public IAllocator { diff --git a/onnxruntime/core/providers/js/data_transfer.h b/onnxruntime/core/providers/js/data_transfer.h index 3dfb19cfde5ac..6a0e8586776a2 100644 --- a/onnxruntime/core/providers/js/data_transfer.h +++ b/onnxruntime/core/providers/js/data_transfer.h @@ -11,8 +11,8 @@ namespace js { class DataTransfer : public IDataTransfer { public: - DataTransfer(){}; - ~DataTransfer(){}; + DataTransfer() {}; + ~DataTransfer() {}; bool CanCopy(const OrtDevice& src_device, const OrtDevice& dst_device) const override; diff --git a/onnxruntime/core/providers/qnn/builder/opbuilder/expand_op_builder.cc b/onnxruntime/core/providers/qnn/builder/opbuilder/expand_op_builder.cc index 9e31cf9cae21a..d0f6ce9effd9e 100644 --- a/onnxruntime/core/providers/qnn/builder/opbuilder/expand_op_builder.cc +++ b/onnxruntime/core/providers/qnn/builder/opbuilder/expand_op_builder.cc @@ -125,7 +125,7 @@ Status ExpandOpBuilder::ProcessInputs(QnnModelWrapper& qnn_model_wrapper, default: return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "Type not supported."); } // switch - } // if-else + } // if-else const std::string& output_name = node_unit.Outputs()[0].node_arg.Name(); std::string shape_input_name(input_name + "_" + output_name); diff --git a/onnxruntime/core/providers/qnn/builder/opbuilder/pad_op_builder.cc 
b/onnxruntime/core/providers/qnn/builder/opbuilder/pad_op_builder.cc index b7455314578de..5fc6d42a8a179 100644 --- a/onnxruntime/core/providers/qnn/builder/opbuilder/pad_op_builder.cc +++ b/onnxruntime/core/providers/qnn/builder/opbuilder/pad_op_builder.cc @@ -163,7 +163,7 @@ Status ProcessConstantValue(QnnModelWrapper& qnn_model_wrapper, default: return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "Type not supported."); } // switch - } // if-else + } // if-else QnnParamWrapper constant_value_param(node_unit.Index(), node_unit.Name(), diff --git a/onnxruntime/core/providers/qnn/builder/qnn_quant_params_wrapper.cc b/onnxruntime/core/providers/qnn/builder/qnn_quant_params_wrapper.cc index da2d517f65697..5fc4fb3db4122 100644 --- a/onnxruntime/core/providers/qnn/builder/qnn_quant_params_wrapper.cc +++ b/onnxruntime/core/providers/qnn/builder/qnn_quant_params_wrapper.cc @@ -10,7 +10,7 @@ #include "core/providers/qnn/builder/qnn_model_wrapper.h" #define ALIGN_PTR_UP(ptr, align, type) \ - reinterpret_cast((reinterpret_cast(ptr) + (align)-1) & ~((align)-1)) + reinterpret_cast((reinterpret_cast(ptr) + (align) - 1) & ~((align) - 1)) namespace onnxruntime { namespace qnn { diff --git a/onnxruntime/core/providers/rocm/rocm_profiler.h b/onnxruntime/core/providers/rocm/rocm_profiler.h index 070cca570f481..d5c7e3f273565 100644 --- a/onnxruntime/core/providers/rocm/rocm_profiler.h +++ b/onnxruntime/core/providers/rocm/rocm_profiler.h @@ -34,7 +34,7 @@ class RocmProfiler final : public EpProfiler { ORT_DISALLOW_COPY_ASSIGNMENT_AND_MOVE(RocmProfiler); ~RocmProfiler() {} bool StartProfiling(TimePoint) override { return true; } - void EndProfiling(TimePoint, Events&) override{}; + void EndProfiling(TimePoint, Events&) override {}; void Start(uint64_t) override{}; void Stop(uint64_t) override{}; }; diff --git a/onnxruntime/core/providers/shared_library/provider_host_api.h b/onnxruntime/core/providers/shared_library/provider_host_api.h index 43d661344d787..e25426b5124dd 100644 --- a/onnxruntime/core/providers/shared_library/provider_host_api.h +++ b/onnxruntime/core/providers/shared_library/provider_host_api.h @@ -24,10 +24,10 @@ struct Provider { virtual ProviderOptions GetProviderOptions(const void* /*provider options struct*/) { return {}; } // Update provider options from key-value string configuration - virtual void UpdateProviderOptions(void* /*provider options to be configured*/, const ProviderOptions& /*key-value string provider options*/){}; + virtual void UpdateProviderOptions(void* /*provider options to be configured*/, const ProviderOptions& /*key-value string provider options*/) {}; // Get provider specific custom op domain list. Provider has the resposibility to release OrtCustomOpDomain instances it creates. 
- virtual void GetCustomOpDomainList(IExecutionProviderFactory* /*pointer to factory instance*/, std::vector& /*provider custom op domain list*/){}; + virtual void GetCustomOpDomainList(IExecutionProviderFactory* /*pointer to factory instance*/, std::vector& /*provider custom op domain list*/) {}; virtual void Initialize() = 0; // Called right after loading the shared library, if this throws any errors Shutdown() will be called and the library unloaded virtual void Shutdown() = 0; // Called right before unloading the shared library diff --git a/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider_custom_ops.h b/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider_custom_ops.h index 54212d34aa2ce..a72de6ed75399 100644 --- a/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider_custom_ops.h +++ b/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider_custom_ops.h @@ -24,8 +24,8 @@ struct TensorRTCustomKernel { : compute_stream_(compute_stream) { } - void Compute(OrtKernelContext* /*context*/){ - // The implementation is in TensorRT plugin. No need to implement it here. + void Compute(OrtKernelContext* /*context*/) { + // The implementation is in TensorRT plugin. No need to implement it here. }; private: diff --git a/onnxruntime/core/providers/vitisai/vitisai_provider_factory.cc b/onnxruntime/core/providers/vitisai/vitisai_provider_factory.cc index dc34419ef936f..453db30e1320f 100644 --- a/onnxruntime/core/providers/vitisai/vitisai_provider_factory.cc +++ b/onnxruntime/core/providers/vitisai/vitisai_provider_factory.cc @@ -46,7 +46,7 @@ struct VitisAI_Provider : Provider { } }; // Get provider specific custom op domain list. Provider has the resposibility to release OrtCustomOpDomain instances it creates. - void GetCustomOpDomainList(IExecutionProviderFactory*, std::vector&) override{}; + void GetCustomOpDomainList(IExecutionProviderFactory*, std::vector&) override {}; // Called right after loading the shared library, if this throws any errors Shutdown() will be called and the library unloaded void Initialize() override { initialize_vitisai_ep(); } // Called right before unloading the shared library diff --git a/onnxruntime/core/providers/vsinpu/builders/impl/elementwise_op_builder.h b/onnxruntime/core/providers/vsinpu/builders/impl/elementwise_op_builder.h index df2e429f58b2f..4c10ba01b1c2e 100644 --- a/onnxruntime/core/providers/vsinpu/builders/impl/elementwise_op_builder.h +++ b/onnxruntime/core/providers/vsinpu/builders/impl/elementwise_op_builder.h @@ -47,7 +47,7 @@ namespace npu { std::vector>& outputs, \ const NodeUnit& node_unit) override { \ LOGS_DEFAULT(INFO) << "Creating " << #onnx_op_type << " Op"; \ - auto op = graph_ep->GetGraph() -> CreateOperation(); \ + auto op = graph_ep->GetGraph()->CreateOperation(); \ (*op).BindInputs(inputs).BindOutputs(outputs); \ return true; \ ; \ diff --git a/onnxruntime/core/providers/vsinpu/builders/op_builder_factory.h b/onnxruntime/core/providers/vsinpu/builders/op_builder_factory.h index 27c148c1672c5..dc0969429b8ff 100644 --- a/onnxruntime/core/providers/vsinpu/builders/op_builder_factory.h +++ b/onnxruntime/core/providers/vsinpu/builders/op_builder_factory.h @@ -60,10 +60,9 @@ using createIOpBuildItemFunc = std::function()>; using OpBuildItemType = std::map>; static const std::map reg = { -#define REGISTER_OP_BUILDER(ONNX_NODE_TYPE, BUILDER_TYPE) \ - { \ - ONNX_NODE_TYPE, [] { return std::make_unique(); } \ - } +#define REGISTER_OP_BUILDER(ONNX_NODE_TYPE, BUILDER_TYPE) \ + { \ + ONNX_NODE_TYPE, [] { 
return std::make_unique(); }} REGISTER_OP_BUILDER("Add", AddOpBuilder), REGISTER_OP_BUILDER("Sub", SubOpBuilder), diff --git a/onnxruntime/python/onnxruntime_pybind_iobinding.cc b/onnxruntime/python/onnxruntime_pybind_iobinding.cc index 51a52dbfcb3bc..37081cd0ff2b4 100644 --- a/onnxruntime/python/onnxruntime_pybind_iobinding.cc +++ b/onnxruntime/python/onnxruntime_pybind_iobinding.cc @@ -155,11 +155,7 @@ void addIoBindingMethods(pybind11::module& m) { .def("clear_binding_outputs", [](SessionIOBinding* io_binding) -> void { io_binding->Get()->ClearOutputs(); }) - .def( - "get_outputs", [](const SessionIOBinding* io_binding) -> const std::vector& { - return io_binding->Get()->GetOutputs(); - }, - py::return_value_policy::reference_internal) + .def("get_outputs", [](const SessionIOBinding* io_binding) -> const std::vector& { return io_binding->Get()->GetOutputs(); }, py::return_value_policy::reference_internal) .def("copy_outputs_to_cpu", [](const SessionIOBinding* io_binding) -> py::list { const std::vector& outputs = io_binding->Get()->GetOutputs(); @@ -180,8 +176,7 @@ void addIoBindingMethods(pybind11::module& m) { } ++pos; } - return result; - }); + return result; }); } } // namespace python diff --git a/onnxruntime/python/onnxruntime_pybind_ortvalue.cc b/onnxruntime/python/onnxruntime_pybind_ortvalue.cc index 94235b3043bc7..d76b9032afe73 100644 --- a/onnxruntime/python/onnxruntime_pybind_ortvalue.cc +++ b/onnxruntime/python/onnxruntime_pybind_ortvalue.cc @@ -226,7 +226,7 @@ void addOrtValueMethods(pybind11::module& m) { ORT_THROW("Only OrtValues that are Tensors/SparseTensors are currently supported"); #else - ORT_THROW("Only OrtValues that are Tensors are supported in this build"); + ORT_THROW("Only OrtValues that are Tensors are supported in this build"); #endif }) .def("shape", [](const OrtValue* ort_value) -> py::list { @@ -275,26 +275,15 @@ void addOrtValueMethods(pybind11::module& m) { return *ONNX_NAMESPACE::Utils::DataTypeUtils::ToType(*type_proto); }) - .def( - "element_type", [](const OrtValue* ort_value) -> int32_t { - return GetTensorProtoType(*ort_value); - }, - "Returns an integer equal to the ONNX tensor proto type of the tensor or sequence. " - "This integer is one type defined by ONNX TensorProto_DataType " - "(such as onnx.TensorProto.FLOAT)." - "Raises an exception in any other case.") - .def("has_value", [](const OrtValue* ort_value) -> bool { - return ort_value->IsAllocated(); - }) - .def("is_tensor", [](const OrtValue* ort_value) -> bool { - return ort_value->IsTensor(); - }) - .def("is_sparse_tensor", [](const OrtValue* ort_value) -> bool { - return ort_value->IsSparseTensor(); - }) - .def("is_tensor_sequence", [](const OrtValue* ort_value) -> bool { - return ort_value->IsTensorSequence(); - }) + .def("element_type", [](const OrtValue* ort_value) -> int32_t { return GetTensorProtoType(*ort_value); }, + "Returns an integer equal to the ONNX tensor proto type of the tensor or sequence. " + "This integer is one type defined by ONNX TensorProto_DataType " + "(such as onnx.TensorProto.FLOAT)." 
+ "Raises an exception in any other case.") + .def("has_value", [](const OrtValue* ort_value) -> bool { return ort_value->IsAllocated(); }) + .def("is_tensor", [](const OrtValue* ort_value) -> bool { return ort_value->IsTensor(); }) + .def("is_sparse_tensor", [](const OrtValue* ort_value) -> bool { return ort_value->IsSparseTensor(); }) + .def("is_tensor_sequence", [](const OrtValue* ort_value) -> bool { return ort_value->IsTensorSequence(); }) // Converts Tensor into a numpy array .def("numpy", [](const OrtValue* ml_value) -> py::object { ORT_ENFORCE(ml_value->IsTensor(), "Only OrtValues that are Tensors are convertible to Numpy objects"); @@ -310,37 +299,22 @@ void addOrtValueMethods(pybind11::module& m) { #else py::object obj = GetPyObjFromTensor(*ml_value, nullptr, nullptr); #endif - return obj; - }) + return obj; }) #ifdef ENABLE_TRAINING - .def( - "to_dlpack", [](OrtValue* ort_value) -> py::object { - return py::reinterpret_steal(ToDlpack(*ort_value)); - }, - "Returns a DLPack representing the tensor. This method does not copy the pointer shape, " - "instead, it copies the pointer value. The OrtValue must be persist until the dlpack structure " - "is consumed.") - .def_static( - "from_dlpack", [](py::object data, bool is_bool_tensor) { - return FromDlpack(data.ptr(), is_bool_tensor); - }, - py::arg("data"), py::arg("is_bool_tensor") = false, "Converts a tensor from a external library into an OrtValue by means of the __dlpack__ protocol.") - .def( - "__dlpack__", [](OrtValue* ort_value, py::object /* stream */) -> py::object { - return py::reinterpret_steal(ToDlpack(*ort_value)); - }, - py::arg("stream") = py::none(), - "Returns a DLPack representing the tensor (part of __dlpack__ protocol). " - "This method does not copy the pointer shape, instead, it copies the pointer value. " - "The OrtValue must persist until the dlpack structure is consumed.") - .def( - "__dlpack_device__", [](const OrtValue* ort_value) -> py::tuple { + .def("to_dlpack", [](OrtValue* ort_value) -> py::object { return py::reinterpret_steal(ToDlpack(*ort_value)); }, + "Returns a DLPack representing the tensor. This method does not copy the pointer shape, " + "instead, it copies the pointer value. The OrtValue must be persist until the dlpack structure " + "is consumed.") + .def_static("from_dlpack", [](py::object data, bool is_bool_tensor) { return FromDlpack(data.ptr(), is_bool_tensor); }, py::arg("data"), py::arg("is_bool_tensor") = false, "Converts a tensor from a external library into an OrtValue by means of the __dlpack__ protocol.") + .def("__dlpack__", [](OrtValue* ort_value, py::object /* stream */) -> py::object { return py::reinterpret_steal(ToDlpack(*ort_value)); }, py::arg("stream") = py::none(), + "Returns a DLPack representing the tensor (part of __dlpack__ protocol). " + "This method does not copy the pointer shape, instead, it copies the pointer value. 
" + "The OrtValue must persist until the dlpack structure is consumed.") + .def("__dlpack_device__", [](const OrtValue* ort_value) -> py::tuple { ORT_ENFORCE(ort_value->IsTensor(), "Only tensor type OrtValues are supported"); const onnxruntime::Tensor& tensor = ort_value->Get(); DLDevice device = onnxruntime::dlpack::GetDlpackDevice(*ort_value, tensor.Location().device.Id()); - return py::make_tuple(static_cast(device.device_type), device.device_id); - }, - "Returns a tuple of integers, (device, device index) (part of __dlpack__ protocol).") + return py::make_tuple(static_cast(device.device_type), device.device_id); }, "Returns a tuple of integers, (device, device index) (part of __dlpack__ protocol).") #endif ; @@ -350,13 +324,8 @@ void addOrtValueMethods(pybind11::module& m) { v->push_back(ortvalue); }) #ifdef ENABLE_TRAINING - .def( - "push_back", [](std::vector* v, py::object dlpack_tensor, const bool is_bool_tensor) { - v->push_back(FromDlpack(dlpack_tensor.ptr(), is_bool_tensor)); - }, - "Add a new OrtValue after being ownership was transferred from the DLPack structure.", py::arg("dlpack_tensor"), py::arg("is_bool_tensor") = false) - .def( - "push_back_batch", [](std::vector* v, std::vector& torch_tensors, std::vector& data_ptrs, std::vector& element_types, const std::vector>& shapes, const std::vector& devices) { + .def("push_back", [](std::vector* v, py::object dlpack_tensor, const bool is_bool_tensor) { v->push_back(FromDlpack(dlpack_tensor.ptr(), is_bool_tensor)); }, "Add a new OrtValue after being ownership was transferred from the DLPack structure.", py::arg("dlpack_tensor"), py::arg("is_bool_tensor") = false) + .def("push_back_batch", [](std::vector* v, std::vector& torch_tensors, std::vector& data_ptrs, std::vector& element_types, const std::vector>& shapes, const std::vector& devices) { for (size_t i = 0; i < torch_tensors.size(); ++i) { py::object& element_type = element_types.at(i); const std::vector& shape = shapes.at(i); @@ -377,52 +346,36 @@ void addOrtValueMethods(pybind11::module& m) { OrtValue ml_value; Tensor::InitOrtValue(ml_type, gsl::make_span(shape), reinterpret_cast(data_ptr), info, ml_value); v->push_back(ml_value); - } - }, - "Add a batch of OrtValue's by wrapping PyTorch tensors.") + } }, "Add a batch of OrtValue's by wrapping PyTorch tensors.") #endif .def("reserve", [](std::vector* v, const size_t len) { v->reserve(len); }) .def("shrink_to_fit", [](std::vector* v) { v->shrink_to_fit(); }) .def("__len__", [](const std::vector& v) { return v.size(); }) - .def( - "__iter__", [](const std::vector& v) { - return py::make_iterator(v.cbegin(), v.cend()); - }, - py::keep_alive<0, 1>()) - .def("__getitem__", [](const std::vector& v, const size_t idx) { - return v.at(idx); - }) - .def( - "bool_tensor_indices", [](std::vector* v) -> std::vector { + .def("__iter__", [](const std::vector& v) { return py::make_iterator(v.cbegin(), v.cend()); }, py::keep_alive<0, 1>()) + .def("__getitem__", [](const std::vector& v, const size_t idx) { return v.at(idx); }) + .def("bool_tensor_indices", [](std::vector* v) -> std::vector { std::vector indices; for (size_t i = 0; i < v->size(); ++i) { if (GetTensorProtoType((*v)[i]) == ONNX_NAMESPACE::TensorProto_DataType_BOOL) { indices.push_back(static_cast(i)); } } - return indices; - }, - "Returns the indices of every boolean tensor in this vector of OrtValue. " - "In case of a boolean tensor, method to_dlpacks returns a uint8 tensor instead of a boolean tensor. 
" - "If torch consumes the dlpack structure, `.to(torch.bool)` must be applied to the torch tensor " - "to get a boolean tensor.") + return indices; }, + "Returns the indices of every boolean tensor in this vector of OrtValue. " + "In case of a boolean tensor, method to_dlpacks returns a uint8 tensor instead of a boolean tensor. " + "If torch consumes the dlpack structure, `.to(torch.bool)` must be applied to the torch tensor " + "to get a boolean tensor.") #ifdef ENABLE_TRAINING - .def("dlpack_at", [](std::vector* v, const size_t idx) { - return py::reinterpret_steal(ToDlpack(v->at(idx))); - }) + .def("dlpack_at", [](std::vector* v, const size_t idx) { return py::reinterpret_steal(ToDlpack(v->at(idx))); }) #endif - .def( - "element_type_at", [](std::vector* v, const size_t idx) -> int32_t { - return GetTensorProtoType(v->at(idx)); - }, - "Returns an integer equal to the ONNX proto type of the tensor at position i. " - "This integer is one type defined by ONNX TensorProto_DataType " - "(such as onnx.TensorProto.FLOAT)." - "Raises an exception in any other case.", - py::arg("idx")) + .def("element_type_at", [](std::vector* v, const size_t idx) -> int32_t { return GetTensorProtoType(v->at(idx)); }, + "Returns an integer equal to the ONNX proto type of the tensor at position i. " + "This integer is one type defined by ONNX TensorProto_DataType " + "(such as onnx.TensorProto.FLOAT)." + "Raises an exception in any other case.", + py::arg("idx")) #ifdef ENABLE_TRAINING - .def( - "to_dlpacks", [](const std::vector& v, py::object to_tensor) -> py::list { + .def("to_dlpacks", [](const std::vector& v, py::object to_tensor) -> py::list { if (v.size() == 0) return py::list(); @@ -469,9 +422,8 @@ void addOrtValueMethods(pybind11::module& m) { Py_DECREF(capsule); } } - return list_dlpacks; - }, - R"pbdoc(Converts all OrtValue into tensors through DLPack protocol, the method creates + return list_dlpacks; }, + R"pbdoc(Converts all OrtValue into tensors through DLPack protocol, the method creates a DLPack structure for every tensors, then calls python function `to_tensor` to a new object consuming the DLPack structure or return a list of capsule if this function is None. @@ -488,7 +440,7 @@ It creates many tensors acquiring ownership of existing OrtValue. This method saves one object creation and an C++ allocation for every transferred tensor. 
)pbdoc", - py::arg("to_tensor")) + py::arg("to_tensor")) #endif ; diff --git a/onnxruntime/python/onnxruntime_pybind_sparse_tensor.cc b/onnxruntime/python/onnxruntime_pybind_sparse_tensor.cc index 7dcead113ac4f..1154f3b9f88b8 100644 --- a/onnxruntime/python/onnxruntime_pybind_sparse_tensor.cc +++ b/onnxruntime/python/onnxruntime_pybind_sparse_tensor.cc @@ -397,8 +397,7 @@ void addSparseTensorMethods(pybind11::module& m) { // pybind apparently has a bug with returning enums from def_property_readonly or methods // returning a method object instead of the enumeration value // so we are using def_property and throw on a potential modification - .def_property( - "format", [](const PySparseTensor* py_tensor) -> OrtSparseFormat { + .def_property("format", [](const PySparseTensor* py_tensor) -> OrtSparseFormat { const SparseTensor& tensor = py_tensor->Instance(); auto retval = OrtSparseFormat::ORT_SPARSE_UNDEFINED; switch (tensor.Format()) { diff --git a/onnxruntime/python/onnxruntime_pybind_state.cc b/onnxruntime/python/onnxruntime_pybind_state.cc index 6b5daf8cb882b..679ccce7fb07a 100644 --- a/onnxruntime/python/onnxruntime_pybind_state.cc +++ b/onnxruntime/python/onnxruntime_pybind_state.cc @@ -1425,7 +1425,7 @@ void addGlobalMethods(py::module& m) { ORT_UNUSED_PARAMETER(algo); ORT_THROW("set_cudnn_conv_algo_search is not supported in ROCM"); #else - cudnn_conv_algo_search = algo; + cudnn_conv_algo_search = algo; #endif }); // TODO remove deprecated global config @@ -1436,7 +1436,7 @@ void addGlobalMethods(py::module& m) { ORT_UNUSED_PARAMETER(use_single_stream); ORT_THROW("set_do_copy_in_default_stream is not supported in ROCM"); #else - do_copy_in_default_stream = use_single_stream; + do_copy_in_default_stream = use_single_stream; #endif }); // TODO remove deprecated global config @@ -1801,10 +1801,10 @@ Applies to session load, initialization, etc. 
Default is 0.)pbdoc") } ORT_THROW_IF_ERROR(options->value.AddExternalInitializers(names_ptrs, values_ptrs)); #else - ORT_UNUSED_PARAMETER(options); - ORT_UNUSED_PARAMETER(names); - ORT_UNUSED_PARAMETER(ort_values); - ORT_THROW("External initializers are not supported in this build."); + ORT_UNUSED_PARAMETER(options); + ORT_UNUSED_PARAMETER(names); + ORT_UNUSED_PARAMETER(ort_values); + ORT_THROW("External initializers are not supported in this build."); #endif }); @@ -1866,8 +1866,7 @@ including arg name, arg type (contains both type and shape).)pbdoc") return *(na.Type()); }, "node type") - .def( - "__str__", [](const onnxruntime::NodeArg& na) -> std::string { + .def("__str__", [](const onnxruntime::NodeArg& na) -> std::string { std::ostringstream res; res << "NodeArg(name='" << na.Name() << "', type='" << *(na.Type()) << "', shape="; auto shape = na.Shape(); @@ -1893,11 +1892,8 @@ including arg name, arg type (contains both type and shape).)pbdoc") } res << ")"; - return std::string(res.str()); - }, - "converts the node into a readable string") - .def_property_readonly( - "shape", [](const onnxruntime::NodeArg& na) -> std::vector { + return std::string(res.str()); }, "converts the node into a readable string") + .def_property_readonly("shape", [](const onnxruntime::NodeArg& na) -> std::vector { auto shape = na.Shape(); std::vector arr; if (shape == nullptr || shape->dim_size() == 0) { @@ -1914,9 +1910,7 @@ including arg name, arg type (contains both type and shape).)pbdoc") arr[i] = py::none(); } } - return arr; - }, - "node shape (assuming the node holds a tensor)"); + return arr; }, "node shape (assuming the node holds a tensor)"); py::class_ sessionObjectInitializer(m, "SessionObjectInitializer"); py::class_(m, "InferenceSession", R"pbdoc(This is the main class used to run a model.)pbdoc") @@ -2107,51 +2101,28 @@ including arg name, arg type (contains both type and shape).)pbdoc") .def_property_readonly("get_profiling_start_time_ns", [](const PyInferenceSession* sess) -> uint64_t { return sess->GetSessionHandle()->GetProfiling().GetStartTimeNs(); }) - .def( - "get_providers", [](const PyInferenceSession* sess) -> const std::vector& { - return sess->GetSessionHandle()->GetRegisteredProviderTypes(); - }, - py::return_value_policy::reference_internal) - .def( - "get_provider_options", [](const PyInferenceSession* sess) -> const ProviderOptionsMap& { - return sess->GetSessionHandle()->GetAllProviderOptions(); - }, - py::return_value_policy::reference_internal) - .def_property_readonly( - "session_options", [](const PyInferenceSession* sess) -> PySessionOptions* { + .def("get_providers", [](const PyInferenceSession* sess) -> const std::vector& { return sess->GetSessionHandle()->GetRegisteredProviderTypes(); }, py::return_value_policy::reference_internal) + .def("get_provider_options", [](const PyInferenceSession* sess) -> const ProviderOptionsMap& { return sess->GetSessionHandle()->GetAllProviderOptions(); }, py::return_value_policy::reference_internal) + .def_property_readonly("session_options", [](const PyInferenceSession* sess) -> PySessionOptions* { auto session_options = std::make_unique(); session_options->value = sess->GetSessionHandle()->GetSessionOptions(); - return session_options.release(); - }, - py::return_value_policy::take_ownership) - .def_property_readonly( - "inputs_meta", [](const PyInferenceSession* sess) -> const std::vector& { + return session_options.release(); }, py::return_value_policy::take_ownership) + .def_property_readonly("inputs_meta", [](const 
PyInferenceSession* sess) -> const std::vector& { auto res = sess->GetSessionHandle()->GetModelInputs(); OrtPybindThrowIfError(res.first); - return *(res.second); - }, - py::return_value_policy::reference_internal) - .def_property_readonly( - "outputs_meta", [](const PyInferenceSession* sess) -> const std::vector& { + return *(res.second); }, py::return_value_policy::reference_internal) + .def_property_readonly("outputs_meta", [](const PyInferenceSession* sess) -> const std::vector& { auto res = sess->GetSessionHandle()->GetModelOutputs(); OrtPybindThrowIfError(res.first); - return *(res.second); - }, - py::return_value_policy::reference_internal) - .def_property_readonly( - "overridable_initializers", [](const PyInferenceSession* sess) -> const std::vector& { + return *(res.second); }, py::return_value_policy::reference_internal) + .def_property_readonly("overridable_initializers", [](const PyInferenceSession* sess) -> const std::vector& { auto res = sess->GetSessionHandle()->GetOverridableInitializers(); OrtPybindThrowIfError(res.first); - return *(res.second); - }, - py::return_value_policy::reference_internal) - .def_property_readonly( - "model_meta", [](const PyInferenceSession* sess) -> const onnxruntime::ModelMetadata& { + return *(res.second); }, py::return_value_policy::reference_internal) + .def_property_readonly("model_meta", [](const PyInferenceSession* sess) -> const onnxruntime::ModelMetadata& { auto res = sess->GetSessionHandle()->GetModelMetadata(); OrtPybindThrowIfError(res.first); - return *(res.second); - }, - py::return_value_policy::reference_internal) + return *(res.second); }, py::return_value_policy::reference_internal) .def("run_with_iobinding", [](PyInferenceSession* sess, SessionIOBinding& io_binding, RunOptions* run_options = nullptr) -> void { Status status; // release GIL to allow multiple python threads to invoke Run() in parallel. 
@@ -2161,8 +2132,7 @@ including arg name, arg type (contains both type and shape).)pbdoc") else status = sess->GetSessionHandle()->Run(*run_options, *io_binding.Get()); if (!status.IsOK()) - throw std::runtime_error("Error in execution: " + status.ErrorMessage()); - }) + throw std::runtime_error("Error in execution: " + status.ErrorMessage()); }) .def("get_tuning_results", [](PyInferenceSession* sess) -> py::list { #if !defined(ORT_MINIMAL_BUILD) auto results = sess->GetSessionHandle()->GetTuningResults(); @@ -2177,8 +2147,8 @@ including arg name, arg type (contains both type and shape).)pbdoc") return ret; #else - ORT_UNUSED_PARAMETER(sess); - ORT_THROW("TunableOp and get_tuning_results are not supported in this build."); + ORT_UNUSED_PARAMETER(sess); + ORT_THROW("TunableOp and get_tuning_results are not supported in this build."); #endif }) .def("set_tuning_results", [](PyInferenceSession* sess, py::list results, bool error_on_invalid) -> void { @@ -2209,10 +2179,10 @@ including arg name, arg type (contains both type and shape).)pbdoc") throw std::runtime_error("Error in execution: " + status.ErrorMessage()); } #else - ORT_UNUSED_PARAMETER(sess); - ORT_UNUSED_PARAMETER(results); - ORT_UNUSED_PARAMETER(error_on_invalid); - ORT_THROW("TunableOp and set_tuning_results are not supported in this build."); + ORT_UNUSED_PARAMETER(sess); + ORT_UNUSED_PARAMETER(results); + ORT_UNUSED_PARAMETER(error_on_invalid); + ORT_THROW("TunableOp and set_tuning_results are not supported in this build."); #endif }); diff --git a/onnxruntime/python/onnxruntime_validation.py b/onnxruntime/python/onnxruntime_validation.py index 10d9f469863c4..81e6461e4417f 100644 --- a/onnxruntime/python/onnxruntime_validation.py +++ b/onnxruntime/python/onnxruntime_validation.py @@ -24,8 +24,7 @@ def check_distro_info(): if __my_distro_ver__ not in ["10", "11"]: warnings.warn( - "Unsupported Windows version (%s). ONNX Runtime supports Windows 10 and above, only." - % __my_distro_ver__ + f"Unsupported Windows version ({__my_distro_ver__}). ONNX Runtime supports Windows 10 and above, only." ) elif __my_system__ == "linux": """Although the 'platform' python module for getting Distro information works well on standard OS images @@ -54,11 +53,11 @@ def check_distro_info(): if int(__my_distro_ver__.split(".")[0]) < 11: warnings.warn( - "Unsupported macOS version (%s). ONNX Runtime supports macOS 11.0 or later." % (__my_distro_ver__) + f"Unsupported macOS version ({__my_distro_ver__}). ONNX Runtime supports macOS 11.0 or later." ) else: warnings.warn( - "Unsupported platform (%s). ONNX Runtime supports Linux, macOS and Windows platforms, only." % __my_system__ + f"Unsupported platform ({__my_system__}). ONNX Runtime supports Linux, macOS and Windows platforms, only." 
) @@ -115,10 +114,10 @@ def validate_build_package_info(): cudart_version = None def print_build_package_info(): - warnings.warn("onnxruntime training package info: package_name: %s" % package_name) - warnings.warn("onnxruntime training package info: __version__: %s" % version) - warnings.warn("onnxruntime training package info: cuda_version: %s" % cuda_version) - warnings.warn("onnxruntime build info: cudart_version: %s" % cudart_version) + warnings.warn(f"onnxruntime training package info: package_name: {package_name}") + warnings.warn(f"onnxruntime training package info: __version__: {version}") + warnings.warn(f"onnxruntime training package info: cuda_version: {cuda_version}") + warnings.warn(f"onnxruntime build info: cudart_version: {cudart_version}") # collection cuda library info from current environment. from onnxruntime.capi.onnxruntime_collect_build_info import find_cudart_versions @@ -127,7 +126,7 @@ def print_build_package_info(): if cudart_version and local_cudart_versions and cudart_version not in local_cudart_versions: print_build_package_info() warnings.warn("WARNING: failed to find cudart version that matches onnxruntime build info") - warnings.warn("WARNING: found cudart versions: %s" % local_cudart_versions) + warnings.warn(f"WARNING: found cudart versions: {local_cudart_versions}") else: # TODO: rcom pass diff --git a/onnxruntime/python/tools/pytorch_export_contrib_ops.py b/onnxruntime/python/tools/pytorch_export_contrib_ops.py index aeb78f03dd721..d8cf3c1304219 100644 --- a/onnxruntime/python/tools/pytorch_export_contrib_ops.py +++ b/onnxruntime/python/tools/pytorch_export_contrib_ops.py @@ -22,7 +22,7 @@ def _reg(symbolic_fn: typing.Callable): - name = "::%s" % symbolic_fn.__name__ + name = f"::{symbolic_fn.__name__}" torch.onnx.register_custom_op_symbolic(name, symbolic_fn, _OPSET_VERSION) _registered_ops.add(name) diff --git a/onnxruntime/python/tools/quantization/calibrate.py b/onnxruntime/python/tools/quantization/calibrate.py index 65875d09102bd..703accbcc1c48 100644 --- a/onnxruntime/python/tools/quantization/calibrate.py +++ b/onnxruntime/python/tools/quantization/calibrate.py @@ -1076,7 +1076,7 @@ def get_entropy_threshold(self, histogram, num_quantized_bins): for i in range(num_half_quantized_bin, zero_bin_index + 1, 1): start_index = zero_bin_index - i - end_index = zero_bin_index + i + 1 if (zero_bin_index + i + 1) <= num_bins else num_bins + end_index = min(zero_bin_index + i + 1, num_bins) thresholds[i - num_half_quantized_bin] = (hist_edges[start_index], hist_edges[end_index]) diff --git a/onnxruntime/python/tools/symbolic_shape_infer.py b/onnxruntime/python/tools/symbolic_shape_infer.py index ac959d5c061f7..f88011c7a2cf9 100755 --- a/onnxruntime/python/tools/symbolic_shape_infer.py +++ b/onnxruntime/python/tools/symbolic_shape_infer.py @@ -24,7 +24,7 @@ def get_attribute(node, attr_name, default_value=None): def get_dim_from_proto(dim): - return getattr(dim, dim.WhichOneof("value")) if type(dim.WhichOneof("value")) is str else None # noqa: E721 + return getattr(dim, dim.WhichOneof("value")) if type(dim.WhichOneof("value")) is str else None def is_sequence(type_proto): @@ -92,19 +92,19 @@ def get_opset(mp, domain=None): def as_scalar(x): - if type(x) == list: # noqa: E721 + if type(x) is list: assert len(x) == 1 return x[0] - elif type(x) == np.ndarray: + elif type(x) is np.ndarray: return x.item() else: return x def as_list(x, keep_none): - if type(x) == list: # noqa: E721 + if type(x) is list: return x - elif type(x) == np.ndarray: + elif type(x) is 
np.ndarray: return list(x) elif keep_none and x is None: return None @@ -113,7 +113,7 @@ def as_list(x, keep_none): def sympy_reduce_product(x): - if type(x) == list: # noqa: E721 + if type(x) is list: value = sympy.Integer(1) for v in x: value = value * v @@ -258,7 +258,7 @@ def __init__(self, int_max, auto_merge, guess_output_rank, verbose, prefix=""): self.prefix_ = prefix def _add_suggested_merge(self, symbols, apply=False): - assert all([(type(s) == str and s in self.symbolic_dims_) or is_literal(s) for s in symbols]) # noqa: E721 + assert all([(type(s) is str and s in self.symbolic_dims_) or is_literal(s) for s in symbols]) symbols = set(symbols) for k, v in self.suggested_merge_.items(): if k in symbols: @@ -278,7 +278,7 @@ def _add_suggested_merge(self, symbols, apply=False): break if map_to is None: for s in symbols: - if type(self.symbolic_dims_[s]) == sympy.Symbol: + if type(self.symbolic_dims_[s]) is sympy.Symbol: map_to = s break # when nothing to map to, use the shorter one @@ -328,7 +328,7 @@ def _preprocess(self, in_mp): ) def _merge_symbols(self, dims): - if not all([type(d) == str for d in dims]): # noqa: E721 + if not all([type(d) is str for d in dims]): if self.auto_merge_: unique_dims = list(set(dims)) is_int = [is_literal(d) for d in unique_dims] @@ -408,7 +408,7 @@ def _get_shape_rank(self, node, idx): def _get_sympy_shape(self, node, idx): sympy_shape = [] for d in self._get_shape(node, idx): - if type(d) == str: # noqa: E721 + if type(d) is str: sympy_shape.append( self.symbolic_dims_[d] if d in self.symbolic_dims_ @@ -590,7 +590,7 @@ def _onnx_infer_subgraph(self, node, subgraph, use_node_input=True, inc_subgraph # for new symbolic dims from subgraph output, add to main graph symbolic dims subgraph_shapes = [get_shape_from_value_info(o) for o in symbolic_shape_inference.out_mp_.graph.output] subgraph_new_symbolic_dims = { - d for s in subgraph_shapes if s for d in s if type(d) == str and d not in self.symbolic_dims_ # noqa: E721 + d for s in subgraph_shapes if s for d in s if type(d) is str and d not in self.symbolic_dims_ } new_dims = {} for d in subgraph_new_symbolic_dims: @@ -610,7 +610,7 @@ def int_or_float(value, allow_float_values): if all([v is not None for v in values]): # some shape compute is in floating point, cast to int for sympy for i, v in enumerate(values): - if type(v) != np.ndarray: + if type(v) is not np.ndarray: continue if len(v.shape) > 1: new_v = None # ignore value for rank > 1 @@ -924,7 +924,7 @@ def _infer_Concat(self, node): # noqa: N802 if all([d == dims[0] for d in dims]): continue merged = self._merge_symbols(dims) - if type(merged) == str: # noqa: E721 + if type(merged) is str: sympy_shape[d] = self.symbolic_dims_[merged] if merged else None else: sympy_shape[d] = merged @@ -1060,7 +1060,7 @@ def _infer_Einsum(self, node): # noqa: N802 dim = shape[-i] if letter not in letter_to_dim: letter_to_dim[letter] = dim - elif type(dim) != sympy.Symbol: + elif type(dim) is not sympy.Symbol: letter_to_dim[letter] = dim num_operands = num_operands + 1 @@ -1127,8 +1127,8 @@ def _infer_Gather(self, node): # noqa: N802 idx = self._try_get_value(node, 1) if idx is not None: data = self.sympy_data_[node.input[0]] - if type(data) == list: # noqa: E721 - if type(idx) == np.ndarray and len(idx.shape) == 1: + if type(data) is list: + if type(idx) is np.ndarray and len(idx.shape) == 1: self.sympy_data_[node.output[0]] = [data[int(i)] for i in idx] else: self.sympy_data_[node.output[0]] = data[int(idx)] @@ -1530,7 +1530,7 @@ def _infer_aten_upsample(self, 
node): new_shape = input_shape[:2] output_size = self._try_get_value(node, 1) if output_size is not None: - new_shape += [dim_size.item() if type(dim_size) == np.int64 else dim_size for dim_size in output_size] + new_shape += [dim_size.item() if type(dim_size) is np.int64 else dim_size for dim_size in output_size] else: rank = len(input_shape) new_shape += [str(self._new_symbolic_dim_from_output(node, 0, i)) for i in range(2, rank)] @@ -1645,7 +1645,7 @@ def _infer_Reshape(self, node): # noqa: N802 deferred_dim_idx = -1 non_deferred_size = 1 for i, d in enumerate(shape_value): - if type(d) == sympy.Symbol: + if type(d) is sympy.Symbol: new_sympy_shape.append(d) elif d == 0: new_sympy_shape.append(input_sympy_shape[i]) @@ -1940,7 +1940,7 @@ def handle_negative_index(index, bound): # handle sympy_data if needed, for slice in shape computation if ( node.input[0] in self.sympy_data_ - and [0] == axes + and axes == [0] and starts is not None and len(starts) == 1 and ends is not None @@ -1949,8 +1949,8 @@ def handle_negative_index(index, bound): and len(steps) == 1 ): input_sympy_data = self.sympy_data_[node.input[0]] - if type(input_sympy_data) == list or ( # noqa: E721 - type(input_sympy_data) == np.array and len(input_sympy_data.shape) == 1 + if type(input_sympy_data) is list or ( + type(input_sympy_data) is np.array and len(input_sympy_data.shape) == 1 ): self.sympy_data_[node.output[0]] = input_sympy_data[starts[0] : ends[0] : steps[0]] @@ -2616,7 +2616,7 @@ def _infer_impl(self, start_sympy_data=None): # some models use None for symbolic dim in input, replace it with a string input_dims[i_dim].dim_param = str(self._new_symbolic_dim(i.name, i_dim)) - self.input_symbols_.update([d for d in input_shape if type(d) == str]) # noqa: E721 + self.input_symbols_.update([d for d in input_shape if type(d) is str]) for s in self.input_symbols_: if s in self.suggested_merge_: diff --git a/onnxruntime/python/tools/tensorrt/perf/benchmark.py b/onnxruntime/python/tools/tensorrt/perf/benchmark.py index 8af074f24acc9..4fa5d0c0ea034 100644 --- a/onnxruntime/python/tools/tensorrt/perf/benchmark.py +++ b/onnxruntime/python/tools/tensorrt/perf/benchmark.py @@ -925,8 +925,8 @@ def find_model_path(path): logger.info(target_model_path) if len(target_model_path) > 1: - logger.error("We expect to find only one model in " + path) # noqa: G003 - raise + logger.error("We expect to find only one model in %s", path) + raise RuntimeError return target_model_path[0] @@ -1007,7 +1007,7 @@ def parse_models_info_from_file(root_dir, path, models): models[row["model_name"]] = {} else: logger.error("Model name must be provided in models_info.json") - raise + raise RuntimeError model = models[row["model_name"]] @@ -1018,19 +1018,19 @@ def parse_models_info_from_file(root_dir, path, models): model["working_directory"] = os.path.join(root_working_directory, row["working_directory"]) else: logger.error("Model path must be provided in models_info.json") - raise + raise RuntimeError if "model_path" in row: model["model_path"] = row["model_path"] else: logger.error("Model path must be provided in models_info.json") - raise + raise RuntimeError if "test_data_path" in row: model["test_data_path"] = row["test_data_path"] else: logger.error("Test data path must be provided in models_info.json") - raise + raise RuntimeError if "model_path_fp16" in row: model["model_path_fp16"] = row["model_path_fp16"] diff --git a/onnxruntime/python/tools/tensorrt/perf/perf_utils.py b/onnxruntime/python/tools/tensorrt/perf/perf_utils.py index 
c447bf9cffe27..0d0f7cc48f361 100644 --- a/onnxruntime/python/tools/tensorrt/perf/perf_utils.py +++ b/onnxruntime/python/tools/tensorrt/perf/perf_utils.py @@ -234,7 +234,7 @@ def calculate_trt_op_percentage(trt_op_map, cuda_op_map): if total_ops == 0: print("Error ...") - raise + raise RuntimeError if len(trt_op_map) == 0: total_cuda_and_cpu_ops = total_ops diff --git a/onnxruntime/python/tools/tensorrt/perf/setup_scripts/setup_onnx_zoo.py b/onnxruntime/python/tools/tensorrt/perf/setup_scripts/setup_onnx_zoo.py index 4f763ad84426d..0532dd7c72c1c 100644 --- a/onnxruntime/python/tools/tensorrt/perf/setup_scripts/setup_onnx_zoo.py +++ b/onnxruntime/python/tools/tensorrt/perf/setup_scripts/setup_onnx_zoo.py @@ -71,7 +71,7 @@ def write_json(models): def main(): links = [] with open("links.txt") as fh: - links = [link.rstrip() for link in fh.readlines()] + links = [link.rstrip() for link in fh] model_list = [] for link in links: diff --git a/onnxruntime/python/tools/transformers/benchmark.py b/onnxruntime/python/tools/transformers/benchmark.py index 5ec2ab4e50799..4800c48744236 100644 --- a/onnxruntime/python/tools/transformers/benchmark.py +++ b/onnxruntime/python/tools/transformers/benchmark.py @@ -802,7 +802,7 @@ def main(): try: os.mkdir(args.cache_dir) except OSError: - logger.error("Creation of the directory %s failed" % args.cache_dir) # noqa: G002 + logger.error("Creation of the directory %s failed", args.cache_dir) enable_torch = "torch" in args.engines enable_torch2 = "torch2" in args.engines diff --git a/onnxruntime/python/tools/transformers/bert_test_data.py b/onnxruntime/python/tools/transformers/bert_test_data.py index aa82e047df328..167fc8697ce06 100644 --- a/onnxruntime/python/tools/transformers/bert_test_data.py +++ b/onnxruntime/python/tools/transformers/bert_test_data.py @@ -168,11 +168,11 @@ def output_test_data(directory: str, inputs: Dict[str, np.ndarray]): try: os.mkdir(directory) except OSError: - print("Creation of the directory %s failed" % directory) + print(f"Creation of the directory {directory} failed") else: - print("Successfully created the directory %s " % directory) + print(f"Successfully created the directory {directory} ") else: - print("Warning: directory %s existed. Files will be overwritten." % directory) + print(f"Warning: directory {directory} existed. 
Files will be overwritten.") for index, (name, data) in enumerate(inputs.items()): tensor = numpy_helper.from_array(data, name) diff --git a/onnxruntime/python/tools/transformers/fusion_attention.py b/onnxruntime/python/tools/transformers/fusion_attention.py index f48cabd25fc5c..dc2b38f3928ac 100644 --- a/onnxruntime/python/tools/transformers/fusion_attention.py +++ b/onnxruntime/python/tools/transformers/fusion_attention.py @@ -672,7 +672,7 @@ def create_multihead_attention_node( q_matmul, k_matmul, v_matmul, q_add, k_add, v_add, num_heads ) mha_inputs.extend([q_slice.output[0], k_slice.output[0], v_slice.output[0]]) - elif type(k_matmul) == NodeProto and type(v_matmul) == NodeProto: + elif type(k_matmul) is NodeProto and type(v_matmul) is NodeProto: if self.disable_multi_head_attention_bias: mha_inputs.extend([q_add.output[0], k_matmul.output[0], v_add.output[0]]) else: diff --git a/onnxruntime/python/tools/transformers/fusion_utils.py b/onnxruntime/python/tools/transformers/fusion_utils.py index 726c587ff7043..dbd9e828198ca 100644 --- a/onnxruntime/python/tools/transformers/fusion_utils.py +++ b/onnxruntime/python/tools/transformers/fusion_utils.py @@ -159,7 +159,7 @@ def transpose_2d_int8_tensor(tensor: onnx_proto.TensorProto): tensor (TensorProto): transposed tensor """ if not isinstance(tensor, onnx_proto.TensorProto): - raise ValueError("Expected input type is an ONNX TensorProto but got %s" % type(tensor)) + raise ValueError(f"Expected input type is an ONNX TensorProto but got {type(tensor)}") if len(tensor.dims) != 2 or tensor.data_type != onnx_proto.TensorProto.INT8: raise ValueError("Only INT8 2-D tensors can be transposed") diff --git a/onnxruntime/python/tools/transformers/models/bart/utils/export_summarization_edinit.py b/onnxruntime/python/tools/transformers/models/bart/utils/export_summarization_edinit.py index 111520a6e3aeb..8a610fb17671b 100644 --- a/onnxruntime/python/tools/transformers/models/bart/utils/export_summarization_edinit.py +++ b/onnxruntime/python/tools/transformers/models/bart/utils/export_summarization_edinit.py @@ -205,5 +205,5 @@ def export_encoder(args): no_repeat_ngram_size=no_repeat_ngram_size, ) time_cost = time.time() - start_time - print("--- %s seconds ---" % (time_cost)) + print(f"--- {time_cost} seconds ---") print(tokenizer.decode(pred_ids[0], skip_special_tokens=True, clean_up_tokenization_spaces=False)) diff --git a/onnxruntime/python/tools/transformers/models/bart/utils/export_summarization_enc_dec_past.py b/onnxruntime/python/tools/transformers/models/bart/utils/export_summarization_enc_dec_past.py index 29c39730c79ef..afd01ae9d025f 100644 --- a/onnxruntime/python/tools/transformers/models/bart/utils/export_summarization_enc_dec_past.py +++ b/onnxruntime/python/tools/transformers/models/bart/utils/export_summarization_enc_dec_past.py @@ -266,5 +266,5 @@ def export_decoder(args): use_cache=True, ) time_cost = time.time() - start_time - print("--- %s seconds ---" % (time_cost)) + print(f"--- {time_cost} seconds ---") print(tokenizer.decode(pred_ids[0], skip_special_tokens=True, clean_up_tokenization_spaces=False)) diff --git a/onnxruntime/python/tools/transformers/models/bart/utils/onnx_inference.py b/onnxruntime/python/tools/transformers/models/bart/utils/onnx_inference.py index c4c8a2dcba697..7a5cfe42c7efe 100644 --- a/onnxruntime/python/tools/transformers/models/bart/utils/onnx_inference.py +++ b/onnxruntime/python/tools/transformers/models/bart/utils/onnx_inference.py @@ -49,7 +49,7 @@ def run_inference(args): 
no_repeat_ngram_size=no_repeat_ngram_size, ) time_cost = time.time() - start_time - print("--- %s seconds ---" % (time_cost)) + print(f"--- {time_cost} seconds ---") for j in range(batch_num): for i in range(beam): print( @@ -81,7 +81,7 @@ def run_inference(args): start_time = time.time() out = sess.run(None, ort_inputs) time_cost = time.time() - start_time - print("--- %s seconds ---" % (time_cost)) + print(f"--- {time_cost} seconds ---") for j in range(batch_num): for i in range(beam): print( diff --git a/onnxruntime/python/tools/transformers/models/stable_diffusion/engine_builder.py b/onnxruntime/python/tools/transformers/models/stable_diffusion/engine_builder.py index 26b9a2792e9e1..0b6d325803554 100644 --- a/onnxruntime/python/tools/transformers/models/stable_diffusion/engine_builder.py +++ b/onnxruntime/python/tools/transformers/models/stable_diffusion/engine_builder.py @@ -117,7 +117,7 @@ def get_cached_model_name(self, model_name): model_name = model_name + "_" + "_".join(self.pipeline_info.controlnet) if hash_source: - model_name += "_" + hashlib.md5("\t".join(hash_source).encode("utf-8")).digest().hex()[:8] + model_name += "_" + hashlib.md5("\t".join(hash_source).encode("utf-8")).hexdigest()[:8] # TODO: When we support original VAE, we shall save custom VAE to another directory. diff --git a/onnxruntime/python/tools/transformers/models/stable_diffusion/pipeline_stable_diffusion.py b/onnxruntime/python/tools/transformers/models/stable_diffusion/pipeline_stable_diffusion.py index 1629537dc294f..522cc541c1e57 100644 --- a/onnxruntime/python/tools/transformers/models/stable_diffusion/pipeline_stable_diffusion.py +++ b/onnxruntime/python/tools/transformers/models/stable_diffusion/pipeline_stable_diffusion.py @@ -459,9 +459,9 @@ def denoise_latent( noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) noise_pred = noise_pred_uncond + guidance * (noise_pred_text - noise_pred_uncond) - if type(self.scheduler) == UniPCMultistepScheduler: + if type(self.scheduler) is UniPCMultistepScheduler: latents = self.scheduler.step(noise_pred, timestep, latents, return_dict=False)[0] - elif type(self.scheduler) == LCMScheduler: + elif type(self.scheduler) is LCMScheduler: latents = self.scheduler.step(noise_pred, timestep, latents, generator=self.generator)[0] else: latents = self.scheduler.step(noise_pred, latents, step_offset + step_index, timestep) diff --git a/onnxruntime/test/framework/allocation_planner_test.cc b/onnxruntime/test/framework/allocation_planner_test.cc index 26e40b25930c8..4e9e80b180e9c 100644 --- a/onnxruntime/test/framework/allocation_planner_test.cc +++ b/onnxruntime/test/framework/allocation_planner_test.cc @@ -1883,7 +1883,7 @@ TEST_F(PlannerTest, ParaPlanCreation) { ORT_ENFORCE(main_graph_ort_value_index_map.GetName(per_value_plan.reused_buffer, reused).IsOK()); reuse_pairs.erase(reused); } // if - } // for + } // for ASSERT_TRUE(reuse_pairs.empty()); } diff --git a/onnxruntime/test/onnx/OrtValueList.h b/onnxruntime/test/onnx/OrtValueList.h index 2929cdca428d9..921c1d3872111 100644 --- a/onnxruntime/test/onnx/OrtValueList.h +++ b/onnxruntime/test/onnx/OrtValueList.h @@ -14,7 +14,7 @@ class OrtValueArray { public: ORT_DISALLOW_COPY_ASSIGNMENT_AND_MOVE(OrtValueArray); // n must be non-negative - OrtValueArray(int n) : values(static_cast(n), nullptr){}; + OrtValueArray(int n) : values(static_cast(n), nullptr) {}; ~OrtValueArray() { for (OrtValue* v : values) { if (v != nullptr) Ort::GetApi().ReleaseValue(v); diff --git a/onnxruntime/test/onnx/microbenchmark/activation.cc 
b/onnxruntime/test/onnx/microbenchmark/activation.cc index 69ee72996365e..df36135bd3017 100644 --- a/onnxruntime/test/onnx/microbenchmark/activation.cc +++ b/onnxruntime/test/onnx/microbenchmark/activation.cc @@ -27,7 +27,7 @@ class Allocs : public IExecutionProvider { std::shared_ptr alloc = std::make_shared(); public: - Allocs() : IExecutionProvider("fake"){}; + Allocs() : IExecutionProvider("fake") {}; AllocatorPtr GetAllocator(OrtMemType) const { return alloc; } diff --git a/onnxruntime/test/optimizer/qdq_transformer_fastmath_test.cc b/onnxruntime/test/optimizer/qdq_transformer_fastmath_test.cc index ec9f78da14a75..ccfa1f1159937 100644 --- a/onnxruntime/test/optimizer/qdq_transformer_fastmath_test.cc +++ b/onnxruntime/test/optimizer/qdq_transformer_fastmath_test.cc @@ -401,7 +401,7 @@ void QDQTransformerGemmTests(bool has_output_q, bool has_bias, bool beta_not_one auto check_binary_op_graph = [&](InferenceSessionWrapper& session) { auto op_to_count = CountOpsInGraph(session.GetGraph()); const QDQOpKeys qdq_keys = GetQDQOpKeys(use_contrib_qdq); - if ((!has_output_q || std::is_same_v)&&(!has_bias || (std::is_same_v && !beta_not_one)) && + if ((!has_output_q || std::is_same_v) && (!has_bias || (std::is_same_v && !beta_not_one)) && (std::is_same_v || std::is_same_v)) { EXPECT_EQ(op_to_count["com.microsoft.QGemm"], 1); EXPECT_EQ(op_to_count["Gemm"], 0); diff --git a/onnxruntime/test/optimizer/qdq_transformer_test.cc b/onnxruntime/test/optimizer/qdq_transformer_test.cc index 1638851daf65a..14c5b60d6e0bd 100644 --- a/onnxruntime/test/optimizer/qdq_transformer_test.cc +++ b/onnxruntime/test/optimizer/qdq_transformer_test.cc @@ -786,7 +786,7 @@ void QDQTransformerGemmTests(bool has_output_q, bool has_bias, bool beta_not_one auto check_binary_op_graph = [&](InferenceSessionWrapper& session) { auto op_to_count = CountOpsInGraph(session.GetGraph()); const QDQOpKeys qdq_keys = GetQDQOpKeys(use_contrib_qdq); - if ((!has_output_q || std::is_same_v)&&(!has_bias || (std::is_same_v && !beta_not_one)) && + if ((!has_output_q || std::is_same_v) && (!has_bias || (std::is_same_v && !beta_not_one)) && (std::is_same_v || std::is_same_v)) { EXPECT_EQ(op_to_count["com.microsoft.QGemm"], 1); EXPECT_EQ(op_to_count["Gemm"], 0); diff --git a/onnxruntime/test/providers/cpu/reduction/reduction_test_cases_generator.py b/onnxruntime/test/providers/cpu/reduction/reduction_test_cases_generator.py index 568a4649f3977..bd06ae9fe881a 100644 --- a/onnxruntime/test/providers/cpu/reduction/reduction_test_cases_generator.py +++ b/onnxruntime/test/providers/cpu/reduction/reduction_test_cases_generator.py @@ -40,13 +40,13 @@ def TestReduction(op, data, axes, keepdims): # noqa: N802 def PrintResult(op, axes, keepdims, res): # noqa: N802 - print(' {"%s",' % op) + print(f' {{"{op}",') print("OpAttributesResult(") print(" // ReductionAttribute") print(" {") print(" // axes_") print("{", end="") - print(*axes, sep=", ", end="") if axes else print("") + print(*axes, sep=", ", end="") if axes else print() print("},") print(" // keep_dims_") print(keepdims, ",") @@ -60,7 +60,7 @@ def PrintResult(op, axes, keepdims, res): # noqa: N802 print(" // expected values") print("{", end="") for i in range(res.size): - print("%5.6ff," % res.item(i)) + print(f"{res.item(i):5.6f}f,") print("})},") @@ -130,7 +130,7 @@ def PrintReenableOptimizations(): # noqa: N802 print("{") for i in range(input_data.size): print( - "%5.6ff," % input_data.item(i), + f"{input_data.item(i):5.6f}f,", ) print("},") print("// input_dims") diff --git 
a/onnxruntime/test/providers/cpu/tensor/pad_test.cc b/onnxruntime/test/providers/cpu/tensor/pad_test.cc index 5fc8ed417391e..1d9cd15f53327 100644 --- a/onnxruntime/test/providers/cpu/tensor/pad_test.cc +++ b/onnxruntime/test/providers/cpu/tensor/pad_test.cc @@ -66,13 +66,13 @@ static void RunAllOpsetAllDomainPadTests( bool pads_is_initializer; bool value_is_initializer; }; - const std::vector all_test_params { - {false, false}, + const std::vector all_test_params{ + {false, false}, #if (defined(USE_NNAPI) && defined(__ANDROID__)) || (defined(USE_COREML) && defined(__APPLE__)) - // only enable when building NNAPI EP on Android or building CoreML EP for Apple environment - // test runs out of memory in QEMU aarch64 environment, so don't enable otherwise - // TODO try to enable when we move from QEMU to arm64 CI machines - {true, true}, + // only enable when building NNAPI EP on Android or building CoreML EP for Apple environment + // test runs out of memory in QEMU aarch64 environment, so don't enable otherwise + // TODO try to enable when we move from QEMU to arm64 CI machines + {true, true}, #endif }; for (const auto& test_params : all_test_params) { diff --git a/onnxruntime/test/providers/qnn/qnn_basic_test.cc b/onnxruntime/test/providers/qnn/qnn_basic_test.cc index 9489d354755e4..9d19c36dc94b2 100644 --- a/onnxruntime/test/providers/qnn/qnn_basic_test.cc +++ b/onnxruntime/test/providers/qnn/qnn_basic_test.cc @@ -835,14 +835,14 @@ TEST_F(QnnHTPBackendTests, HTPGraphFinalizationOptimizationModes) { // Test that models run with various SoC model values TEST_F(QnnHTPBackendTests, HTPSocModels) { - constexpr std::array soc_models = { "", // No explicit SoC model specified - "0", // "Unknown" + constexpr std::array soc_models = {"", // No explicit SoC model specified + "0", // "Unknown" #if defined(_M_ARM64) - "37" }; // SC8280X + "37"}; // SC8280X #elif defined(__linux__) - "30" }; // SM8350 + "30"}; // SM8350 #else - "" }; + ""}; #endif for (auto soc_model : soc_models) { diff --git a/onnxruntime/test/python/onnx_backend_test_series.py b/onnxruntime/test/python/onnx_backend_test_series.py index 6eebc996fde9c..9b1e87f6ec02e 100644 --- a/onnxruntime/test/python/onnx_backend_test_series.py +++ b/onnxruntime/test/python/onnx_backend_test_series.py @@ -76,7 +76,7 @@ def apply_filters(filters, category): opset_version = f"opset{onnx.defs.onnx_opset_version()}" validated_filters = [] for f in filters[category]: - if type(f) is list: # noqa: E721 + if type(f) is list: opset_regex = f[0] filter_regex = f[1] opset_match = re.match(opset_regex, opset_version) diff --git a/onnxruntime/test/python/transformers/rotary_flash.py b/onnxruntime/test/python/transformers/rotary_flash.py index 42bff9c92b41b..4329b2c1a6057 100644 --- a/onnxruntime/test/python/transformers/rotary_flash.py +++ b/onnxruntime/test/python/transformers/rotary_flash.py @@ -486,9 +486,6 @@ def backward(ctx, dkv): return dkv, None, None, None, None -apply_rotary_emb_kv_ = ApplyRotaryEmbKV.apply - - def apply_rotary_emb_kv_( kv, cos, diff --git a/onnxruntime/test/python/transformers/test_data/bert_squad_tensorflow2.1_keras2onnx_opset11/generate_tiny_keras2onnx_bert_models.py b/onnxruntime/test/python/transformers/test_data/bert_squad_tensorflow2.1_keras2onnx_opset11/generate_tiny_keras2onnx_bert_models.py index 0086ce0d289c7..c1e95f35a633b 100644 --- a/onnxruntime/test/python/transformers/test_data/bert_squad_tensorflow2.1_keras2onnx_opset11/generate_tiny_keras2onnx_bert_models.py +++ 
b/onnxruntime/test/python/transformers/test_data/bert_squad_tensorflow2.1_keras2onnx_opset11/generate_tiny_keras2onnx_bert_models.py @@ -343,9 +343,9 @@ def generate_test_data( try: os.mkdir(path) except OSError: - print("Creation of the directory %s failed" % path) + print(f"Creation of the directory {path} failed") else: - print("Successfully created the directory %s " % path) + print(f"Successfully created the directory {path} ") if input_tensor_only: return diff --git a/onnxruntime/test/python/transformers/test_data/gpt2_pytorch1.5_opset11/generate_tiny_gpt2_model.py b/onnxruntime/test/python/transformers/test_data/gpt2_pytorch1.5_opset11/generate_tiny_gpt2_model.py index 065783d5812a8..4a4a0bc2c5098 100644 --- a/onnxruntime/test/python/transformers/test_data/gpt2_pytorch1.5_opset11/generate_tiny_gpt2_model.py +++ b/onnxruntime/test/python/transformers/test_data/gpt2_pytorch1.5_opset11/generate_tiny_gpt2_model.py @@ -452,9 +452,9 @@ def generate_test_data( try: os.mkdir(path) except OSError: - print("Creation of the directory %s failed" % path) + print(f"Creation of the directory {path} failed") else: - print("Successfully created the directory %s " % path) + print(f"Successfully created the directory {path} ") sess_options = onnxruntime.SessionOptions() sess_options.graph_optimization_level = onnxruntime.GraphOptimizationLevel.ORT_DISABLE_ALL diff --git a/onnxruntime/test/shared_lib/custom_op_utils.h b/onnxruntime/test/shared_lib/custom_op_utils.h index 8ead4056b1b54..e11540aaa5691 100644 --- a/onnxruntime/test/shared_lib/custom_op_utils.h +++ b/onnxruntime/test/shared_lib/custom_op_utils.h @@ -381,9 +381,9 @@ struct StandaloneCustomOp : Ort::CustomOpBase { @@ -397,9 +397,9 @@ struct MulTopOpFloat : Ort::CustomOpBase { }; struct MulTopKernelInt32 { - MulTopKernelInt32(const OrtKernelInfo*){}; + MulTopKernelInt32(const OrtKernelInfo*) {}; ~MulTopKernelInt32() = default; - void Compute(OrtKernelContext*){}; + void Compute(OrtKernelContext*) {}; }; struct MulTopOpInt32 : Ort::CustomOpBase { @@ -413,9 +413,9 @@ struct MulTopOpInt32 : Ort::CustomOpBase { }; struct MulTopKernelDouble { - MulTopKernelDouble(const OrtKernelInfo*){}; + MulTopKernelDouble(const OrtKernelInfo*) {}; ~MulTopKernelDouble() = default; - void Compute(OrtKernelContext*){}; + void Compute(OrtKernelContext*) {}; }; // MulTopOpDouble and MulTopOpFloat has input count mismatch @@ -430,9 +430,9 @@ struct MulTopOpDouble : Ort::CustomOpBase { }; struct MulTopKernelInt16 { - MulTopKernelInt16(const OrtKernelInfo*){}; + MulTopKernelInt16(const OrtKernelInfo*) {}; ~MulTopKernelInt16() = default; - void Compute(OrtKernelContext*){}; + void Compute(OrtKernelContext*) {}; }; // MulTopOpInt16 and MulTopOpFloat has output count mismatch @@ -448,9 +448,9 @@ struct MulTopOpInt16 : Ort::CustomOpBase { // MulTopKernelFloat16 and MulTopOpFloat has input characteristic mismatch struct MulTopKernelFloat16 { - MulTopKernelFloat16(const OrtKernelInfo*){}; + MulTopKernelFloat16(const OrtKernelInfo*) {}; ~MulTopKernelFloat16() = default; - void Compute(OrtKernelContext*){}; + void Compute(OrtKernelContext*) {}; }; struct MulTopOpFloat16 : Ort::CustomOpBase { diff --git a/onnxruntime/test/testdata/CNTK/gen.py b/onnxruntime/test/testdata/CNTK/gen.py index 37241a46808b5..5a3ca461f471a 100644 --- a/onnxruntime/test/testdata/CNTK/gen.py +++ b/onnxruntime/test/testdata/CNTK/gen.py @@ -48,10 +48,10 @@ def Save(dir, func, feed, outputs): # noqa: N802 if actual_input_name.startswith(cntk_name): cntk_to_actual_names[cntk_name] = actual_input_name - if 
type(feed) is not dict: # noqa: E721 + if type(feed) is not dict: feed = {func.arguments[0]: feed} - if type(outputs) is not dict: # noqa: E721 + if type(outputs) is not dict: outputs = {func.outputs[0]: outputs} test_data_dir = os.path.join(dir, data_dir) diff --git a/orttraining/orttraining/core/framework/adasum/adasum_mpi.cc b/orttraining/orttraining/core/framework/adasum/adasum_mpi.cc index 805de812cfa65..dc812ee2aec3f 100644 --- a/orttraining/orttraining/core/framework/adasum/adasum_mpi.cc +++ b/orttraining/orttraining/core/framework/adasum/adasum_mpi.cc @@ -35,8 +35,7 @@ void AdasumMPI::InitializeVHDDReductionComms(WorkerGroupType worker_group) { int nearest_power_2 = 1; int log_size; for (nearest_power_2 = 1, log_size = 0; (nearest_power_2 << 1) <= size; - nearest_power_2 = (nearest_power_2 << 1), log_size++) - ; + nearest_power_2 = (nearest_power_2 << 1), log_size++); int shift_val; int level; reduction_comms_ = std::make_unique>(); diff --git a/orttraining/orttraining/core/framework/pipeline.h b/orttraining/orttraining/core/framework/pipeline.h index a93ba1081d7df..79701106c9c1d 100644 --- a/orttraining/orttraining/core/framework/pipeline.h +++ b/orttraining/orttraining/core/framework/pipeline.h @@ -247,7 +247,7 @@ struct PipelineWorkerState { struct PipelineWorkerPool { PipelineWorkerPool() = default; - PipelineWorkerPool(size_t num_workers) : workers(num_workers), worker_states(num_workers){}; + PipelineWorkerPool(size_t num_workers) : workers(num_workers), worker_states(num_workers) {}; void Join(size_t worker_id); void JoinAll(); diff --git a/orttraining/orttraining/core/framework/torch/custom_function_register.h b/orttraining/orttraining/core/framework/torch/custom_function_register.h index 762258a45221e..ddb838ba6475c 100644 --- a/orttraining/orttraining/core/framework/torch/custom_function_register.h +++ b/orttraining/orttraining/core/framework/torch/custom_function_register.h @@ -102,7 +102,7 @@ class OrtTorchFunctionPool final { void UnRegisterFunctions(); private: - OrtTorchFunctionPool(){}; + OrtTorchFunctionPool() {}; ORT_DISALLOW_COPY_ASSIGNMENT_AND_MOVE(OrtTorchFunctionPool); void UnRegisterGlobalFunctions(); diff --git a/orttraining/orttraining/core/framework/torch/gil.h b/orttraining/orttraining/core/framework/torch/gil.h index c928571d2024a..b14b062785eef 100644 --- a/orttraining/orttraining/core/framework/torch/gil.h +++ b/orttraining/orttraining/core/framework/torch/gil.h @@ -13,7 +13,7 @@ // See https://docs.python.org/3/c-api/init.html#non-python-created-threads for details. 
class GilGuard { public: - GilGuard() : state_(PyGILState_Ensure()){}; + GilGuard() : state_(PyGILState_Ensure()) {}; ~GilGuard() { PyGILState_Release(state_); }; private: diff --git a/orttraining/orttraining/core/framework/torch/torch_proxy.h b/orttraining/orttraining/core/framework/torch/torch_proxy.h index b80acd6c4791a..37766e67ef42f 100644 --- a/orttraining/orttraining/core/framework/torch/torch_proxy.h +++ b/orttraining/orttraining/core/framework/torch/torch_proxy.h @@ -95,8 +95,8 @@ class TorchProxy { std::vector& bw_output_to_input_alias_map); private: - TorchProxy(){}; - ~TorchProxy(){}; + TorchProxy() {}; + ~TorchProxy() {}; ORT_DISALLOW_COPY_ASSIGNMENT_AND_MOVE(TorchProxy); diff --git a/orttraining/orttraining/core/graph/graph_augmenter.h b/orttraining/orttraining/core/graph/graph_augmenter.h index eb146ca0e84f3..c3b6d227f01fd 100644 --- a/orttraining/orttraining/core/graph/graph_augmenter.h +++ b/orttraining/orttraining/core/graph/graph_augmenter.h @@ -33,7 +33,7 @@ struct OpDef { OpDef(const std::string& type, const std::string& domain = kOnnxDomain, const int opset_version = 9) : type(type), domain(domain), - opset_version(opset_version){}; + opset_version(opset_version) {}; std::string type; std::string domain; @@ -52,7 +52,7 @@ struct NodeDef { output_args(output_args), attributes(attributes), name(name), - priority(priority){}; + priority(priority) {}; NodeDef(const std::string& op_type, const std::vector& input_args, @@ -64,7 +64,7 @@ struct NodeDef { output_args(output_args), attributes(attributes), name(name), - priority(priority){}; + priority(priority) {}; NodeDef(const OpDef& op_def, const std::vector& input_args, diff --git a/orttraining/orttraining/core/graph/loss_func/loss_func_common.h b/orttraining/orttraining/core/graph/loss_func/loss_func_common.h index 2b60280e076aa..61bc0a094dac4 100644 --- a/orttraining/orttraining/core/graph/loss_func/loss_func_common.h +++ b/orttraining/orttraining/core/graph/loss_func/loss_func_common.h @@ -21,7 +21,7 @@ struct LossFunctionInfo { struct ILossFunction { virtual GraphAugmenter::GraphDefs operator()(const Graph& graph, const LossFunctionInfo& loss_func_info) = 0; - virtual ~ILossFunction(){}; + virtual ~ILossFunction() {}; }; TypeProto* GetSparseTypeProto(const NodeArg* input_arg, diff --git a/orttraining/orttraining/core/graph/pipeline_transformer.cc b/orttraining/orttraining/core/graph/pipeline_transformer.cc index f989d53aa85d5..3495c3da72e3d 100644 --- a/orttraining/orttraining/core/graph/pipeline_transformer.cc +++ b/orttraining/orttraining/core/graph/pipeline_transformer.cc @@ -887,7 +887,7 @@ struct PipelineStageNodeGroup { // the consumer nodes of a particular initializer can be more than one, so we need a vector to store those // nodes. std::vector nodes; - PipelineStageNodeGroup(const size_t stage, std::vector& node_group) : stage_id(stage), nodes(std::move(node_group)){}; + PipelineStageNodeGroup(const size_t stage, std::vector& node_group) : stage_id(stage), nodes(std::move(node_group)) {}; }; // This function passes through the given initializer across stages specified in node_groups[i].stage_id. 
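For readers of the adasum_mpi.cc hunk above: the reformatted statement is a for loop with an intentionally empty body (the null-statement ';' now sits on the same line), and it computes the largest power of two not exceeding the worker-group size together with its base-2 logarithm. A minimal, self-contained C++ sketch of that computation, using an illustrative size value rather than anything taken from the ORT code, is:

#include <iostream>

int main() {
  int size = 12;  // illustrative worker-group size, not a value from the patch
  int nearest_power_2 = 1;
  int log_size = 0;
  // Empty-bodied loop: all the work happens in the init/condition/increment clauses.
  for (nearest_power_2 = 1, log_size = 0; (nearest_power_2 << 1) <= size;
       nearest_power_2 = (nearest_power_2 << 1), log_size++);
  std::cout << nearest_power_2 << " " << log_size << "\n";  // prints "8 3" for size == 12
  return 0;
}
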
diff --git a/orttraining/orttraining/core/optimizer/megatron_transformer.cc b/orttraining/orttraining/core/optimizer/megatron_transformer.cc index 4ebea5cf386cc..25e16304789b6 100644 --- a/orttraining/orttraining/core/optimizer/megatron_transformer.cc +++ b/orttraining/orttraining/core/optimizer/megatron_transformer.cc @@ -21,7 +21,7 @@ struct OpInfo { const size_t output_count = 1) : op_type(op_type), supported_versions(supported_versions), domain(domain), - output_count(output_count){}; + output_count(output_count) {}; std::string op_type; std::initializer_list supported_versions; @@ -53,7 +53,7 @@ const OpInfo where_info = OpInfo("Where", opset_v9); struct NodeInfo { NodeInfo(const std::vector& op_infos, const bool required = true) : op_infos(op_infos), - required(required){}; + required(required) {}; std::vector op_infos; bool required; diff --git a/orttraining/orttraining/core/session/training_session.h b/orttraining/orttraining/core/session/training_session.h index 37b708fb7d1dd..765f88e1c992e 100644 --- a/orttraining/orttraining/core/session/training_session.h +++ b/orttraining/orttraining/core/session/training_session.h @@ -46,7 +46,7 @@ class TrainingSession : public InferenceSession { TrainingSession(const SessionOptions& session_options, const Environment& env) : InferenceSession(session_options, env), is_mixed_precision_enabled_(false) {} - virtual ~TrainingSession(){}; + virtual ~TrainingSession() {}; /** * The training configuration options. @@ -215,11 +215,11 @@ class TrainingSession : public InferenceSession { // If the edge is unique, i.e. only have one consumer node, or all the edges // with the same node_arg_name needs to be cut, specify the node_arg_name // suffices. - CutEdge(std::string edge) : node_arg_name(edge){}; + CutEdge(std::string edge) : node_arg_name(edge) {}; // If the edges with same node_arg_name belongs to different cut, i.e. some of its // consumer node belongs to one partition, and some belongs to another, specify // the consumer node names which you want to perform the cut on. - CutEdge(std::string edge, std::vector nodes) : node_arg_name(edge), consumer_nodes(nodes){}; + CutEdge(std::string edge, std::vector nodes) : node_arg_name(edge), consumer_nodes(nodes) {}; }; // CutInfo is a group of CutEdges that describes a specific cut that composed of splitting those edges. typedef std::vector CutInfo; diff --git a/orttraining/orttraining/lazy_tensor/flags.h b/orttraining/orttraining/lazy_tensor/flags.h index b849f9f9a0a3e..1812466d10346 100644 --- a/orttraining/orttraining/lazy_tensor/flags.h +++ b/orttraining/orttraining/lazy_tensor/flags.h @@ -60,7 +60,7 @@ class DynamicSettings { } private: - DynamicSettings() : onnx_fusion_status_(true){}; + DynamicSettings() : onnx_fusion_status_(true) {}; bool onnx_fusion_status_; }; diff --git a/orttraining/orttraining/models/bert/main.cc b/orttraining/orttraining/models/bert/main.cc index 22cdd9351a206..c4c7a98ba116a 100644 --- a/orttraining/orttraining/models/bert/main.cc +++ b/orttraining/orttraining/models/bert/main.cc @@ -861,8 +861,7 @@ int main(int argc, char* argv[]) { OrtParameters ort_params{}; RETURN_IF_FAIL(ParseArguments(argc, argv, params, ort_params)); bool keep_looping = params.debug_break; - while (keep_looping) - ; + while (keep_looping); // setup logger, be noted: LOGS_DEFAULT must be after logging manager initialization. 
string default_logger_id{"Default"}; diff --git a/orttraining/orttraining/models/pipeline_poc/main.cc b/orttraining/orttraining/models/pipeline_poc/main.cc index c461e4bbf3600..1b7d6b9ea26f6 100644 --- a/orttraining/orttraining/models/pipeline_poc/main.cc +++ b/orttraining/orttraining/models/pipeline_poc/main.cc @@ -86,36 +86,36 @@ int main(int argc, char* argv[]) { // setup onnxruntime env std::vector overrides = {}; SessionOptions so = { - ExecutionMode::ORT_SEQUENTIAL, // execution_mode - ExecutionOrder::DEFAULT, // execution_order - false, // enable_profiling - ORT_TSTR(""), // optimized_model_filepath - true, // enable_mem_pattern - true, // enable_mem_reuse - true, // enable_cpu_mem_arena - ORT_TSTR("onnxruntime_profile_"), // profile_file_prefix - "", // session_logid - -1, // session_log_severity_level - 0, // session_log_verbosity_level - 5, // max_num_graph_transformation_steps - TransformerLevel::Level1, // graph_optimization_level - {}, // intra_op_param - {}, // inter_op_param - overrides, // free_dimension_overrides - true, // use_per_session_threads - true, // thread_pool_allow_spinning - false, // use_deterministic_compute - {}, // session_configurations - {}, // initializers_to_share_map + ExecutionMode::ORT_SEQUENTIAL, // execution_mode + ExecutionOrder::DEFAULT, // execution_order + false, // enable_profiling + ORT_TSTR(""), // optimized_model_filepath + true, // enable_mem_pattern + true, // enable_mem_reuse + true, // enable_cpu_mem_arena + ORT_TSTR("onnxruntime_profile_"), // profile_file_prefix + "", // session_logid + -1, // session_log_severity_level + 0, // session_log_verbosity_level + 5, // max_num_graph_transformation_steps + TransformerLevel::Level1, // graph_optimization_level + {}, // intra_op_param + {}, // inter_op_param + overrides, // free_dimension_overrides + true, // use_per_session_threads + true, // thread_pool_allow_spinning + false, // use_deterministic_compute + {}, // session_configurations + {}, // initializers_to_share_map #if !defined(ORT_MINIMAL_BUILD) && !defined(DISABLE_EXTERNAL_INITIALIZERS) - {}, // external_initializers - {}, // external_initializer_files + {}, // external_initializers + {}, // external_initializer_files #endif - nullptr, // custom_create_thread_fn - nullptr, // custom_thread_creation_options - nullptr, // custom_join_thread_fn + nullptr, // custom_create_thread_fn + nullptr, // custom_thread_creation_options + nullptr, // custom_join_thread_fn #if !defined(ORT_MINIMAL_BUILD) || defined(ORT_MINIMAL_BUILD_CUSTOM_OPS) - {}, // custom_op_libs + {}, // custom_op_libs #endif }; diff --git a/orttraining/orttraining/models/runner/training_util.h b/orttraining/orttraining/models/runner/training_util.h index 8c76ce7e50dc9..1499b30180f61 100644 --- a/orttraining/orttraining/models/runner/training_util.h +++ b/orttraining/orttraining/models/runner/training_util.h @@ -98,7 +98,7 @@ class RandomDataSet : public DataSet { : DataSet(tensor_names), num_samples_(num_samples), tensor_shapes_(tensor_shapes), - tensor_types_(tensor_types){}; + tensor_types_(tensor_types) {}; virtual ~RandomDataSet() {} @@ -189,7 +189,7 @@ class LossScaler { min_loss_scale_(min_loss_scale), max_loss_scale_(max_loss_scale), loss_scale_(loss_scale), - stable_steps_(0){}; + stable_steps_(0) {}; std::string GetLossScaleInputName() const { return loss_scale_input_name_; } diff --git a/orttraining/orttraining/python/orttraining_pybind_state.cc b/orttraining/orttraining/python/orttraining_pybind_state.cc index a81ea76e807ca..b2392b68ac43e 100644 --- 
a/orttraining/orttraining/python/orttraining_pybind_state.cc +++ b/orttraining/orttraining/python/orttraining_pybind_state.cc @@ -319,7 +319,7 @@ void addObjectMethodsForTraining(py::module& m) { auto& pool = onnxruntime::language_interop_ops::torch::OrtTorchFunctionPool::GetInstance(); pool.RegisterForwardRunner(function_address); #else - ORT_UNUSED_PARAMETER(obj); + ORT_UNUSED_PARAMETER(obj); #endif }); m.def("register_backward_runner", [](py::object obj) -> void { @@ -328,7 +328,7 @@ void addObjectMethodsForTraining(py::module& m) { auto& pool = onnxruntime::language_interop_ops::torch::OrtTorchFunctionPool::GetInstance(); pool.RegisterBackwardRunner(function_address); #else - ORT_UNUSED_PARAMETER(obj); + ORT_UNUSED_PARAMETER(obj); #endif }); m.def("register_torch_autograd_function", [](std::string function_full_qual_name, py::object obj) -> void { @@ -336,8 +336,8 @@ void addObjectMethodsForTraining(py::module& m) { auto& pool = onnxruntime::language_interop_ops::torch::OrtTorchFunctionPool::GetInstance(); pool.RegisterTorchAutogradFunction(function_full_qual_name, obj.ptr()); #else - ORT_UNUSED_PARAMETER(function_full_qual_name); - ORT_UNUSED_PARAMETER(obj); + ORT_UNUSED_PARAMETER(function_full_qual_name); + ORT_UNUSED_PARAMETER(obj); #endif }); m.def("register_shape_inference_function", [](std::string function_full_qual_name, py::object obj) -> void { @@ -345,8 +345,8 @@ void addObjectMethodsForTraining(py::module& m) { auto& pool = onnxruntime::language_interop_ops::torch::OrtTorchFunctionPool::GetInstance(); pool.RegisterShapeInferenceFunction(function_full_qual_name, obj.ptr()); #else - ORT_UNUSED_PARAMETER(function_full_qual_name); - ORT_UNUSED_PARAMETER(obj); + ORT_UNUSED_PARAMETER(function_full_qual_name); + ORT_UNUSED_PARAMETER(obj); #endif }); m.def("get_shape_inference_function", [](std::string function_full_qual_name) -> py::object { @@ -368,8 +368,8 @@ void addObjectMethodsForTraining(py::module& m) { auto& pool = onnxruntime::language_interop_ops::torch::OrtTorchFunctionPool::GetInstance(); pool.RegisterInputAliasFunction(function_full_qual_name, obj.ptr()); #else - ORT_UNUSED_PARAMETER(function_full_qual_name); - ORT_UNUSED_PARAMETER(obj); + ORT_UNUSED_PARAMETER(function_full_qual_name); + ORT_UNUSED_PARAMETER(obj); #endif }); m.def("register_miscellaneous_const_input", [](py::object obj) -> void { @@ -377,7 +377,7 @@ void addObjectMethodsForTraining(py::module& m) { auto& pool = onnxruntime::language_interop_ops::torch::OrtTorchFunctionPool::GetInstance(); pool.RegisterMiscellaneousConstInput(obj.ptr()); #else - ORT_UNUSED_PARAMETER(obj); + ORT_UNUSED_PARAMETER(obj); #endif }); m.def("unregister_python_functions", []() -> void { @@ -391,14 +391,14 @@ void addObjectMethodsForTraining(py::module& m) { #ifdef ENABLE_TRAINING_TORCH_INTEROP return true; #else - return false; + return false; #endif }); m.def("is_triton_enabled", []() -> bool { #ifdef ENABLE_TRITON return true; #else - return false; + return false; #endif }); #ifdef ENABLE_TRITON @@ -1036,7 +1036,7 @@ void addObjectMethodsForTraining(py::module& m) { #ifdef __linux__ return true; #else - return false; + return false; #endif }); #endif diff --git a/orttraining/orttraining/python/training/ort_triton/kernel/_mm.py b/orttraining/orttraining/python/training/ort_triton/kernel/_mm.py index a3681a13699a0..1a944082fa4ba 100644 --- a/orttraining/orttraining/python/training/ort_triton/kernel/_mm.py +++ b/orttraining/orttraining/python/training/ort_triton/kernel/_mm.py @@ -372,7 +372,7 @@ def _gen_bmm_module( ) -> 
Tuple[str, ModuleType]: func_name = gen_unique_name("bmm") kwargs = _mm_configs(dtype, m, n, k, trans_a, trans_b, alpha, func_name) - batch = batch_a if batch_a >= batch_b else batch_b + batch = max(batch_a, batch_b) kwargs["stride_aq"] = m * k if batch_a == batch else 0 kwargs["stride_bq"] = k * n if batch_b == batch else 0 kwargs["batch"] = batch diff --git a/orttraining/orttraining/python/training/ortmodule/_utils.py b/orttraining/orttraining/python/training/ortmodule/_utils.py index c299d1c5db4e7..4787cb31a24fd 100644 --- a/orttraining/orttraining/python/training/ortmodule/_utils.py +++ b/orttraining/orttraining/python/training/ortmodule/_utils.py @@ -74,7 +74,7 @@ def _ortvalues_to_torch_tensor( return tuple(C.to_aten_ort_device_tensor(ov) for ov in ortvalues) if not isinstance(ortvalues, C.OrtValueVector): - raise TypeError("ortvalues must be an instance of OrtValueVector not %r." % type(ortvalues)) + raise TypeError(f"ortvalues must be an instance of OrtValueVector not {type(ortvalues)!r}.") res: List[torch.Tensor] = ortvalues.to_dlpacks(_from_dlpack) bool_indices = ortvalues.bool_tensor_indices() diff --git a/orttraining/orttraining/python/training/ortmodule/torch_cpp_extensions/cpu/torch_interop_utils/ctx_pool.h b/orttraining/orttraining/python/training/ortmodule/torch_cpp_extensions/cpu/torch_interop_utils/ctx_pool.h index e7b101d987d7a..b62c2c40c30ee 100644 --- a/orttraining/orttraining/python/training/ortmodule/torch_cpp_extensions/cpu/torch_interop_utils/ctx_pool.h +++ b/orttraining/orttraining/python/training/ortmodule/torch_cpp_extensions/cpu/torch_interop_utils/ctx_pool.h @@ -58,8 +58,8 @@ class PyNodeSharedPointerPool { } private: - PyNodeSharedPointerPool(){}; - ~PyNodeSharedPointerPool(){}; + PyNodeSharedPointerPool() {}; + ~PyNodeSharedPointerPool() {}; PyNodeSharedPointerPool(const PyNodeSharedPointerPool&) = delete; PyNodeSharedPointerPool& operator=(const PyNodeSharedPointerPool&) = delete; diff --git a/orttraining/orttraining/test/distributed/partition_utils.h b/orttraining/orttraining/test/distributed/partition_utils.h index c22d0a3eb2f93..787a001903cce 100644 --- a/orttraining/orttraining/test/distributed/partition_utils.h +++ b/orttraining/orttraining/test/distributed/partition_utils.h @@ -159,7 +159,7 @@ struct PipelineStageNodeGroup { // the consumer nodes of a particular initializer can be more than one, so we need a vector to store those // nodes. std::vector nodes; - PipelineStageNodeGroup(const size_t stage, std::vector& node_group) : stage_id(stage), nodes(std::move(node_group)){}; + PipelineStageNodeGroup(const size_t stage, std::vector& node_group) : stage_id(stage), nodes(std::move(node_group)) {}; }; // This function passes through the given initializer across stages specified in node_groups[i].stage_id. 
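Several of the headers touched above (ctx_pool.h, torch_proxy.h, custom_function_register.h) share a singleton-style layout: an empty constructor and destructor now written with a space before the braces, private construction, and copying either deleted outright or disallowed through a macro. A minimal sketch of that shape follows; the static GetInstance accessor and the class name are assumptions added for illustration, not code from the patch:

#include <iostream>

class SingletonPool {
 public:
  static SingletonPool& GetInstance() {
    static SingletonPool pool;  // constructed once, on first use
    return pool;
  }
  SingletonPool(const SingletonPool&) = delete;             // no copies
  SingletonPool& operator=(const SingletonPool&) = delete;  // no assignment

 private:
  SingletonPool() {};   // empty body, spaced the way clang-format 18 expects
  ~SingletonPool() {};
};

int main() {
  SingletonPool& a = SingletonPool::GetInstance();
  SingletonPool& b = SingletonPool::GetInstance();
  std::cout << (&a == &b) << "\n";  // prints 1: both calls return the same object
  return 0;
}
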
diff --git a/orttraining/orttraining/test/python/orttraining_test_hierarchical_ortmodule.py b/orttraining/orttraining/test/python/orttraining_test_hierarchical_ortmodule.py index 8afbafccb8241..655c9def2c66c 100644 --- a/orttraining/orttraining/test/python/orttraining_test_hierarchical_ortmodule.py +++ b/orttraining/orttraining/test/python/orttraining_test_hierarchical_ortmodule.py @@ -198,7 +198,7 @@ def call_backward(y): y.sum().backward() def call_allclose(y, y_ref): - assert type(y) == type(y_ref) + assert type(y) is type(y_ref) if isinstance(y, Iterable): for ele, ele_ref in zip(y, y_ref): torch.allclose(ele, ele_ref) diff --git a/orttraining/orttraining/test/python/orttraining_test_model_transform.py b/orttraining/orttraining/test/python/orttraining_test_model_transform.py index 095830cd54ab8..6ea81fc6aa089 100644 --- a/orttraining/orttraining/test/python/orttraining_test_model_transform.py +++ b/orttraining/orttraining/test/python/orttraining_test_model_transform.py @@ -77,7 +77,7 @@ def fix_transpose(model): weight = numpy_helper.to_array(t[1]) assert len(weight.shape) == 2 weight = weight.transpose(perm) - new_weight = numpy_helper.from_array(weight, "%s_transposed" % t[1].name) + new_weight = numpy_helper.from_array(weight, f"{t[1].name}_transposed") model.graph.initializer.extend([new_weight]) replace_input_arg(model, node.output[0], new_weight.name) diff --git a/orttraining/orttraining/test/python/orttraining_test_ortmodule_api.py b/orttraining/orttraining/test/python/orttraining_test_ortmodule_api.py index fe59c398d7abb..3615a12705241 100644 --- a/orttraining/orttraining/test/python/orttraining_test_ortmodule_api.py +++ b/orttraining/orttraining/test/python/orttraining_test_ortmodule_api.py @@ -3976,9 +3976,9 @@ def forward(self, input1, bool_argument, int_argument, float_argument): out = self.relu(out) return out - assert type(bool_argument) is bool # noqa: E721 - assert type(int_argument) is int # noqa: E721 - assert type(float_argument) is float # noqa: E721 + assert type(bool_argument) is bool + assert type(int_argument) is int + assert type(float_argument) is float device = "cuda" N, D_in, H, D_out = 32, 784, 500, 10 # noqa: N806 @@ -4014,8 +4014,8 @@ def forward(self, input1, bool_argument): out = self.relu(out) return out - assert type(bool_arguments[0]) is bool # noqa: E721 - assert type(bool_arguments[1]) is bool # noqa: E721 + assert type(bool_arguments[0]) is bool + assert type(bool_arguments[1]) is bool device = "cuda" N, D_in, H, D_out = 32, 784, 500, 10 # noqa: N806 @@ -5501,7 +5501,7 @@ def forward(self, x): return x[: self.dim, :] def random_state_equal(a, b): - assert type(a) == type(b) + assert type(a) is type(b) if isinstance(a, tuple): assert len(a) == len(b) return all([random_state_equal(a_i, b_i) for a_i, b_i in zip(a, b)]) diff --git a/orttraining/orttraining/test/python/orttraining_test_ortmodule_bert_classifier.py b/orttraining/orttraining/test/python/orttraining_test_ortmodule_bert_classifier.py index a1a7d4660f266..41e1e0f5d0d57 100644 --- a/orttraining/orttraining/test/python/orttraining_test_ortmodule_bert_classifier.py +++ b/orttraining/orttraining/test/python/orttraining_test_ortmodule_bert_classifier.py @@ -385,7 +385,7 @@ def main(): # Set log level numeric_level = getattr(logging, args.log_level.upper(), None) if not isinstance(numeric_level, int): - raise ValueError("Invalid log level: %s" % args.log_level) + raise ValueError(f"Invalid log level: {args.log_level}") logging.basicConfig(level=numeric_level) # 2. 
Dataloader diff --git a/orttraining/orttraining/test/python/orttraining_test_ortmodule_bert_classifier_autocast.py b/orttraining/orttraining/test/python/orttraining_test_ortmodule_bert_classifier_autocast.py index 0d5aba1a1a5c4..801eb58727689 100644 --- a/orttraining/orttraining/test/python/orttraining_test_ortmodule_bert_classifier_autocast.py +++ b/orttraining/orttraining/test/python/orttraining_test_ortmodule_bert_classifier_autocast.py @@ -385,7 +385,7 @@ def main(): # Set log level numeric_level = getattr(logging, args.log_level.upper(), None) if not isinstance(numeric_level, int): - raise ValueError("Invalid log level: %s" % args.log_level) + raise ValueError(f"Invalid log level: {args.log_level}") logging.basicConfig(level=numeric_level) # 2. Dataloader diff --git a/orttraining/orttraining/test/python/orttraining_test_ortmodule_deepspeed_zero_stage_1.py b/orttraining/orttraining/test/python/orttraining_test_ortmodule_deepspeed_zero_stage_1.py index 5b28e9c52b480..5e0a4d38b51d6 100644 --- a/orttraining/orttraining/test/python/orttraining_test_ortmodule_deepspeed_zero_stage_1.py +++ b/orttraining/orttraining/test/python/orttraining_test_ortmodule_deepspeed_zero_stage_1.py @@ -219,7 +219,7 @@ def main(): } log_level = log_level_mapping.get(args.log_level.upper(), None) if not isinstance(log_level, LogLevel): - raise ValueError("Invalid log level: %s" % args.log_level) + raise ValueError(f"Invalid log level: {args.log_level}") debug_options = DebugOptions(log_level=log_level, save_onnx=args.export_onnx_graphs, onnx_prefix="MNIST") model = ORTModule(model, debug_options) diff --git a/orttraining/orttraining/test/python/orttraining_test_ortmodule_onnx_ops.py b/orttraining/orttraining/test/python/orttraining_test_ortmodule_onnx_ops.py index e1def2022d63f..537dcd2ccdb09 100644 --- a/orttraining/orttraining/test/python/orttraining_test_ortmodule_onnx_ops.py +++ b/orttraining/orttraining/test/python/orttraining_test_ortmodule_onnx_ops.py @@ -74,11 +74,11 @@ def run_step(model, x): ) onnx_graph_train = ort_model._torch_module._execution_manager._training_manager._onnx_models.optimized_model if debug: - with open("debug_%s_ortmodule_infer.onnx" % name, "wb") as f: + with open(f"debug_{name}_ortmodule_infer.onnx", "wb") as f: f.write(onnx_graph_inf.SerializeToString()) - with open("debug_%s_ortmodule_train.onnx" % name, "wb") as f: + with open(f"debug_{name}_ortmodule_train.onnx", "wb") as f: f.write(onnx_graph_train.SerializeToString()) - self.assertIn('op_type: "%s"' % name, str(onnx_graph_inf)) + self.assertIn(f'op_type: "{name}"', str(onnx_graph_inf)) for onnx_model in [onnx_graph_inf, onnx_graph_train]: for oimp in onnx_model.opset_import: if oimp.domain == "": @@ -86,10 +86,10 @@ def run_step(model, x): if op_grad_type is not None: if isinstance(op_grad_type, tuple): text = str(onnx_graph_train) - if all(map(lambda op: ('op_type: "%s"' % op) not in text, op_grad_type)): + if all(map(lambda op: (f'op_type: "{op}"') not in text, op_grad_type)): raise AssertionError("Operator {} not found in {}.".format(" or ".join(op_grad_type), text)) else: - self.assertIn('op_type: "%s"' % op_grad_type, str(onnx_graph_train)) + self.assertIn(f'op_type: "{op_grad_type}"', str(onnx_graph_train)) def get_torch_model_name(self, name, device): def from_numpy(v, device=None, requires_grad=False): @@ -137,7 +137,7 @@ def forward(self, input1): return TestGatherElement, "GatherElementsGrad", dict(rtol=1e-04, atol=1e-05) - raise AssertionError("Unexpected name=%r." 
% name) + raise AssertionError(f"Unexpected name={name!r}.") def test_onnx_ops(self): for name in ["GatherElements", "Softmax"]: diff --git a/orttraining/orttraining/test/python/orttraining_test_ortmodule_poc.py b/orttraining/orttraining/test/python/orttraining_test_ortmodule_poc.py index d6f84d94c2838..5872a69dde876 100644 --- a/orttraining/orttraining/test/python/orttraining_test_ortmodule_poc.py +++ b/orttraining/orttraining/test/python/orttraining_test_ortmodule_poc.py @@ -201,7 +201,7 @@ def main(): # Set log level numeric_level = getattr(logging, args.log_level.upper(), None) if not isinstance(numeric_level, int): - raise ValueError("Invalid log level: %s" % args.log_level) + raise ValueError(f"Invalid log level: {args.log_level}") logging.basicConfig(level=numeric_level) else: print("Training MNIST on vanilla PyTorch....") diff --git a/orttraining/orttraining/test/python/orttraining_test_utilities.py b/orttraining/orttraining/test/python/orttraining_test_utilities.py index 0892bafcdb95d..faa04f327be7f 100644 --- a/orttraining/orttraining/test/python/orttraining_test_utilities.py +++ b/orttraining/orttraining/test/python/orttraining_test_utilities.py @@ -237,7 +237,7 @@ def test_data_flatten_and_unflatten(input_output_map, flag: int): flatten_schema = input_output_map[2] def _recursive_compare(real, expected): - assert type(real) == type(expected) + assert type(real) is type(expected) if isinstance(real, str): assert real == expected elif isinstance(real, abc.Sequence): @@ -258,7 +258,7 @@ def _recursive_compare(real, expected): out, schema = extract_data_and_schema(raw_data) assert all([torch.allclose(o, d) if isinstance(o, torch.Tensor) else o == d for o, d in zip(out, flatten_data)]) if not isinstance(raw_data, torch.Tensor): - assert type(schema) == type(raw_data) + assert type(schema) is type(raw_data) assert str(schema) == str(flatten_schema) diff --git a/orttraining/orttraining/test/training_ops/function_op_test_utils.cc b/orttraining/orttraining/test/training_ops/function_op_test_utils.cc index 9504ba2c1e69a..3daf6db96e31c 100644 --- a/orttraining/orttraining/test/training_ops/function_op_test_utils.cc +++ b/orttraining/orttraining/test/training_ops/function_op_test_utils.cc @@ -72,7 +72,7 @@ void OpFunctionTester::RunFunctionBodyGraphOnCPU(TwoDArray& results) { } } -OpFunctionTester::~OpFunctionTester(){}; +OpFunctionTester::~OpFunctionTester() {}; template std::unique_ptr CreateOpTester(const onnxruntime::training::OpDef& op_def, diff --git a/orttraining/orttraining/training_ops/cpu/torch/torch_custom_function_kernel.h b/orttraining/orttraining/training_ops/cpu/torch/torch_custom_function_kernel.h index f58cd3ecbaeca..850dc6de735f0 100644 --- a/orttraining/orttraining/training_ops/cpu/torch/torch_custom_function_kernel.h +++ b/orttraining/orttraining/training_ops/cpu/torch/torch_custom_function_kernel.h @@ -25,7 +25,7 @@ class PythonOp final : public OpKernel, public PythonOpBase { // Pytorch's torch.autograd.Function.backward(...) wrapper. 
class PythonOpGrad final : public OpKernel, public PythonOpGradBase { public: - PythonOpGrad(const OpKernelInfo& info) : OpKernel(info), PythonOpGradBase(info){}; + PythonOpGrad(const OpKernelInfo& info) : OpKernel(info), PythonOpGradBase(info) {}; Status Compute(OpKernelContext* context) const override; }; diff --git a/orttraining/orttraining/training_ops/cuda/cuda_training_kernels.cc b/orttraining/orttraining/training_ops/cuda/cuda_training_kernels.cc index bcc9a06f5a250..dac1d7a84b9d9 100644 --- a/orttraining/orttraining/training_ops/cuda/cuda_training_kernels.cc +++ b/orttraining/orttraining/training_ops/cuda/cuda_training_kernels.cc @@ -271,258 +271,258 @@ class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSDomain, 1, Mega Status RegisterCudaTrainingKernels(KernelRegistry& kernel_registry) { static const BuildKernelCreateInfoFn function_table[] = { - BuildKernelCreateInfo, // default entry to avoid the list become empty after ops-reducing - - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - // Adam - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - - // Lamb - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - - BuildKernelCreateInfo, - BuildKernelCreateInfo, - - BuildKernelCreateInfo, - BuildKernelCreateInfo, - - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - // BuildKernelCreateInfo, - BuildKernelCreateInfo, - // BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - 
BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - - // Adam - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - // Lamb - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - - BuildKernelCreateInfo, - - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, + BuildKernelCreateInfo, // default entry to avoid the list become empty after ops-reducing + + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + // Adam + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + + // Lamb + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + + BuildKernelCreateInfo, + BuildKernelCreateInfo, + + BuildKernelCreateInfo, + BuildKernelCreateInfo, + + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + // BuildKernelCreateInfo, + BuildKernelCreateInfo, + // BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + + BuildKernelCreateInfo, + BuildKernelCreateInfo, + 
BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + + // Adam + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + // Lamb + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + + BuildKernelCreateInfo, + + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, // the kernels within the following ifdef are not included in a build with // --enable_training_ops but without --enable_training #ifdef ENABLE_TRAINING // P2P communication operators. 
#if defined(ORT_USE_NCCL) || defined(USE_MPI) - BuildKernelCreateInfo, - BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, #endif #ifdef USE_MPI - BuildKernelCreateInfo, + BuildKernelCreateInfo, #endif - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, #ifdef ENABLE_TRITON - BuildKernelCreateInfo, + BuildKernelCreateInfo, #endif - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, #ifdef ENABLE_TRAINING_TORCH_INTEROP - BuildKernelCreateInfo, - BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, #endif #ifdef ORT_USE_NCCL - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, #endif #endif }; diff --git a/orttraining/orttraining/training_ops/rocm/rocm_training_kernels.cc b/orttraining/orttraining/training_ops/rocm/rocm_training_kernels.cc index 7824e98fe8a53..c570f727f2a92 100644 --- a/orttraining/orttraining/training_ops/rocm/rocm_training_kernels.cc +++ b/orttraining/orttraining/training_ops/rocm/rocm_training_kernels.cc @@ -222,207 +222,207 @@ class ONNX_OPERATOR_KERNEL_CLASS_NAME(kRocmExecutionProvider, kMSDomain, 1, Mega Status RegisterRocmTrainingKernels(KernelRegistry& kernel_registry) { static const BuildKernelCreateInfoFn function_table[] = { - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - // BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - // Adam - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - - // Lamb - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - - BuildKernelCreateInfo, - BuildKernelCreateInfo, - - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - // BuildKernelCreateInfo, - BuildKernelCreateInfo, - // BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - 
BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - // BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - // BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - // BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - // BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - - BuildKernelCreateInfo, - BuildKernelCreateInfo, - - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + // BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + // Adam + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + + // Lamb + BuildKernelCreateInfo, + 
BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + + BuildKernelCreateInfo, + BuildKernelCreateInfo, + + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + // BuildKernelCreateInfo, + BuildKernelCreateInfo, + // BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + // BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + // BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + // BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + // BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + + BuildKernelCreateInfo, + BuildKernelCreateInfo, + + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + 
BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, // P2P communication operators. #if defined(ORT_USE_NCCL) || defined(USE_MPI) - BuildKernelCreateInfo, - BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, #endif #ifdef USE_MPI // BuildKernelCreateInfo, #endif - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, #ifdef ENABLE_TRAINING_TORCH_INTEROP - BuildKernelCreateInfo, - BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, #endif #ifdef ORT_USE_NCCL - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, #endif }; diff --git a/orttraining/tools/scripts/gpt2_model_transform.py b/orttraining/tools/scripts/gpt2_model_transform.py index 294af13fe69b7..50bfda4b407af 100644 --- a/orttraining/tools/scripts/gpt2_model_transform.py +++ b/orttraining/tools/scripts/gpt2_model_transform.py @@ -192,7 +192,7 @@ def fix_transpose(model): weight = numpy_helper.to_array(t[1]) assert len(weight.shape) == 2 weight = weight.transpose(perm) - new_weight = numpy_helper.from_array(weight, "%s_transposed" % t[1].name) + new_weight = numpy_helper.from_array(weight, f"{t[1].name}_transposed") model.graph.initializer.extend([new_weight]) replace_input_arg(model, node.output[0], new_weight.name) diff --git a/orttraining/tools/scripts/model_transform.py b/orttraining/tools/scripts/model_transform.py index 2fb1936ff2184..e87429d10bf88 100644 --- a/orttraining/tools/scripts/model_transform.py +++ b/orttraining/tools/scripts/model_transform.py @@ -227,7 +227,7 @@ def fix_transpose(model): weight = numpy_helper.to_array(t[1]) assert len(weight.shape) == 2 weight = weight.transpose(perm) - new_weight = numpy_helper.from_array(weight, "%s_transposed" % t[1].name) + new_weight = numpy_helper.from_array(weight, f"{t[1].name}_transposed") model.graph.initializer.extend([new_weight]) replace_input_arg(model, node.output[0], new_weight.name) diff --git a/pyproject.toml b/pyproject.toml index 286e4f12721a2..1c3a719fb544a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -84,6 +84,7 @@ ignore = [ "PYI041", # May create confusion "PYI024", # May create confusion "SIM102", # We don't perfer always combining if branches + "SIM103", # Do not collapse if-else "SIM108", # We don't encourage ternary operators "SIM114", # Don't combine if branches for debugability "SIM116", # Don't use dict lookup to replace if-else diff --git a/requirements-lintrunner.txt b/requirements-lintrunner.txt index d19ebe379b50b..7d384f7b1df67 100644 --- a/requirements-lintrunner.txt +++ b/requirements-lintrunner.txt @@ -1,9 +1,9 @@ # This file is auto updated by dependabot -lintrunner-adapters>=0.11.0 +lintrunner-adapters>=0.12.4 # RUFF -ruff==0.3.2 +ruff==0.5.4 # BLACK-ISORT black==24.2.0 -isort==5.12.0 +isort==5.13.2 # CLANGFORMAT -clang-format==17.0.4 +clang-format==18.1.8 diff --git a/tools/ci_build/build.py b/tools/ci_build/build.py index 98d9ba22b7190..587d035541c45 100644 --- a/tools/ci_build/build.py +++ b/tools/ci_build/build.py @@ -65,7 +65,7 @@ def _check_python_version(): def _str_to_bool(s): """Convert string to bool (in argparse context).""" if s.lower() not in ["true", "false"]: - raise ValueError("Need bool; got %r" % s) + raise 
ValueError(f"Need bool; got {s!r}") return {"true": True, "false": False}[s.lower()] @@ -806,7 +806,7 @@ def resolve_executable_path(command_or_path): def get_linux_distro(): try: with open("/etc/os-release") as f: - dist_info = dict(line.strip().split("=", 1) for line in f.readlines()) + dist_info = dict(line.strip().split("=", 1) for line in f) return dist_info.get("NAME", "").strip('"'), dist_info.get("VERSION", "").strip('"') except (OSError, ValueError): return "", "" @@ -1236,7 +1236,7 @@ def generate_build_tree( cmake_args += ["-Donnxruntime_USE_FULL_PROTOBUF=ON", "-DProtobuf_USE_STATIC_LIBS=ON"] if args.use_tvm and args.llvm_path is not None: - cmake_args += ["-DLLVM_DIR=%s" % args.llvm_path] + cmake_args += [f"-DLLVM_DIR={args.llvm_path}"] if args.use_cuda and not is_windows(): nvml_stub_path = cuda_home + "/lib64/stubs" @@ -1452,7 +1452,7 @@ def generate_build_tree( if args.enable_lazy_tensor: import torch - cmake_args += ["-Donnxruntime_PREBUILT_PYTORCH_PATH=%s" % os.path.dirname(torch.__file__)] + cmake_args += [f"-Donnxruntime_PREBUILT_PYTORCH_PATH={os.path.dirname(torch.__file__)}"] cmake_args += ["-D_GLIBCXX_USE_CXX11_ABI=" + str(int(torch._C._GLIBCXX_USE_CXX11_ABI))] if args.use_azure: @@ -1582,7 +1582,7 @@ def generate_build_tree( else: cuda_compile_flags_str = cuda_compile_flags_str + " " + compile_flag if len(cuda_compile_flags_str) != 0: - cudaflags.append('-Xcompiler="%s"' % cuda_compile_flags_str) + cudaflags.append(f'-Xcompiler="{cuda_compile_flags_str}"') elif is_linux() or is_macOS(): if is_linux(): ldflags = ["-Wl,-Bsymbolic-functions", "-Wl,-z,relro", "-Wl,-z,now", "-Wl,-z,noexecstack"] @@ -1650,16 +1650,16 @@ def generate_build_tree( temp_cmake_args = cmake_args.copy() if cflags is not None and cxxflags is not None and len(cflags) != 0 and len(cxxflags) != 0: temp_cmake_args += [ - "-DCMAKE_C_FLAGS=%s" % (" ".join(cflags)), - "-DCMAKE_CXX_FLAGS=%s" % (" ".join(cxxflags)), + "-DCMAKE_C_FLAGS={}".format(" ".join(cflags)), + "-DCMAKE_CXX_FLAGS={}".format(" ".join(cxxflags)), ] if cudaflags is not None and len(cudaflags) != 0: - temp_cmake_args += ["-DCMAKE_CUDA_FLAGS_INIT=%s" % (" ".join(cudaflags))] + temp_cmake_args += ["-DCMAKE_CUDA_FLAGS_INIT={}".format(" ".join(cudaflags))] if ldflags is not None and len(ldflags) != 0: temp_cmake_args += [ - "-DCMAKE_EXE_LINKER_FLAGS_INIT=%s" % (" ".join(ldflags)), - "-DCMAKE_MODULE_LINKER_FLAGS_INIT=%s" % (" ".join(ldflags)), - "-DCMAKE_SHARED_LINKER_FLAGS_INIT=%s" % (" ".join(ldflags)), + "-DCMAKE_EXE_LINKER_FLAGS_INIT={}".format(" ".join(ldflags)), + "-DCMAKE_MODULE_LINKER_FLAGS_INIT={}".format(" ".join(ldflags)), + "-DCMAKE_SHARED_LINKER_FLAGS_INIT={}".format(" ".join(ldflags)), ] run_subprocess( [ diff --git a/tools/ci_build/gen_def.py b/tools/ci_build/gen_def.py index fe47d8dbe57fe..c4add6f0e8910 100755 --- a/tools/ci_build/gen_def.py +++ b/tools/ci_build/gen_def.py @@ -15,11 +15,11 @@ def parse_arguments(): args = parse_arguments() -print("Generating symbol file for %s" % str(args.config)) +print(f"Generating symbol file for {args.config!s}") with open(args.version_file) as f: VERSION_STRING = f.read().strip() -print("VERSION:%s" % VERSION_STRING) +print(f"VERSION:{VERSION_STRING}") symbols = set() for c in args.config: @@ -41,16 +41,16 @@ def parse_arguments(): elif args.style == "xcode": pass # xcode compile don't has any header. 
else: - file.write("VERS_%s {\n" % VERSION_STRING) + file.write(f"VERS_{VERSION_STRING} {{\n") file.write(" global:\n") for symbol in symbols: if args.style == "vc": file.write(" %s @%d\n" % (symbol, symbol_index)) elif args.style == "xcode": - file.write("_%s\n" % symbol) + file.write(f"_{symbol}\n") else: - file.write(" %s;\n" % symbol) + file.write(f" {symbol};\n") symbol_index += 1 if args.style == "gcc": diff --git a/tools/ci_build/reduce_op_kernels.py b/tools/ci_build/reduce_op_kernels.py index 6b73b1e063e58..df6bbf7a4058e 100755 --- a/tools/ci_build/reduce_op_kernels.py +++ b/tools/ci_build/reduce_op_kernels.py @@ -256,7 +256,7 @@ def _generate_type_control_overrides(ort_root: Path, build_dir: Path, cpp_lines: inserted = False with open(src) as input, open(target, "w") as output: inside_insertion_block = False - for line in input.readlines(): + for line in input: if "@@insertion_point_begin(allowed_types)@@" in line: inside_insertion_block = True output.write(line) diff --git a/tools/ci_build/replace_urls_in_deps.py b/tools/ci_build/replace_urls_in_deps.py index ac4f515d5482b..37dad358a6feb 100644 --- a/tools/ci_build/replace_urls_in_deps.py +++ b/tools/ci_build/replace_urls_in_deps.py @@ -53,10 +53,10 @@ def main(): csv_file_path = backup_csv_file_path else: # Make a copy before modifying it - print("Making a copy to %s" % str(backup_csv_file_path)) + print(f"Making a copy to {backup_csv_file_path!s}") shutil.copy(csv_file_path, backup_csv_file_path) - print("Reading from %s" % str(csv_file_path)) + print(f"Reading from {csv_file_path!s}") # Read the whole file into memory first with csv_file_path.open("r", encoding="utf-8") as f: depfile_reader = csv.reader(f, delimiter=";") @@ -69,7 +69,7 @@ def main(): deps.append(Dep(row[0], row[1], row[2])) csv_file_path = Path(REPO_DIR) / "cmake" / "deps.txt" - print("Writing to %s" % str(csv_file_path)) + print(f"Writing to {csv_file_path!s}") # Write updated content back with csv_file_path.open("w", newline="", encoding="utf-8") as f: depfile_writer = csv.writer(f, delimiter=";") diff --git a/tools/ci_build/upload_python_package_to_azure_storage.py b/tools/ci_build/upload_python_package_to_azure_storage.py index b7969f02e518e..16ff5d1f71611 100755 --- a/tools/ci_build/upload_python_package_to_azure_storage.py +++ b/tools/ci_build/upload_python_package_to_azure_storage.py @@ -62,7 +62,7 @@ def upload_whl(python_wheel_path, final_storage=False): with open(download_path_to_html, "w") as f: for item in lines: - f.write("%s\n" % item) + f.write(f"{item}\n") else: warnings.warn(f"'{new_line}' exists in {download_path_to_html}. 
The html file is not updated.") run_subprocess( diff --git a/tools/doc/rename_folders.py b/tools/doc/rename_folders.py index 90d800f2a4498..587755d101ce2 100644 --- a/tools/doc/rename_folders.py +++ b/tools/doc/rename_folders.py @@ -26,7 +26,7 @@ def rename_folder(root): full_into = os.path.join(r, into) if os.path.exists(full_into): raise RuntimeError("%r already exists, previous documentation should be removed.") - print("rename %r" % full_src) + print(f"rename {full_src!r}") os.rename(full_src, full_into) return renamed @@ -51,13 +51,13 @@ def replace_files(root, renamed): for k, v in subs.items(): if k == v: raise ValueError(f"{k!r} == {v!r}") - if ('"%s' % k) in f[0]: - repl.append((f[0], f[0].replace('"%s' % k, '"%s' % v))) - if ("/%s" % k) in f[0]: - repl.append((f[0], f[0].replace("/%s" % k, "/%s" % v))) + if (f'"{k}') in f[0]: + repl.append((f[0], f[0].replace(f'"{k}', f'"{v}'))) + if (f"/{k}") in f[0]: + repl.append((f[0], f[0].replace(f"/{k}", f"/{v}"))) if len(repl) == 0: continue - print("update %r" % full) + print(f"update {full!r}") for k, v in repl: content = content.replace(k, v) with open(full, "w", encoding="utf-8") as f: @@ -71,7 +71,7 @@ def replace_files(root, renamed): root = sys.argv[-1] else: root = "../../build/docs/html" - print("look into %r" % root) + print(f"look into {root!r}") ren = rename_folder(root) if len(ren) == 0: ren = [ diff --git a/tools/nuget/generate_nuspec_for_native_nuget.py b/tools/nuget/generate_nuspec_for_native_nuget.py index 60d1884a9591f..a005bd4c4b89d 100644 --- a/tools/nuget/generate_nuspec_for_native_nuget.py +++ b/tools/nuget/generate_nuspec_for_native_nuget.py @@ -67,7 +67,7 @@ def generate_file_list_for_ep(nuget_artifacts_dir, ep, files_list, include_pdbs, and package_name != "Microsoft.ML.OnnxRuntime.Gpu.Linux" ): files_list.append( - '' % cpu_arch + '' ) for cpu_arch in ["x86_64", "arm64"]: if child.name == get_package_name("osx", cpu_arch, ep, is_training_package): @@ -79,7 +79,7 @@ def generate_file_list_for_ep(nuget_artifacts_dir, ep, files_list, include_pdbs, is_versioned_dylib = re.match(r".*[\.\d+]+\.dylib$", child_file.name) if child_file.is_file() and child_file.suffix == ".dylib" and not is_versioned_dylib: files_list.append( - '' % cpu_arch + '' ) for cpu_arch in ["x64", "aarch64"]: if child.name == get_package_name("linux", cpu_arch, ep, is_training_package): @@ -97,7 +97,7 @@ def generate_file_list_for_ep(nuget_artifacts_dir, ep, files_list, include_pdbs, and package_name != "Microsoft.ML.OnnxRuntime.Gpu.Windows" ): files_list.append( - '' % cpu_arch + '' ) if child.name == "onnxruntime-android" or child.name == "onnxruntime-training-android": diff --git a/tools/python/onnx_test_data_utils.py b/tools/python/onnx_test_data_utils.py index 56485bb78abbd..d50d610a903b7 100644 --- a/tools/python/onnx_test_data_utils.py +++ b/tools/python/onnx_test_data_utils.py @@ -59,7 +59,7 @@ def image_to_numpy(filename, shape, channels_last, add_batch_dim): # target size. 
w_ratio = new_w / w h_ratio = new_h / h - ratio = w_ratio if w_ratio > h_ratio else h_ratio + ratio = max(h_ratio, w_ratio) interim_w = int(w * ratio) interim_h = int(h * ratio) img = img.resize((interim_w, interim_h), PIL.Image.ANTIALIAS) diff --git a/tools/python/util/mobile_helpers/usability_checker.py b/tools/python/util/mobile_helpers/usability_checker.py index 3d8042ad5412b..a8b5021f1387b 100644 --- a/tools/python/util/mobile_helpers/usability_checker.py +++ b/tools/python/util/mobile_helpers/usability_checker.py @@ -29,7 +29,7 @@ def __init__(self, filename): self._ops_seen = set() with open(filename) as f: - for line in f.readlines(): + for line in f: # we're looking for a markdown table with 2 columns. first is op name. second is caveats # op name is domain:op if line.startswith("|"): diff --git a/tools/python/util/reduced_build_config_parser.py b/tools/python/util/reduced_build_config_parser.py index cb90026808fde..be39562e2d60d 100644 --- a/tools/python/util/reduced_build_config_parser.py +++ b/tools/python/util/reduced_build_config_parser.py @@ -113,7 +113,7 @@ def process_non_op_line(line): return False with open(config_file) as config: - for line in [orig_line.strip() for orig_line in config.readlines()]: + for line in [orig_line.strip() for orig_line in config]: if process_non_op_line(line): continue diff --git a/winml/lib/Api.Image/CpuDetensorizer.h b/winml/lib/Api.Image/CpuDetensorizer.h index e175fbbb4b6a3..04d828097ff3b 100644 --- a/winml/lib/Api.Image/CpuDetensorizer.h +++ b/winml/lib/Api.Image/CpuDetensorizer.h @@ -36,7 +36,8 @@ class CpuDetensorizer { auto nominalRangeConverter = NominalRangeConverter(pixelRange); - if (formatFrom == formatTo && (formatFrom == kImageTensorChannelTypeBGR8 || formatFrom == kImageTensorChannelTypeRGB8)) { + if (formatFrom == formatTo && + (formatFrom == kImageTensorChannelTypeBGR8 || formatFrom == kImageTensorChannelTypeRGB8)) { for (uint32_t i = 0; i < tensorHeight; i++) { BYTE* pPixel = pData; @@ -52,7 +53,8 @@ class CpuDetensorizer { pData += bufferWidth; } - } else if ((formatFrom == kImageTensorChannelTypeRGB8 && formatTo == kImageTensorChannelTypeBGR8) || (formatFrom == kImageTensorChannelTypeBGR8 && formatTo == kImageTensorChannelTypeRGB8)) { + } else if ((formatFrom == kImageTensorChannelTypeRGB8 && formatTo == kImageTensorChannelTypeBGR8) || + (formatFrom == kImageTensorChannelTypeBGR8 && formatTo == kImageTensorChannelTypeRGB8)) { for (uint32_t i = 0; i < tensorHeight; i++) { BYTE* pPixel = pData; @@ -68,7 +70,8 @@ class CpuDetensorizer { pData += bufferWidth; } - } else if (formatFrom == kImageTensorChannelTypeGRAY8 && (formatTo == kImageTensorChannelTypeBGR8 || formatTo == kImageTensorChannelTypeRGB8)) { + } else if (formatFrom == kImageTensorChannelTypeGRAY8 && + (formatTo == kImageTensorChannelTypeBGR8 || formatTo == kImageTensorChannelTypeRGB8)) { // just replicate the gray data across each channel for (uint32_t i = 0; i < end; i += bufferWidth) { for (uint32_t j = i; j < i + bytesPerRow; j += 4) { diff --git a/winml/lib/Api.Image/CpuTensorizer.h b/winml/lib/Api.Image/CpuTensorizer.h index ca5773b28fce2..ed9006470fd0e 100644 --- a/winml/lib/Api.Image/CpuTensorizer.h +++ b/winml/lib/Api.Image/CpuTensorizer.h @@ -39,7 +39,8 @@ class CpuTensorizer { auto nominalRangeConverter = NominalRangeConverter(pixelRange); - if (formatFrom == kImageTensorChannelTypeBGR8 && formatTo == kImageTensorChannelTypeBGR8 || formatFrom == kImageTensorChannelTypeRGB8 && formatTo == kImageTensorChannelTypeRGB8) { + if (formatFrom == 
kImageTensorChannelTypeBGR8 && formatTo == kImageTensorChannelTypeBGR8 || + formatFrom == kImageTensorChannelTypeRGB8 && formatTo == kImageTensorChannelTypeRGB8) { // Convert BGR8 -> BGR8 or RGB8 -> RGB8 for (uint64_t y = 0; y < yElements; y++) { DeinterleaveRowByteToFloat( @@ -52,7 +53,8 @@ class CpuTensorizer { nominalRangeConverter ); } - } else if (formatFrom == kImageTensorChannelTypeBGR8 && formatTo == kImageTensorChannelTypeRGB8 || formatFrom == kImageTensorChannelTypeRGB8 && formatTo == kImageTensorChannelTypeBGR8) { + } else if (formatFrom == kImageTensorChannelTypeBGR8 && formatTo == kImageTensorChannelTypeRGB8 || + formatFrom == kImageTensorChannelTypeRGB8 && formatTo == kImageTensorChannelTypeBGR8) { // Convert RGB8 -> BGR8 or BGR8 -> RGB8 for (uint32_t y = 0; y < yElements; y++) { DeinterleaveRowByteToFloat( @@ -65,7 +67,8 @@ class CpuTensorizer { nominalRangeConverter ); } - } else if (formatTo == kImageTensorChannelTypeGRAY8 && (formatFrom == kImageTensorChannelTypeBGR8 || formatFrom == kImageTensorChannelTypeRGB8)) { + } else if (formatTo == kImageTensorChannelTypeGRAY8 && + (formatFrom == kImageTensorChannelTypeBGR8 || formatFrom == kImageTensorChannelTypeRGB8)) { // Convert BGR8 -> GRAY8 or RGB8 -> GRAY8 uint32_t blueIncrement = formatFrom == kImageTensorChannelTypeBGR8 ? 0 : 2; uint32_t redIncrement = formatFrom == kImageTensorChannelTypeBGR8 ? 2 : 0; @@ -80,7 +83,8 @@ class CpuTensorizer { pixelInd++; } } - } else if (formatFrom == kImageTensorChannelTypeGRAY8 && (formatTo == kImageTensorChannelTypeBGR8 || formatTo == kImageTensorChannelTypeRGB8)) { + } else if (formatFrom == kImageTensorChannelTypeGRAY8 && + (formatTo == kImageTensorChannelTypeBGR8 || formatTo == kImageTensorChannelTypeRGB8)) { // Convert GRAY8 -> BGR8 or GRAY8 -> RGB8 for (UINT32 i = start; i < end; i += bufferWidth) { for (UINT32 j = i; j < i + bytesPerRow; j += bytesPerPixel) { diff --git a/winml/lib/Api.Image/D3DDeviceCache.cpp b/winml/lib/Api.Image/D3DDeviceCache.cpp index 977f2ba75216a..549a7bba77ef6 100644 --- a/winml/lib/Api.Image/D3DDeviceCache.cpp +++ b/winml/lib/Api.Image/D3DDeviceCache.cpp @@ -349,7 +349,8 @@ ID3D12RootSignature* D3DDeviceCache::GetTensorizeRootSignature() { newRootSignature->SetName(L"Tensorize Rootsignature"); } - if (InterlockedCompareExchangePointer(tensorize_root_signature_.put_void(), newRootSignature.get(), nullptr) == nullptr) { + if (InterlockedCompareExchangePointer(tensorize_root_signature_.put_void(), newRootSignature.get(), nullptr) == + nullptr) { // This thread won the race and just cached the PSO newRootSignature.detach(); } @@ -401,7 +402,8 @@ ID3D12RootSignature* D3DDeviceCache::GetDetensorizeRootSignature() { newRootSignature->SetName(L"Detensorize Rootsignature"); } - if (InterlockedCompareExchangePointer(detensorize_root_signature_.put_void(), newRootSignature.get(), nullptr) == nullptr) { + if (InterlockedCompareExchangePointer(detensorize_root_signature_.put_void(), newRootSignature.get(), nullptr) == + nullptr) { // This thread won the race and just cached the PSO newRootSignature.detach(); } @@ -416,7 +418,8 @@ ID3D12PipelineState* D3DDeviceCache::GetCachedPipelineState( PipelineStateCacheFormat formatTo, PipelineStateCacheOperation operation ) { - if (cached_pipeline_state[static_cast(type)][static_cast(formatFrom)][static_cast(formatTo)][static_cast(operation)] == nullptr) { + if (cached_pipeline_state[static_cast(type)][static_cast(formatFrom)][static_cast(formatTo)] + [static_cast(operation)] == nullptr) { winrt::com_ptr newPSO; if (operation == 
PipelineStateCacheOperation::kTensorize) { newPSO.attach(CreateTensorizePipelineState(type, formatFrom, formatTo)); @@ -425,12 +428,12 @@ ID3D12PipelineState* D3DDeviceCache::GetCachedPipelineState( } if (InterlockedCompareExchangePointer( - cached_pipeline_state[static_cast(type)][static_cast(formatFrom)][static_cast(formatTo)] - [static_cast(operation)] - .put_void(), - newPSO.get(), - nullptr - ) == nullptr) { + cached_pipeline_state[static_cast(type)][static_cast(formatFrom)][static_cast(formatTo)] + [static_cast(operation)] + .put_void(), + newPSO.get(), + nullptr + ) == nullptr) { // This thread won the race and just cached the PSO newPSO.detach(); } @@ -653,7 +656,8 @@ ID3D12Resource* D3DDeviceCache::GetDetensorizeVertexBuffer(_Out_ UINT* vertexBuf memcpy(pVertexDataBegin, triangleVertices, sizeof(triangleVertices)); newResource->Unmap(0, nullptr); - if (InterlockedCompareExchangePointer(detensorize_vertex_buffer_.put_void(), newResource.get(), nullptr) == nullptr) { + if (InterlockedCompareExchangePointer(detensorize_vertex_buffer_.put_void(), newResource.get(), nullptr) == + nullptr) { // This thread won the race and just cached the PSO newResource.detach(); } diff --git a/winml/lib/Api.Image/EventTimer.h b/winml/lib/Api.Image/EventTimer.h index 3620a7a2c0ee1..590675646b70d 100644 --- a/winml/lib/Api.Image/EventTimer.h +++ b/winml/lib/Api.Image/EventTimer.h @@ -4,7 +4,9 @@ class EventTimer { public: bool Start() { auto now = std::chrono::high_resolution_clock::now(); - if (!_started || std::chrono::duration_cast(now - _startTime).count() > _kDurationBetweenSendingEvents) { + if (!_started || + std::chrono::duration_cast(now - _startTime).count() > + _kDurationBetweenSendingEvents) { _started = true; _startTime = std::chrono::high_resolution_clock::now(); return true; diff --git a/winml/lib/Api.Image/ImageConversionHelpers.cpp b/winml/lib/Api.Image/ImageConversionHelpers.cpp index 11434c5fffb8e..441413bface28 100644 --- a/winml/lib/Api.Image/ImageConversionHelpers.cpp +++ b/winml/lib/Api.Image/ImageConversionHelpers.cpp @@ -69,7 +69,8 @@ void _winmli::ConvertVideoFrameToVideoFrame( wgdx::Direct3D11::IDirect3DSurface spInputDirect3DSurface = inputVideoFrame.Direct3DSurface(); // only one of softwarebitmap or direct3Dsurface should be non-null - if ((spInputSoftwareBitmap == nullptr && spInputDirect3DSurface == nullptr) || (spInputSoftwareBitmap != nullptr && spInputDirect3DSurface != nullptr)) { + if ((spInputSoftwareBitmap == nullptr && spInputDirect3DSurface == nullptr) || + (spInputSoftwareBitmap != nullptr && spInputDirect3DSurface != nullptr)) { WINML_THROW_HR(E_INVALIDARG); } @@ -133,11 +134,9 @@ bool _winmli::NeedsVideoFrameConversion( if (FAILED((hr = GetVideoFrameInfo(inputVideoFrame, format, width, height, luid)))) { bNeedConversion = true; - } else if (((int)inputBounds.Width != outputWidth) || - (inputBounds.X != 0) || - ((int)inputBounds.Height != outputHeight) || - (inputBounds.Y != 0) || - (inputVideoFrame == nullptr)) // Check crop + } else if (((int)inputBounds.Width != outputWidth) || (inputBounds.X != 0) || + ((int)inputBounds.Height != outputHeight) || (inputBounds.Y != 0) || + (inputVideoFrame == nullptr)) // Check crop { bNeedConversion = true; } else if (luid.HighPart != outputLuid.HighPart || luid.LowPart != outputLuid.LowPart) { diff --git a/winml/lib/Api.Image/ImageConverter.cpp b/winml/lib/Api.Image/ImageConverter.cpp index bb97f0ec7ff34..84b6f5a3a4c5c 100644 --- a/winml/lib/Api.Image/ImageConverter.cpp +++ b/winml/lib/Api.Image/ImageConverter.cpp @@ -50,7 
+50,8 @@ ComPtr ImageConverter::FetchOrCreateFenceOnDevice( ComPtr fence; UINT comPtrSize = static_cast(sizeof(fence.GetAddressOf())); - if (FAILED(pD3D11Device->GetPrivateData(device_cache.GetFenceGuid(), &comPtrSize, fence.GetAddressOf())) || fence.Get() == nullptr) { + if (FAILED(pD3D11Device->GetPrivateData(device_cache.GetFenceGuid(), &comPtrSize, fence.GetAddressOf())) || + fence.Get() == nullptr) { // There's no fence on the device, so create a new one ComPtr spD3D11Device5; WINML_THROW_IF_FAILED(pD3D11Device->QueryInterface(IID_PPV_ARGS(&spD3D11Device5))); diff --git a/winml/lib/Api.Image/TensorToVideoFrameConverter.cpp b/winml/lib/Api.Image/TensorToVideoFrameConverter.cpp index 76a5623c5b4a5..456931d21e0a8 100644 --- a/winml/lib/Api.Image/TensorToVideoFrameConverter.cpp +++ b/winml/lib/Api.Image/TensorToVideoFrameConverter.cpp @@ -196,8 +196,9 @@ void TensorToVideoFrameConverter::DX12TensorToVideoFrame( UINT comPtrSize = static_cast(sizeof(spSharedD3D11Texture.GetAddressOf())); UINT handleSize = static_cast(sizeof(sharedHandle)); - if ((FAILED(spVideoFrameTexture->GetPrivateData( - _d3d11TextureGUID, &comPtrSize, spSharedD3D11Texture.GetAddressOf())) || + if ((FAILED( + spVideoFrameTexture->GetPrivateData(_d3d11TextureGUID, &comPtrSize, spSharedD3D11Texture.GetAddressOf()) + ) || !spSharedD3D11Texture.Get()) || (FAILED(spVideoFrameTexture->GetPrivateData(_handleGUID, &handleSize, &sharedHandle)) || sharedHandle != shared_handle_)) { @@ -365,7 +366,8 @@ void TensorToVideoFrameConverter::SoftwareTensorToVideoFrame( wgdx::Direct3D11::IDirect3DSurface spOutputSurface = pDestVideoFrame.Direct3DSurface(); // only one of softwarebitmap or direct3Dsurface should be non-null - if ((spOutputSoftwareBitmap == nullptr && spOutputSurface == nullptr) || (spOutputSoftwareBitmap != nullptr && spOutputSurface != nullptr)) { + if ((spOutputSoftwareBitmap == nullptr && spOutputSurface == nullptr) || + (spOutputSoftwareBitmap != nullptr && spOutputSurface != nullptr)) { WINML_THROW_HR(E_INVALIDARG); } if (spOutputSoftwareBitmap) { @@ -381,7 +383,10 @@ void TensorToVideoFrameConverter::SoftwareTensorToVideoFrame( if (_winmli::NeedsVideoFrameConversion( pDestVideoFrame, {}, {0, 0, (UINT32)tensorWidth, (UINT32)tensorHeight}, tensorWidth, tensorHeight )) { - if (converted_video_frame_ == nullptr || _winmli::NeedsVideoFrameConversion(converted_video_frame_, {}, {0, 0, (UINT32)tensorWidth, (UINT32)tensorHeight}, tensorWidth, tensorHeight)) { + if (converted_video_frame_ == nullptr || + _winmli::NeedsVideoFrameConversion( + converted_video_frame_, {}, {0, 0, (UINT32)tensorWidth, (UINT32)tensorHeight}, tensorWidth, tensorHeight + )) { converted_video_frame_ = wm::VideoFrame::CreateWithSoftwareBitmap( wgi::SoftwareBitmap(wgi::BitmapPixelFormat::Bgra8, tensorWidth, tensorHeight) ); diff --git a/winml/lib/Api.Image/VideoFrameToTensorConverter.cpp b/winml/lib/Api.Image/VideoFrameToTensorConverter.cpp index 0a763c77c94f4..a9b507ae4e16f 100644 --- a/winml/lib/Api.Image/VideoFrameToTensorConverter.cpp +++ b/winml/lib/Api.Image/VideoFrameToTensorConverter.cpp @@ -138,14 +138,19 @@ void VideoFrameToTensorConverter::VideoFrameToSoftwareTensor( wgdx::Direct3D11::IDirect3DSurface spInputSurface = inputVideoFrame.Direct3DSurface(); // only one of softwarebitmap or direct3Dsurface should be non-null - if ((spInputSoftwareBitmap == nullptr && spInputSurface == nullptr) || (spInputSoftwareBitmap != nullptr && spInputSurface != nullptr)) { + if ((spInputSoftwareBitmap == nullptr && spInputSurface == nullptr) || + 
(spInputSoftwareBitmap != nullptr && spInputSurface != nullptr)) { WINML_THROW_IF_FAILED(E_INVALIDARG); } UINT32 tensorHeight = static_cast(tensorDesc.sizes[2]); UINT32 tensorWidth = static_cast(tensorDesc.sizes[3]); - if (spInputSurface || _winmli::NeedsVideoFrameConversion(inputVideoFrame, {}, inputBounds, tensorWidth, tensorHeight)) { - if (converted_video_frame_ == nullptr || _winmli::NeedsVideoFrameConversion(converted_video_frame_, {}, {0, 0, (UINT32)tensorWidth, (UINT32)tensorHeight}, tensorWidth, tensorHeight)) { + if (spInputSurface || + _winmli::NeedsVideoFrameConversion(inputVideoFrame, {}, inputBounds, tensorWidth, tensorHeight)) { + if (converted_video_frame_ == nullptr || + _winmli::NeedsVideoFrameConversion( + converted_video_frame_, {}, {0, 0, (UINT32)tensorWidth, (UINT32)tensorHeight}, tensorWidth, tensorHeight + )) { converted_video_frame_ = wm::VideoFrame::CreateWithSoftwareBitmap( wgi::SoftwareBitmap(wgi::BitmapPixelFormat::Bgra8, tensorWidth, tensorHeight) ); @@ -236,8 +241,8 @@ void VideoFrameToTensorConverter::VideoFrameToDX12Tensor( // TODO: Scale during the tensorization phase instead of using the video frame pipeline when the input bounds are not the same size as the tensor if (!_winmli::DirectXPixelFormatSupported(spDirect3DSurface.Description().Format) || - static_cast(inputBounds.Width) != tensorDesc.sizes[3] || - static_cast(inputBounds.Height) != tensorDesc.sizes[2]) { + static_cast(inputBounds.Width) != tensorDesc.sizes[3] || + static_cast(inputBounds.Height) != tensorDesc.sizes[2]) { // Force the VideoFrame to not do a conversion if the format is supported since we do it during the tensorization anyway wgdx::DirectXPixelFormat newFormat = _winmli::DirectXPixelFormatSupported(spDirect3DSurface.Description().Format) ? 
spDirect3DSurface.Description().Format @@ -269,7 +274,7 @@ void VideoFrameToTensorConverter::VideoFrameToDX12Tensor( D3D11_cached_texture_->GetDesc(&cachedTextureDesc); if (cachedTextureDesc.Width != scaledBounds.Width || cachedTextureDesc.Height != scaledBounds.Height || - cachedTextureDesc.Format != videoFrameTextureDesc.Format) { + cachedTextureDesc.Format != videoFrameTextureDesc.Format) { // The dimensions or format don't match, so we need to re-create our texture WINML_THROW_IF_FAILED( pDeviceCache->GetD3D11Device()->CreateTexture2D(&videoFrameTextureDesc, nullptr, &D3D11_cached_texture_) @@ -289,12 +294,12 @@ void VideoFrameToTensorConverter::VideoFrameToDX12Tensor( UINT comPtrSize = static_cast(sizeof(spSharedD3D11Texture.GetAddressOf())); UINT handleSize = static_cast(sizeof(sharedHandle)); - if ((FAILED(spVideoFrameTexture->GetPrivateData( - d3d11_texture_GUID_, &comPtrSize, spSharedD3D11Texture.GetAddressOf() - )) || - !spSharedD3D11Texture.Get()) || - (FAILED(spVideoFrameTexture->GetPrivateData(handle_GUID_, &handleSize, &sharedHandle)) || - sharedHandle != shared_handle_)) { + if ((FAILED( + spVideoFrameTexture->GetPrivateData(d3d11_texture_GUID_, &comPtrSize, spSharedD3D11Texture.GetAddressOf()) + ) || + !spSharedD3D11Texture.Get()) || + (FAILED(spVideoFrameTexture->GetPrivateData(handle_GUID_, &handleSize, &sharedHandle)) || + sharedHandle != shared_handle_)) { // Create a new shared texture that we cache on the video frame texture WINML_THROW_IF_FAILED(spTextureDevice->CreateTexture2D(&videoFrameTextureDesc, nullptr, &spSharedD3D11Texture)); @@ -423,9 +428,9 @@ void VideoFrameToTensorConverter::ConvertDX12TextureToGPUTensor( WINML_THROW_IF_FAILED(ULongLongMult(ullNumElementsTensor, uiTensorElementSize, &ullTensorSize)); if (outputDesc.Width < ullTensorSize || outputDesc.Height != 1 || - outputDesc.Dimension != D3D12_RESOURCE_DIMENSION_BUFFER || - !(outputDesc.Flags & D3D12_RESOURCE_FLAG_ALLOW_UNORDERED_ACCESS) || - outputHeapProperties.Type != D3D12_HEAP_TYPE_DEFAULT) { + outputDesc.Dimension != D3D12_RESOURCE_DIMENSION_BUFFER || + !(outputDesc.Flags & D3D12_RESOURCE_FLAG_ALLOW_UNORDERED_ACCESS) || + outputHeapProperties.Type != D3D12_HEAP_TYPE_DEFAULT) { WINML_THROW_IF_FAILED(E_INVALIDARG); } } @@ -565,7 +570,8 @@ void VideoFrameToTensorConverter::ConvertSoftwareBitmapToGPUTensor( wgi::BitmapBounds scaledBounds = inputBounds; // TODO: Scale during the tensorization phase instead of using the video frame pipeline when the input bounds are not the same size as the tensor - if (static_cast(inputBounds.Width) != tensorDesc.sizes[3] || static_cast(inputBounds.Height) != tensorDesc.sizes[2]) { + if (static_cast(inputBounds.Width) != tensorDesc.sizes[3] || + static_cast(inputBounds.Height) != tensorDesc.sizes[2]) { scaledBounds = {0, 0, static_cast(tensorDesc.sizes[3]), static_cast(tensorDesc.sizes[2])}; // Force the VideoFrame to not do a conversion if the format is supported since we do it during the tensorization anyway diff --git a/winml/lib/Api.Image/inc/ConverterResourceStore.h b/winml/lib/Api.Image/inc/ConverterResourceStore.h index ffb413e0b92f3..24406c9fdaaef 100644 --- a/winml/lib/Api.Image/inc/ConverterResourceStore.h +++ b/winml/lib/Api.Image/inc/ConverterResourceStore.h @@ -25,7 +25,7 @@ struct ConverterResourceDescription { // 2) the resources are on different devices // 3) the resources have different pixel formats if (desc.width != width || desc.height != height || desc.luid.HighPart != luid.HighPart || - desc.luid.LowPart != luid.LowPart || desc.pixel_format != 
pixel_format) { + desc.luid.LowPart != luid.LowPart || desc.pixel_format != pixel_format) { return false; } diff --git a/winml/lib/Api/FeatureValues.h b/winml/lib/Api/FeatureValues.h index a330b244d40fc..fe6429f0a421b 100644 --- a/winml/lib/Api/FeatureValues.h +++ b/winml/lib/Api/FeatureValues.h @@ -29,37 +29,37 @@ #include "ImageFeatureValue.h" // CREATE_TENSOR is used by data tensor types to implement common functionality -#define CREATE_TENSOR(type, element_type, element_view_type) \ - namespace WINMLP { \ - struct type : public _winml::TensorBase< \ - element_type, \ - element_view_type, \ - type, \ - I##type, \ - type##T> { \ - using Base = TensorBase< \ - element_type, \ - element_view_type, \ - type, \ - I##type, \ - type##T>; \ - \ - type() = default; \ - \ - type(wfc::IIterable const& shape) : Base(shape){}; \ - \ - type(std::vector const& shape) : Base(shape){}; \ - \ - type(std::vector const& shape, ID3D12Resource* pResource) : Base(shape, pResource){}; \ - }; \ - } \ - namespace WINML::factory_implementation { \ - struct type : type##T { \ - STDMETHOD(CreateFromD3D12Resource) \ - (ID3D12Resource * value, __int64* shape, int shapeSize, IUnknown** result) { \ - return winmlp::type::CreateFromD3D12Resource(value, shape, shapeSize, result); \ - } \ - }; \ +#define CREATE_TENSOR(type, element_type, element_view_type) \ + namespace WINMLP { \ + struct type : public _winml::TensorBase< \ + element_type, \ + element_view_type, \ + type, \ + I##type, \ + type##T> { \ + using Base = TensorBase< \ + element_type, \ + element_view_type, \ + type, \ + I##type, \ + type##T>; \ + \ + type() = default; \ + \ + type(wfc::IIterable const& shape) : Base(shape) {}; \ + \ + type(std::vector const& shape) : Base(shape) {}; \ + \ + type(std::vector const& shape, ID3D12Resource* pResource) : Base(shape, pResource) {}; \ + }; \ + } \ + namespace WINML::factory_implementation { \ + struct type : type##T { \ + STDMETHOD(CreateFromD3D12Resource) \ + (ID3D12Resource * value, __int64* shape, int shapeSize, IUnknown** result) { \ + return winmlp::type::CreateFromD3D12Resource(value, shape, shapeSize, result); \ + } \ + }; \ } CREATE_TENSOR(TensorBoolean, bool, bool) @@ -86,11 +86,11 @@ CREATE_TENSOR(TensorString, std::string, winrt::hstring) #pragma warning(pop) // CREATE_MAP is used by map types to implement common functionality -#define CREATE_MAP(type, key_type, value_type) \ - namespace WINMLP { \ - struct type : public _winml::MapBase { \ - type(wfc::IMap const& data) : MapBase(data){}; \ - }; \ +#define CREATE_MAP(type, key_type, value_type) \ + namespace WINMLP { \ + struct type : public _winml::MapBase { \ + type(wfc::IMap const& data) : MapBase(data) {}; \ + }; \ } CREATE_MAP(MapInt64BitToInt64Bit, int64_t, int64_t) @@ -103,11 +103,11 @@ CREATE_MAP(MapStringToDouble, hstring, double) CREATE_MAP(MapStringToString, hstring, hstring) // CREATE_SEQUENCE is used by sequence types to implement common functionality -#define CREATE_SEQUENCE(type, element_type, raw_type) \ - namespace WINMLP { \ - struct type : public _winml::SequenceBase { \ - type(wfc::IIterable const& data) : SequenceBase(data){}; \ - }; \ +#define CREATE_SEQUENCE(type, element_type, raw_type) \ + namespace WINMLP { \ + struct type : public _winml::SequenceBase { \ + type(wfc::IIterable const& data) : SequenceBase(data) {}; \ + }; \ } using AbiMapStringFloat = wfc::IMap; diff --git a/winml/lib/Api/ImageFeatureValue.cpp b/winml/lib/Api/ImageFeatureValue.cpp index 8628c578e5004..65f2e56180e19 100644 --- a/winml/lib/Api/ImageFeatureValue.cpp 
+++ b/winml/lib/Api/ImageFeatureValue.cpp @@ -221,7 +221,9 @@ static _winml::ImageTensorDescription CreateImageTensorDescriptor( THROW_HR(E_NOTIMPL); } - if (pixelRange != winml::LearningModelPixelRange::ZeroTo255 && pixelRange != winml::LearningModelPixelRange::ZeroToOne && pixelRange != winml::LearningModelPixelRange::MinusOneToOne) { + if (pixelRange != winml::LearningModelPixelRange::ZeroTo255 && + pixelRange != winml::LearningModelPixelRange::ZeroToOne && + pixelRange != winml::LearningModelPixelRange::MinusOneToOne) { THROW_HR(E_NOTIMPL); } @@ -331,12 +333,11 @@ std::optional ImageFeatureValue::GetIn // The the widths and heights of input data must be the same. Or the // tensorDescriptor cannot describ the shape of the inputs. if (spImageDescriptor->Width() == MAXUINT32 && - !(std::adjacent_find(m_widths.begin(), m_widths.end(), std::not_equal_to()) == m_widths.end())) { + !(std::adjacent_find(m_widths.begin(), m_widths.end(), std::not_equal_to()) == m_widths.end())) { THROW_HR(E_INVALIDARG); } if (spImageDescriptor->Height() == MAXUINT32 && - !(std::adjacent_find(m_heights.begin(), m_heights.end(), std::not_equal_to()) == m_heights.end() - )) { + !(std::adjacent_find(m_heights.begin(), m_heights.end(), std::not_equal_to()) == m_heights.end())) { THROW_HR(E_INVALIDARG); } descriptorWidth = (spImageDescriptor->Width() == MAXUINT32) ? m_widths[0] : spImageDescriptor->Width(); @@ -354,12 +355,11 @@ std::optional ImageFeatureValue::GetIn return {}; } if (-1 == shape.GetAt(3) && - !(std::adjacent_find(m_widths.begin(), m_widths.end(), std::not_equal_to()) == m_widths.end())) { + !(std::adjacent_find(m_widths.begin(), m_widths.end(), std::not_equal_to()) == m_widths.end())) { THROW_HR(E_INVALIDARG); } if (-1 == shape.GetAt(2) && - !(std::adjacent_find(m_heights.begin(), m_heights.end(), std::not_equal_to()) == m_heights.end() - )) { + !(std::adjacent_find(m_heights.begin(), m_heights.end(), std::not_equal_to()) == m_heights.end())) { THROW_HR(E_INVALIDARG); } descriptorWidth = (-1 == shape.GetAt(3)) ? m_widths[0] : static_cast(shape.GetAt(3)); diff --git a/winml/lib/Api/LearningModel.cpp b/winml/lib/Api/LearningModel.cpp index 6d7c8317ce5f9..8de14a5dfce10 100644 --- a/winml/lib/Api/LearningModel.cpp +++ b/winml/lib/Api/LearningModel.cpp @@ -64,7 +64,7 @@ LearningModel::LearningModel(const hstring& path, const winml::ILearningModelOpe WINML_THROW_IF_FAILED(CreateOnnxruntimeEngineFactory(engine_factory_.put())); - wil::unique_handle file_handle { + wil::unique_handle file_handle{ #if WINVER >= _WIN32_WINNT_WIN8 CreateFile2(path.c_str(), GENERIC_READ, FILE_SHARE_READ, OPEN_EXISTING, NULL) }; diff --git a/winml/lib/Api/LearningModelSession.cpp b/winml/lib/Api/LearningModelSession.cpp index 011a4a718f82a..57bafda57fe54 100644 --- a/winml/lib/Api/LearningModelSession.cpp +++ b/winml/lib/Api/LearningModelSession.cpp @@ -21,8 +21,8 @@ static const auto c_enable_debug_output = L"EnableDebugOutput"; namespace guid_details { // This GUID is to be used for delimiting ML-related categories of capturable work. 
// {D113B493-BBA2-4993-8608-D706A73B91CE} -struct __declspec(uuid("D113B493-BBA2-4993-8608-D706A73B91CE")) __declspec(novtable -) WINML_PIX_EVAL_CAPTURABLE_WORK_GUID {}; +struct __declspec(uuid("D113B493-BBA2-4993-8608-D706A73B91CE")) +__declspec(novtable) WINML_PIX_EVAL_CAPTURABLE_WORK_GUID {}; } // namespace guid_details static const GUID WINML_PIX_EVAL_CAPTURABLE_WORK_GUID = __uuidof(guid_details::WINML_PIX_EVAL_CAPTURABLE_WORK_GUID); diff --git a/winml/lib/Api/NumericData.cpp b/winml/lib/Api/NumericData.cpp index ae5f9155d425c..1e3ba5438c10a 100644 --- a/winml/lib/Api/NumericData.cpp +++ b/winml/lib/Api/NumericData.cpp @@ -68,9 +68,7 @@ gsl::span numeric_data::buffer(bool should_sync_buffer) { } auto span = combined_buffer(); if (should_sync_buffer) { - _winml::LoadSpanFromDisjointBuffers( - buffers_.size(), [this](size_t i) { return buffer_at(i); }, span - ); + _winml::LoadSpanFromDisjointBuffers(buffers_.size(), [this](size_t i) { return buffer_at(i); }, span); } return span; @@ -80,9 +78,7 @@ bool numeric_data::flush() { auto should_flush = buffers_.size() != 1; if (should_flush) { auto span = combined_buffer(); - _winml::StoreSpanIntoDisjointBuffers( - buffers_.size(), [this](size_t i) { return buffer_at(i); }, span - ); + _winml::StoreSpanIntoDisjointBuffers(buffers_.size(), [this](size_t i) { return buffer_at(i); }, span); } return should_flush; } @@ -97,9 +93,7 @@ void numeric_data::set(size_t data_size, const byte* data) { ); gsl::span span(const_cast(data), data_size); - _winml::StoreSpanIntoDisjointBuffers( - buffers_.size(), [this](size_t i) { return buffer_at(i); }, span - ); + _winml::StoreSpanIntoDisjointBuffers(buffers_.size(), [this](size_t i) { return buffer_at(i); }, span); } static gsl::span get_span_from_ibuffer(wss::IBuffer buffer) { diff --git a/winml/lib/Api/impl/FeatureCompatibility.h b/winml/lib/Api/impl/FeatureCompatibility.h index 3fff488be23f7..1b124097f3f80 100644 --- a/winml/lib/Api/impl/FeatureCompatibility.h +++ b/winml/lib/Api/impl/FeatureCompatibility.h @@ -375,11 +375,11 @@ static void (*FeatureKindCompatibilityMatrix[4][4])( ) = { // Tensor, Sequence, Map, Image /* Tensor */ {verify, not_compatible, not_compatible, verify}, - /* Sequence */ + /* Sequence */ {not_compatible, verify, not_compatible, not_compatible}, - /* Map */ + /* Map */ {not_compatible, not_compatible, verify, not_compatible}, - /* Image */ + /* Image */ {verify, not_compatible, not_compatible, verify} }; } // namespace compatibility_details diff --git a/winml/lib/Common/CommonDeviceHelpers.cpp b/winml/lib/Common/CommonDeviceHelpers.cpp index 01615005a8947..b4ada6c498212 100644 --- a/winml/lib/Common/CommonDeviceHelpers.cpp +++ b/winml/lib/Common/CommonDeviceHelpers.cpp @@ -65,8 +65,10 @@ HRESULT GetDXCoreAdapterMetadata( RETURN_IF_FAILED(spFactory->GetAdapterByLuid(device.GetAdapterLuid(), IID_PPV_ARGS(spAdapter.put()))); if (spAdapter->IsAttributeSupported(DXCORE_ADAPTER_ATTRIBUTE_D3D12_CORE_COMPUTE) && - (!(spAdapter->IsAttributeSupported(DXCORE_ADAPTER_ATTRIBUTE_D3D12_GRAPHICS) || - spAdapter->IsAttributeSupported(DXCORE_ADAPTER_ATTRIBUTE_D3D11_GRAPHICS)))) { + (!( + spAdapter->IsAttributeSupported(DXCORE_ADAPTER_ATTRIBUTE_D3D12_GRAPHICS) || + spAdapter->IsAttributeSupported(DXCORE_ADAPTER_ATTRIBUTE_D3D11_GRAPHICS) + ))) { isMcdmAdapter = true; } else { isMcdmAdapter = false; diff --git a/winml/test/api/raw/buffer_backed_random_access_stream_reference.h b/winml/test/api/raw/buffer_backed_random_access_stream_reference.h index e9539c188e45a..6f492bf8340c9 100644 --- 
a/winml/test/api/raw/buffer_backed_random_access_stream_reference.h +++ b/winml/test/api/raw/buffer_backed_random_access_stream_reference.h @@ -347,8 +347,9 @@ struct BufferBackedRandomAccessStreamReference } virtual HRESULT STDMETHODCALLTYPE OpenReadAsync( - /* [retval, out] */ __RPC__deref_out_opt - __FIAsyncOperation_1_Windows__CStorage__CStreams__CIRandomAccessStreamWithContentType** operation + /* [retval, out] */ + __RPC__deref_out_opt __FIAsyncOperation_1_Windows__CStorage__CStreams__CIRandomAccessStreamWithContentType** + operation ) override { auto open_read_async = Microsoft::WRL::Make(); open_read_async.CopyTo(operation); diff --git a/winml/test/api/raw/winml_microsoft.h b/winml/test/api/raw/winml_microsoft.h index 92094188793d5..60527b238d8cd 100644 --- a/winml/test/api/raw/winml_microsoft.h +++ b/winml/test/api/raw/winml_microsoft.h @@ -141,8 +141,8 @@ struct TensorRuntimeClassID { static const wchar_t* RuntimeClass_ID; }; -__declspec(selectany -) const wchar_t* TensorRuntimeClassID::RuntimeClass_ID = RuntimeClass_Microsoft_AI_MachineLearning_TensorFloat; +__declspec(selectany) const wchar_t* TensorRuntimeClassID::RuntimeClass_ID = + RuntimeClass_Microsoft_AI_MachineLearning_TensorFloat; __declspec(selectany) const wchar_t* TensorRuntimeClassID::RuntimeClass_ID = RuntimeClass_Microsoft_AI_MachineLearning_TensorFloat16Bit; __declspec(selectany) const wchar_t* TensorRuntimeClassID::RuntimeClass_ID = @@ -161,10 +161,10 @@ __declspec(selectany) const wchar_t* TensorRuntimeClassID::RuntimeClas RuntimeClass_Microsoft_AI_MachineLearning_TensorUInt64Bit; __declspec(selectany) const wchar_t* TensorRuntimeClassID::RuntimeClass_ID = RuntimeClass_Microsoft_AI_MachineLearning_TensorInt64Bit; -__declspec(selectany -) const wchar_t* TensorRuntimeClassID::RuntimeClass_ID = RuntimeClass_Microsoft_AI_MachineLearning_TensorBoolean; -__declspec(selectany -) const wchar_t* TensorRuntimeClassID::RuntimeClass_ID = RuntimeClass_Microsoft_AI_MachineLearning_TensorDouble; +__declspec(selectany) const wchar_t* TensorRuntimeClassID::RuntimeClass_ID = + RuntimeClass_Microsoft_AI_MachineLearning_TensorBoolean; +__declspec(selectany) const wchar_t* TensorRuntimeClassID::RuntimeClass_ID = + RuntimeClass_Microsoft_AI_MachineLearning_TensorDouble; template struct TensorFactory {}; @@ -319,30 +319,30 @@ struct TensorFactoryIID { static const GUID IID; }; -__declspec(selectany -) const GUID TensorFactoryIID::IID = ABI::Microsoft::AI::MachineLearning::IID_ITensorFloatStatics; -__declspec(selectany -) const GUID TensorFactoryIID::IID = ABI::Microsoft::AI::MachineLearning::IID_ITensorFloat16BitStatics; -__declspec(selectany -) const GUID TensorFactoryIID::IID = ABI::Microsoft::AI::MachineLearning::IID_ITensorInt8BitStatics; -__declspec(selectany -) const GUID TensorFactoryIID::IID = ABI::Microsoft::AI::MachineLearning::IID_ITensorUInt8BitStatics; -__declspec(selectany -) const GUID TensorFactoryIID::IID = ABI::Microsoft::AI::MachineLearning::IID_ITensorUInt16BitStatics; -__declspec(selectany -) const GUID TensorFactoryIID::IID = ABI::Microsoft::AI::MachineLearning::IID_ITensorInt16BitStatics; -__declspec(selectany -) const GUID TensorFactoryIID::IID = ABI::Microsoft::AI::MachineLearning::IID_ITensorUInt32BitStatics; -__declspec(selectany -) const GUID TensorFactoryIID::IID = ABI::Microsoft::AI::MachineLearning::IID_ITensorInt32BitStatics; -__declspec(selectany -) const GUID TensorFactoryIID::IID = ABI::Microsoft::AI::MachineLearning::IID_ITensorUInt64BitStatics; -__declspec(selectany -) const GUID 
TensorFactoryIID::IID = ABI::Microsoft::AI::MachineLearning::IID_ITensorInt64BitStatics; -__declspec(selectany -) const GUID TensorFactoryIID::IID = ABI::Microsoft::AI::MachineLearning::IID_ITensorBooleanStatics; -__declspec(selectany -) const GUID TensorFactoryIID::IID = ABI::Microsoft::AI::MachineLearning::IID_ITensorDoubleStatics; +__declspec(selectany) const GUID TensorFactoryIID::IID = + ABI::Microsoft::AI::MachineLearning::IID_ITensorFloatStatics; +__declspec(selectany) const GUID TensorFactoryIID::IID = + ABI::Microsoft::AI::MachineLearning::IID_ITensorFloat16BitStatics; +__declspec(selectany) const GUID TensorFactoryIID::IID = + ABI::Microsoft::AI::MachineLearning::IID_ITensorInt8BitStatics; +__declspec(selectany) const GUID TensorFactoryIID::IID = + ABI::Microsoft::AI::MachineLearning::IID_ITensorUInt8BitStatics; +__declspec(selectany) const GUID TensorFactoryIID::IID = + ABI::Microsoft::AI::MachineLearning::IID_ITensorUInt16BitStatics; +__declspec(selectany) const GUID TensorFactoryIID::IID = + ABI::Microsoft::AI::MachineLearning::IID_ITensorInt16BitStatics; +__declspec(selectany) const GUID TensorFactoryIID::IID = + ABI::Microsoft::AI::MachineLearning::IID_ITensorUInt32BitStatics; +__declspec(selectany) const GUID TensorFactoryIID::IID = + ABI::Microsoft::AI::MachineLearning::IID_ITensorInt32BitStatics; +__declspec(selectany) const GUID TensorFactoryIID::IID = + ABI::Microsoft::AI::MachineLearning::IID_ITensorUInt64BitStatics; +__declspec(selectany) const GUID TensorFactoryIID::IID = + ABI::Microsoft::AI::MachineLearning::IID_ITensorInt64BitStatics; +__declspec(selectany) const GUID TensorFactoryIID::IID = + ABI::Microsoft::AI::MachineLearning::IID_ITensorBooleanStatics; +__declspec(selectany) const GUID TensorFactoryIID::IID = + ABI::Microsoft::AI::MachineLearning::IID_ITensorDoubleStatics; template struct TensorFactory2IID {}; @@ -395,30 +395,30 @@ struct TensorFactory2IID { static const GUID IID; }; -__declspec(selectany -) const GUID TensorFactory2IID::IID = ABI::Microsoft::AI::MachineLearning::IID_ITensorFloatStatics2; -__declspec(selectany -) const GUID TensorFactory2IID::IID = ABI::Microsoft::AI::MachineLearning::IID_ITensorFloat16BitStatics2; -__declspec(selectany -) const GUID TensorFactory2IID::IID = ABI::Microsoft::AI::MachineLearning::IID_ITensorInt8BitStatics2; -__declspec(selectany -) const GUID TensorFactory2IID::IID = ABI::Microsoft::AI::MachineLearning::IID_ITensorUInt8BitStatics2; -__declspec(selectany -) const GUID TensorFactory2IID::IID = ABI::Microsoft::AI::MachineLearning::IID_ITensorUInt16BitStatics2; -__declspec(selectany -) const GUID TensorFactory2IID::IID = ABI::Microsoft::AI::MachineLearning::IID_ITensorInt16BitStatics2; -__declspec(selectany -) const GUID TensorFactory2IID::IID = ABI::Microsoft::AI::MachineLearning::IID_ITensorUInt32BitStatics2; -__declspec(selectany -) const GUID TensorFactory2IID::IID = ABI::Microsoft::AI::MachineLearning::IID_ITensorInt32BitStatics2; -__declspec(selectany -) const GUID TensorFactory2IID::IID = ABI::Microsoft::AI::MachineLearning::IID_ITensorUInt64BitStatics2; -__declspec(selectany -) const GUID TensorFactory2IID::IID = ABI::Microsoft::AI::MachineLearning::IID_ITensorInt64BitStatics2; -__declspec(selectany -) const GUID TensorFactory2IID::IID = ABI::Microsoft::AI::MachineLearning::IID_ITensorBooleanStatics2; -__declspec(selectany -) const GUID TensorFactory2IID::IID = ABI::Microsoft::AI::MachineLearning::IID_ITensorDoubleStatics2; +__declspec(selectany) const GUID TensorFactory2IID::IID = + 
ABI::Microsoft::AI::MachineLearning::IID_ITensorFloatStatics2; +__declspec(selectany) const GUID TensorFactory2IID::IID = + ABI::Microsoft::AI::MachineLearning::IID_ITensorFloat16BitStatics2; +__declspec(selectany) const GUID TensorFactory2IID::IID = + ABI::Microsoft::AI::MachineLearning::IID_ITensorInt8BitStatics2; +__declspec(selectany) const GUID TensorFactory2IID::IID = + ABI::Microsoft::AI::MachineLearning::IID_ITensorUInt8BitStatics2; +__declspec(selectany) const GUID TensorFactory2IID::IID = + ABI::Microsoft::AI::MachineLearning::IID_ITensorUInt16BitStatics2; +__declspec(selectany) const GUID TensorFactory2IID::IID = + ABI::Microsoft::AI::MachineLearning::IID_ITensorInt16BitStatics2; +__declspec(selectany) const GUID TensorFactory2IID::IID = + ABI::Microsoft::AI::MachineLearning::IID_ITensorUInt32BitStatics2; +__declspec(selectany) const GUID TensorFactory2IID::IID = + ABI::Microsoft::AI::MachineLearning::IID_ITensorInt32BitStatics2; +__declspec(selectany) const GUID TensorFactory2IID::IID = + ABI::Microsoft::AI::MachineLearning::IID_ITensorUInt64BitStatics2; +__declspec(selectany) const GUID TensorFactory2IID::IID = + ABI::Microsoft::AI::MachineLearning::IID_ITensorInt64BitStatics2; +__declspec(selectany) const GUID TensorFactory2IID::IID = + ABI::Microsoft::AI::MachineLearning::IID_ITensorBooleanStatics2; +__declspec(selectany) const GUID TensorFactory2IID::IID = + ABI::Microsoft::AI::MachineLearning::IID_ITensorDoubleStatics2; inline HRESULT GetActivationFactory(const wchar_t* p_class_id, const IID& iid, void** factory) noexcept { // Fallback to OS binary if the redistributable is not present! diff --git a/winml/test/api/raw/winml_windows.h b/winml/test/api/raw/winml_windows.h index 944daff6dd10a..8e72743f3d98b 100644 --- a/winml/test/api/raw/winml_windows.h +++ b/winml/test/api/raw/winml_windows.h @@ -141,12 +141,12 @@ struct TensorRuntimeClassID { static const wchar_t* RuntimeClass_ID; }; -__declspec(selectany -) const wchar_t* TensorRuntimeClassID::RuntimeClass_ID = RuntimeClass_Windows_AI_MachineLearning_TensorFloat; +__declspec(selectany) const wchar_t* TensorRuntimeClassID::RuntimeClass_ID = + RuntimeClass_Windows_AI_MachineLearning_TensorFloat; __declspec(selectany) const wchar_t* TensorRuntimeClassID::RuntimeClass_ID = RuntimeClass_Windows_AI_MachineLearning_TensorFloat16Bit; -__declspec(selectany -) const wchar_t* TensorRuntimeClassID::RuntimeClass_ID = RuntimeClass_Windows_AI_MachineLearning_TensorInt8Bit; +__declspec(selectany) const wchar_t* TensorRuntimeClassID::RuntimeClass_ID = + RuntimeClass_Windows_AI_MachineLearning_TensorInt8Bit; __declspec(selectany) const wchar_t* TensorRuntimeClassID::RuntimeClass_ID = RuntimeClass_Windows_AI_MachineLearning_TensorUInt8Bit; __declspec(selectany) const wchar_t* TensorRuntimeClassID::RuntimeClass_ID = @@ -161,10 +161,10 @@ __declspec(selectany) const wchar_t* TensorRuntimeClassID::RuntimeClas RuntimeClass_Windows_AI_MachineLearning_TensorUInt64Bit; __declspec(selectany) const wchar_t* TensorRuntimeClassID::RuntimeClass_ID = RuntimeClass_Windows_AI_MachineLearning_TensorInt64Bit; -__declspec(selectany -) const wchar_t* TensorRuntimeClassID::RuntimeClass_ID = RuntimeClass_Windows_AI_MachineLearning_TensorBoolean; -__declspec(selectany -) const wchar_t* TensorRuntimeClassID::RuntimeClass_ID = RuntimeClass_Windows_AI_MachineLearning_TensorDouble; +__declspec(selectany) const wchar_t* TensorRuntimeClassID::RuntimeClass_ID = + RuntimeClass_Windows_AI_MachineLearning_TensorBoolean; +__declspec(selectany) const wchar_t* 
TensorRuntimeClassID::RuntimeClass_ID = + RuntimeClass_Windows_AI_MachineLearning_TensorDouble; template struct TensorFactory {}; @@ -319,30 +319,30 @@ struct TensorFactoryIID { static const GUID IID; }; -__declspec(selectany -) const GUID TensorFactoryIID::IID = ABI::Windows::AI::MachineLearning::IID_ITensorFloatStatics; -__declspec(selectany -) const GUID TensorFactoryIID::IID = ABI::Windows::AI::MachineLearning::IID_ITensorFloat16BitStatics; -__declspec(selectany -) const GUID TensorFactoryIID::IID = ABI::Windows::AI::MachineLearning::IID_ITensorInt8BitStatics; -__declspec(selectany -) const GUID TensorFactoryIID::IID = ABI::Windows::AI::MachineLearning::IID_ITensorUInt8BitStatics; -__declspec(selectany -) const GUID TensorFactoryIID::IID = ABI::Windows::AI::MachineLearning::IID_ITensorUInt16BitStatics; -__declspec(selectany -) const GUID TensorFactoryIID::IID = ABI::Windows::AI::MachineLearning::IID_ITensorInt16BitStatics; -__declspec(selectany -) const GUID TensorFactoryIID::IID = ABI::Windows::AI::MachineLearning::IID_ITensorUInt32BitStatics; -__declspec(selectany -) const GUID TensorFactoryIID::IID = ABI::Windows::AI::MachineLearning::IID_ITensorInt32BitStatics; -__declspec(selectany -) const GUID TensorFactoryIID::IID = ABI::Windows::AI::MachineLearning::IID_ITensorUInt64BitStatics; -__declspec(selectany -) const GUID TensorFactoryIID::IID = ABI::Windows::AI::MachineLearning::IID_ITensorInt64BitStatics; -__declspec(selectany -) const GUID TensorFactoryIID::IID = ABI::Windows::AI::MachineLearning::IID_ITensorBooleanStatics; -__declspec(selectany -) const GUID TensorFactoryIID::IID = ABI::Windows::AI::MachineLearning::IID_ITensorDoubleStatics; +__declspec(selectany) const GUID TensorFactoryIID::IID = + ABI::Windows::AI::MachineLearning::IID_ITensorFloatStatics; +__declspec(selectany) const GUID TensorFactoryIID::IID = + ABI::Windows::AI::MachineLearning::IID_ITensorFloat16BitStatics; +__declspec(selectany) const GUID TensorFactoryIID::IID = + ABI::Windows::AI::MachineLearning::IID_ITensorInt8BitStatics; +__declspec(selectany) const GUID TensorFactoryIID::IID = + ABI::Windows::AI::MachineLearning::IID_ITensorUInt8BitStatics; +__declspec(selectany) const GUID TensorFactoryIID::IID = + ABI::Windows::AI::MachineLearning::IID_ITensorUInt16BitStatics; +__declspec(selectany) const GUID TensorFactoryIID::IID = + ABI::Windows::AI::MachineLearning::IID_ITensorInt16BitStatics; +__declspec(selectany) const GUID TensorFactoryIID::IID = + ABI::Windows::AI::MachineLearning::IID_ITensorUInt32BitStatics; +__declspec(selectany) const GUID TensorFactoryIID::IID = + ABI::Windows::AI::MachineLearning::IID_ITensorInt32BitStatics; +__declspec(selectany) const GUID TensorFactoryIID::IID = + ABI::Windows::AI::MachineLearning::IID_ITensorUInt64BitStatics; +__declspec(selectany) const GUID TensorFactoryIID::IID = + ABI::Windows::AI::MachineLearning::IID_ITensorInt64BitStatics; +__declspec(selectany) const GUID TensorFactoryIID::IID = + ABI::Windows::AI::MachineLearning::IID_ITensorBooleanStatics; +__declspec(selectany) const GUID TensorFactoryIID::IID = + ABI::Windows::AI::MachineLearning::IID_ITensorDoubleStatics; template struct TensorFactory2IID {}; @@ -395,30 +395,30 @@ struct TensorFactory2IID { static const GUID IID; }; -__declspec(selectany -) const GUID TensorFactory2IID::IID = ABI::Windows::AI::MachineLearning::IID_ITensorFloatStatics2; -__declspec(selectany -) const GUID TensorFactory2IID::IID = ABI::Windows::AI::MachineLearning::IID_ITensorFloat16BitStatics2; -__declspec(selectany -) const GUID 
TensorFactory2IID::IID = ABI::Windows::AI::MachineLearning::IID_ITensorInt8BitStatics2; -__declspec(selectany -) const GUID TensorFactory2IID::IID = ABI::Windows::AI::MachineLearning::IID_ITensorUInt8BitStatics2; -__declspec(selectany -) const GUID TensorFactory2IID::IID = ABI::Windows::AI::MachineLearning::IID_ITensorUInt16BitStatics2; -__declspec(selectany -) const GUID TensorFactory2IID::IID = ABI::Windows::AI::MachineLearning::IID_ITensorInt16BitStatics2; -__declspec(selectany -) const GUID TensorFactory2IID::IID = ABI::Windows::AI::MachineLearning::IID_ITensorUInt32BitStatics2; -__declspec(selectany -) const GUID TensorFactory2IID::IID = ABI::Windows::AI::MachineLearning::IID_ITensorInt32BitStatics2; -__declspec(selectany -) const GUID TensorFactory2IID::IID = ABI::Windows::AI::MachineLearning::IID_ITensorUInt64BitStatics2; -__declspec(selectany -) const GUID TensorFactory2IID::IID = ABI::Windows::AI::MachineLearning::IID_ITensorInt64BitStatics2; -__declspec(selectany -) const GUID TensorFactory2IID::IID = ABI::Windows::AI::MachineLearning::IID_ITensorBooleanStatics2; -__declspec(selectany -) const GUID TensorFactory2IID::IID = ABI::Windows::AI::MachineLearning::IID_ITensorDoubleStatics2; +__declspec(selectany) const GUID TensorFactory2IID::IID = + ABI::Windows::AI::MachineLearning::IID_ITensorFloatStatics2; +__declspec(selectany) const GUID TensorFactory2IID::IID = + ABI::Windows::AI::MachineLearning::IID_ITensorFloat16BitStatics2; +__declspec(selectany) const GUID TensorFactory2IID::IID = + ABI::Windows::AI::MachineLearning::IID_ITensorInt8BitStatics2; +__declspec(selectany) const GUID TensorFactory2IID::IID = + ABI::Windows::AI::MachineLearning::IID_ITensorUInt8BitStatics2; +__declspec(selectany) const GUID TensorFactory2IID::IID = + ABI::Windows::AI::MachineLearning::IID_ITensorUInt16BitStatics2; +__declspec(selectany) const GUID TensorFactory2IID::IID = + ABI::Windows::AI::MachineLearning::IID_ITensorInt16BitStatics2; +__declspec(selectany) const GUID TensorFactory2IID::IID = + ABI::Windows::AI::MachineLearning::IID_ITensorUInt32BitStatics2; +__declspec(selectany) const GUID TensorFactory2IID::IID = + ABI::Windows::AI::MachineLearning::IID_ITensorInt32BitStatics2; +__declspec(selectany) const GUID TensorFactory2IID::IID = + ABI::Windows::AI::MachineLearning::IID_ITensorUInt64BitStatics2; +__declspec(selectany) const GUID TensorFactory2IID::IID = + ABI::Windows::AI::MachineLearning::IID_ITensorInt64BitStatics2; +__declspec(selectany) const GUID TensorFactory2IID::IID = + ABI::Windows::AI::MachineLearning::IID_ITensorBooleanStatics2; +__declspec(selectany) const GUID TensorFactory2IID::IID = + ABI::Windows::AI::MachineLearning::IID_ITensorDoubleStatics2; inline HRESULT GetActivationFactory(const wchar_t* p_class_id, const IID& iid, void** factory) noexcept { // Fallback to OS binary if the redistributable is not present! 
diff --git a/winml/test/image/imagetests.cpp b/winml/test/image/imagetests.cpp index b408c0315f94a..04717c75aa150 100644 --- a/winml/test/image/imagetests.cpp +++ b/winml/test/image/imagetests.cpp @@ -212,13 +212,16 @@ class ImageTests : public ::testing::Test { const std::wstring& model_file_name, const std::wstring& image_file_name, const InputImageSource input_image_source ) { // Case that the tensor's shape doesn't match model's shape should be skipped - if ((L"1080.jpg" == image_file_name || L"kitten_224.png" == image_file_name) && (InputImageSource::FromGPUResource == input_image_source || InputImageSource::FromCPUResource == input_image_source)) { + if ((L"1080.jpg" == image_file_name || L"kitten_224.png" == image_file_name) && + (InputImageSource::FromGPUResource == input_image_source || + InputImageSource::FromCPUResource == input_image_source)) { return true; } // Case that the images's shape doesn't match model's shape which expects free dimension should be skipped. // Because the fns-candy is not real model that can handle free dimensional input - if ((L"1080.jpg" == image_file_name || L"kitten_224.png" == image_file_name) && L"fns-candy_Bgr8_freeDimInput.onnx" == model_file_name) { + if ((L"1080.jpg" == image_file_name || L"kitten_224.png" == image_file_name) && + L"fns-candy_Bgr8_freeDimInput.onnx" == model_file_name) { return true; } @@ -385,7 +388,8 @@ TEST_P(ImageTest, ImageTest) { GTEST_SKIP() << "This test is disabled"; } - if (LearningModelDeviceKind::Cpu != param.device_kind || InputImageSource::FromGPUResource == param.input_image_source) { + if (LearningModelDeviceKind::Cpu != param.device_kind || + InputImageSource::FromGPUResource == param.input_image_source) { GPUTEST; } @@ -482,13 +486,14 @@ TEST_P(BatchTest, BatchSupport) { if (param.use_session_options) { optimized_batch_size = param.use_session_options; } - if (VideoFrameSource::FromDirect3DSurface == param.video_frame_source && LearningModelDeviceKind::Cpu == param.device_kind) { + if (VideoFrameSource::FromDirect3DSurface == param.video_frame_source && + LearningModelDeviceKind::Cpu == param.device_kind) { return; } if (LearningModelDeviceKind::Cpu != param.device_kind || - VideoFrameSource::FromDirect3DSurface == param.video_frame_source || - VideoFrameSource::FromDirect3DSurface == param.output_video_frame_source || - VideoFrameSource::FromUnsupportedD3DSurface == param.output_video_frame_source) { + VideoFrameSource::FromDirect3DSurface == param.video_frame_source || + VideoFrameSource::FromDirect3DSurface == param.output_video_frame_source || + VideoFrameSource::FromUnsupportedD3DSurface == param.output_video_frame_source) { GPUTEST; } @@ -556,7 +561,7 @@ TEST_P(BatchTest, BatchSupport) { for (int i = 0; i < param.batch_size; ++i) { std::wstring bm_image_path = FileHelpers::GetModulePath() + L"batchGroundTruth\\" + param.input_images[i]; if (VideoFrameSource::FromSoftwareBitmap != param.output_video_frame_source && - OutputBindingStrategy::Unbound != param.output_binding_strategy) { + OutputBindingStrategy::Unbound != param.output_binding_strategy) { VideoFrame D3D_video_frame = output_video_frames.GetAt(i); VideoFrame SB_video_frame(BitmapPixelFormat::Bgra8, 720, 720); D3D_video_frame.as().CopyToAsync(SB_video_frame).get(); diff --git a/winml/test/model/compare_feature_value.cpp b/winml/test/model/compare_feature_value.cpp index 30b16c4ad5f73..ac2553987f5ad 100644 --- a/winml/test/model/compare_feature_value.cpp +++ b/winml/test/model/compare_feature_value.cpp @@ -13,7 +13,8 @@ template bool 
IsResultCloselyMatch(const T& outvalue, const T& expected_value, const double diff, const double tol) { if (diff > tol) return false; - if (std::isnan(diff) && !(std::isnan(outvalue) && std::isnan(expected_value)) && !(std::isinf(outvalue) && std::isinf(expected_value))) + if (std::isnan(diff) && !(std::isnan(outvalue) && std::isnan(expected_value)) && + !(std::isinf(outvalue) && std::isinf(expected_value))) return false; return true; } diff --git a/winml/test/model/model_tests.cpp b/winml/test/model/model_tests.cpp index 859914014b8bb..4087bfd87caa7 100644 --- a/winml/test/model/model_tests.cpp +++ b/winml/test/model/model_tests.cpp @@ -150,7 +150,8 @@ std::string GetTestDataPath() { std::string testDataPath(MAX_PATH, '\0'); auto environmentVariableFetchSuceeded = GetEnvironmentVariableA("WINML_TEST_DATA_PATH", testDataPath.data(), MAX_PATH); - if (environmentVariableFetchSuceeded == 0 && GetLastError() == ERROR_ENVVAR_NOT_FOUND || environmentVariableFetchSuceeded > MAX_PATH) { + if (environmentVariableFetchSuceeded == 0 && GetLastError() == ERROR_ENVVAR_NOT_FOUND || + environmentVariableFetchSuceeded > MAX_PATH) { // if the WINML_TEST_DATA_PATH environment variable cannot be found, attempt to find the hardcoded models folder std::wstring modulePath = FileHelpers::GetModulePath(); std::filesystem::path currPath = modulePath.substr(0, modulePath.find_last_of(L"\\")); @@ -357,7 +358,8 @@ bool ModifyNameIfDisabledTest(/*inout*/ std::string& testName, winml::LearningMo if (SkipGpuTests()) { reason = "GPU tests are not enabled for this build."; shouldSkip = true; - } else if (disabledGpuAdapterTests.find(testName) != disabledGpuAdapterTests.end() && ShouldSkipTestOnGpuAdapter(testName)) { + } else if (disabledGpuAdapterTests.find(testName) != disabledGpuAdapterTests.end() && + ShouldSkipTestOnGpuAdapter(testName)) { reason = disabledGpuAdapterTests[testName].second; shouldSkip = true; } @@ -386,9 +388,7 @@ std::string GetFullNameOfTest(ITestCase* testCase, winml::LearningModelDeviceKin name += tokenizedModelPath[tokenizedModelPath.size() - 2] += "_"; // model name name += tokenizedModelPath[tokenizedModelPath.size() - 3]; // opset version - std::replace_if( - name.begin(), name.end(), [](char c) { return !absl::ascii_isalnum(c); }, '_' - ); + std::replace_if(name.begin(), name.end(), [](char c) { return !absl::ascii_isalnum(c); }, '_'); // Determine if test should be skipped, using the generic name (no CPU or GPU suffix yet). bool isDisabled = ModifyNameIfDisabledTest(/*inout*/ name, deviceKind); diff --git a/winml/test/model/skip_model_tests.h b/winml/test/model/skip_model_tests.h index cf55d8bcbae7e..349332c6ae0e3 100644 --- a/winml/test/model/skip_model_tests.h +++ b/winml/test/model/skip_model_tests.h @@ -114,14 +114,14 @@ std::unordered_map disabledTests({ { "coreml_DecisionTreeClassifier_OpenML_1464_blood_transfusion_opset7", disabledTestDefaultReason}, { "coreml_AgeNet_ImageNet_opset7", disabledTestDefaultReason}, - // GPU specific cases: + // GPU specific cases: // ONNX zoo models { "mask_rcnn_opset10_GPU", "Bug 31005388: mask_rcnn opset 10 onnx zoo model fails to evaluate on DirectML https://microsoft.visualstudio.com/OS/_workitems/edit/31005388" }, { "faster_rcnn_opset10_GPU", "Bug 31005511: Failed to extract tensor data from evaluate result of faster_rcnn opset 10 model in DirectML https://microsoft.visualstudio.com/OS/_workitems/edit/31005511" }, - // ONNX model zoo's int8/qdq models generally do not work on CPUs that lack 8-bit instructions. 
+ // ONNX model zoo's int8/qdq models generally do not work on CPUs that lack 8-bit instructions. { "YOLOv3_12_int8_opset12", disabledTestDefaultReason}, { "VGG_16_int8_opset12", disabledTestDefaultReason}, { "SSD_int8_opset12", disabledTestDefaultReason}, @@ -137,7 +137,7 @@ std::unordered_map disabledTests({ { "EfficientNet_Lite4_qdq_opset11", disabledTestDefaultReason}, { "EfficientNet_Lite4_int8_opset11", disabledTestDefaultReason}, - // Tier 2 models + // Tier 2 models { "fp16_test_tiny_yolov2_opset7_GPU", "Bug 31005780: Result of fp16_test_tiny_yolov2_opset7 and fp16_coreml_FNS_Candy_opset7 models on DirectML aren't as accurate as on CPU https://microsoft.visualstudio.com/OS/_workitems/edit/31005780"}, { "fp16_tiny_yolov2_opset8_GPU", diff --git a/winml/test/scenario/cppwinrt/NoisyReluCpu.h b/winml/test/scenario/cppwinrt/NoisyReluCpu.h index 5cccbae67407c..e419205fd52dc 100644 --- a/winml/test/scenario/cppwinrt/NoisyReluCpu.h +++ b/winml/test/scenario/cppwinrt/NoisyReluCpu.h @@ -65,12 +65,14 @@ struct NoisyReluOperator : winrt::implementsGetTensorDataType() == MLOperatorTensorDataType::Float && inputTensor->GetTensorDataType() == MLOperatorTensorDataType::Float) { + if (outputTensor->GetTensorDataType() == MLOperatorTensorDataType::Float && + inputTensor->GetTensorDataType() == MLOperatorTensorDataType::Float) { // For cpu data if (outputTensor->IsCpuData() && inputTensor->IsCpuData()) { ComputeInternal(inputTensor.get(), outputTensor.get(), inputDataSize); } - } else if (outputTensor->GetTensorDataType() == MLOperatorTensorDataType::Double && inputTensor->GetTensorDataType() == MLOperatorTensorDataType::Double) { + } else if (outputTensor->GetTensorDataType() == MLOperatorTensorDataType::Double && + inputTensor->GetTensorDataType() == MLOperatorTensorDataType::Double) { // For cpu data if (outputTensor->IsCpuData() && inputTensor->IsCpuData()) { ComputeInternal(inputTensor.get(), outputTensor.get(), inputDataSize); diff --git a/winml/test/scenario/cppwinrt/ReluCpu.h b/winml/test/scenario/cppwinrt/ReluCpu.h index 7bb275f7b399b..e8e91489fe872 100644 --- a/winml/test/scenario/cppwinrt/ReluCpu.h +++ b/winml/test/scenario/cppwinrt/ReluCpu.h @@ -60,12 +60,14 @@ struct ReluOperator : winrt::implements { } // If the tensor types are both float type - if (outputTensor->GetTensorDataType() == MLOperatorTensorDataType::Float && inputTensor->GetTensorDataType() == MLOperatorTensorDataType::Float) { + if (outputTensor->GetTensorDataType() == MLOperatorTensorDataType::Float && + inputTensor->GetTensorDataType() == MLOperatorTensorDataType::Float) { // For cpu data if (outputTensor->IsCpuData() && inputTensor->IsCpuData()) { ComputeInternal(inputTensor.get(), outputTensor.get(), inputDataSize); } - } else if (outputTensor->GetTensorDataType() == MLOperatorTensorDataType::Double && inputTensor->GetTensorDataType() == MLOperatorTensorDataType::Double) { + } else if (outputTensor->GetTensorDataType() == MLOperatorTensorDataType::Double && + inputTensor->GetTensorDataType() == MLOperatorTensorDataType::Double) { // For cpu data if (outputTensor->IsCpuData() && inputTensor->IsCpuData()) { ComputeInternal(inputTensor.get(), outputTensor.get(), inputDataSize); From f4edf9bb58911da401c128c70e088051bfbf93c5 Mon Sep 17 00:00:00 2001 From: Adrian Lizarraga Date: Wed, 24 Jul 2024 16:39:32 -0700 Subject: [PATCH 12/57] Extend QDQPropagation transformer to handle multiple consumers (#21313) ### Description - Extends the QDQPropagationTransformer to propagate DQs (forward) across operators with multiple consumers 
(previously only supported 1 consumer). - Adds Slice to the list of operators that the QDQPropagationTransformer can propagate DQ/Q ops across. - Supports QDQ propagation for opset 21. - Correctly copies Q or DQ attributes when creating new nodes. ### Motivation and Context The QDQPropagationTransformer fixes up QDQ node units for certain "data movement" ops (e.g., Transpose) by inserting Q -> DQ sequences where necessary. For example, the sequence `DQ -> Transpose -> Sigmoid` is transformed to `DQ -> Transpose -> Q -> DQ -> Sigmoid`. However, this fix-up does not currently support data movement ops with multiple consumers, as in: ``` DQ -> Transpose --+--> Sigmoid -> | +--> Relu -> | +-> graph_output ``` With the updates in this PR, the above model can be transformed to: ``` DQ -> Transpose -> Q --+--> DQ -> Sigmoid -> | +--> DQ -> Relu -> | +--> DQ -> graph_output ``` This update allows QNN EP to support quantized models created with tools that do not wrap data movement ops in Q/DQ ops. --------- Co-authored-by: Edward Chen <18449977+edgchen1@users.noreply.github.com> --- .../qdq_transformer/qdq_propagation.cc | 341 +++++++++++++----- .../optimizer/graph_transform_test_builder.cc | 4 +- .../test/optimizer/qdq_transformer_test.cc | 168 +++++++++ 3 files changed, 420 insertions(+), 93 deletions(-) diff --git a/onnxruntime/core/optimizer/qdq_transformer/qdq_propagation.cc b/onnxruntime/core/optimizer/qdq_transformer/qdq_propagation.cc index f0e76312d6e00..7b518947138a5 100644 --- a/onnxruntime/core/optimizer/qdq_transformer/qdq_propagation.cc +++ b/onnxruntime/core/optimizer/qdq_transformer/qdq_propagation.cc @@ -3,8 +3,13 @@ #include "core/optimizer/qdq_transformer/qdq_propagation.h" +#include #include +#include +#include +#include +#include "core/common/inlined_containers_fwd.h" #include "core/graph/extended_graph_edge.h" #include "core/graph/graph_utils.h" #include "core/optimizer/initializer.h" @@ -17,39 +22,147 @@ namespace onnxruntime { namespace { bool CanNodePropagate(const Node& node) { return graph_utils::IsSupportedOptypeVersionAndDomain(node, "MaxPool", {12}) || - graph_utils::IsSupportedOptypeVersionAndDomain(node, "Reshape", {5, 13, 14, 19}) || - graph_utils::IsSupportedOptypeVersionAndDomain(node, "Transpose", {1, 13}) || - graph_utils::IsSupportedOptypeVersionAndDomain(node, "Squeeze", {1, 11, 13}) || - graph_utils::IsSupportedOptypeVersionAndDomain(node, "Unsqueeze", {1, 11, 13}); + graph_utils::IsSupportedOptypeVersionAndDomain(node, "Reshape", {5, 13, 14, 19, 21}) || + graph_utils::IsSupportedOptypeVersionAndDomain(node, "Transpose", {1, 13, 21}) || + graph_utils::IsSupportedOptypeVersionAndDomain(node, "Squeeze", {1, 11, 13, 21}) || + graph_utils::IsSupportedOptypeVersionAndDomain(node, "Unsqueeze", {1, 11, 13, 21}) || + graph_utils::IsSupportedOptypeVersionAndDomain(node, "Slice", {1, 10, 11, 13}); } -// convert this: src_node -> dst_node -// to this: src_node -> Q -> DQ -> dst_node -// assumptions: -// 1. insertion_edge is valid - node indexes refer to valid nodes, arg name refers to a valid NodeArg, and it -// corresponds to an actual graph relationship -// 2. 
scale_initializer_nodearg and zp_initializer_nodearg_ptr (if not null) are constant initializers -Status InsertQDQPair(Graph& graph, const ExtendedGraphEdge& insertion_edge, - NodeArg& scale_initializer_nodearg, NodeArg* zp_initializer_nodearg_ptr, - const std::string& qdq_domain, const logging::Logger& logger) { - auto* src_node = insertion_edge.GetMutableNodeAtEnd(graph, ExtendedGraphEdge::End::Source); - auto* dst_node = insertion_edge.GetMutableNodeAtEnd(graph, ExtendedGraphEdge::End::Destination); - - ORT_ENFORCE(src_node || dst_node, "At least one graph node must be specified in the propagation edge."); - - const auto& base_name = insertion_edge.arg_name; +// Makes matching attributes for new QuantizeLinear nodes from an existing DequantizeLinear node. +NodeAttributes MakeQAttrsFromDQ(const Node& dq_node) { + assert(dq_node.SinceVersion() <= 21); // Checked by previous call to QDQ::MatchDQNode(). + // In opset <= 21, all DQ attributes (i.e., axis and block_size) are also Q attributes. + // So, set a copy of the DQ attributes. + return dq_node.GetAttributes(); +} + +// Makes matching attributes for new DequantizeLinear nodes from an existing QuantizeLinear node. +NodeAttributes MakeDQAttrsFromQ(const Node& q_node) { + assert(q_node.SinceVersion() <= 21); // Checked by previous call to QDQ::MatchQNode(). + const NodeAttributes& q_attrs = q_node.GetAttributes(); + if (q_attrs.empty()) { + return {}; + } + + // In opset <= 21, only the "axis" and "block_size" attributes for Q are also DQ attributes. + NodeAttributes dq_attrs; + + auto axis_attr_it = q_attrs.find("axis"); + if (axis_attr_it != q_attrs.end()) { + dq_attrs.insert({axis_attr_it->first, axis_attr_it->second}); + } + + auto block_size_attr_it = q_attrs.find("block_size"); + if (block_size_attr_it != q_attrs.end()) { + dq_attrs.insert({block_size_attr_it->first, block_size_attr_it->second}); + } + + return dq_attrs; +} + +// Validates edges into which to insert Q -> DQ ops. +// - Must have at least one edge. +// - All edges must correspond to the same graph NodeArg (i.e., same source but potentially different destination). +// - All edges must be attached to either a source node or a destination node. 
+Status ValidateQDQInsertionEdges(Graph& graph, gsl::span insertion_edges) { + const size_t num_edges = insertion_edges.size(); + ORT_RETURN_IF(num_edges == 0, "Expected at least one edge into which to insert QDQ pair."); + + const ExtendedGraphEdge& first_edge = insertion_edges[0]; + const Node* src_node = first_edge.GetNodeAtEnd(graph, ExtendedGraphEdge::End::Source); + const Node* first_dst_node = first_edge.GetNodeAtEnd(graph, ExtendedGraphEdge::End::Destination); + const std::string& node_arg_name = first_edge.arg_name; + ORT_RETURN_IF_NOT(graph.GetNodeArg(node_arg_name) != nullptr, + "QDQ insertion edge does not have a valid graph NodeArg for ", node_arg_name); + ORT_RETURN_IF_NOT(src_node != nullptr || first_dst_node != nullptr, + "QDQ insertion edge [0] for NodeArg ", node_arg_name, + " must have a source or a destination node"); + + for (size_t i = 1; i < num_edges; i++) { + const ExtendedGraphEdge& insertion_edge = insertion_edges[i]; + ORT_RETURN_IF_NOT(insertion_edge.arg_name == node_arg_name, + "QDQ insertion edge [", i, "] has NodeArg ", insertion_edge.arg_name, + " but expected NodeArg ", node_arg_name); + + const Node* edge_dst_node = insertion_edge.GetNodeAtEnd(graph, ExtendedGraphEdge::End::Destination); + ORT_RETURN_IF_NOT(src_node != nullptr || edge_dst_node != nullptr, + "QDQ insertion edge [", i, "] for NodeArg ", node_arg_name, + " must have a source or a destination node"); + } + + return Status::OK(); +} + +// Logs information about the edges into which Q/DQ nodes will be inserted in InsertQDQPairs(). +// Assumes the edges have already been validated. +void LogQDQInsertion(const logging::Logger& logger, logging::Severity severity, const CodeLocation& code_location, + const Graph& graph, gsl::span edges) { + auto logging_data_type = logging::DataType::SYSTEM; + if (!logger.OutputIsEnabled(severity, logging_data_type)) { + return; + } + + const Node* src_node = edges[0].GetNodeAtEnd(graph, ExtendedGraphEdge::End::Source); + const auto& node_arg_name = edges[0].arg_name; + std::string src_label = src_node ? MakeString("node (\"", src_node->Name(), "\", index: ", src_node->Index(), ")") + : "input"; + std::ostringstream dst_labels; + const size_t num_edges = edges.size(); + + for (size_t i = 0; i < num_edges; ++i) { + const ExtendedGraphEdge& edge = edges[i]; + const Node* dst_node = edge.GetNodeAtEnd(graph, ExtendedGraphEdge::End::Destination); + dst_labels << (dst_node ? MakeString("dst node (\"", dst_node->Name(), "\", index: ", dst_node->Index(), ")") + : "output") + << (i == num_edges - 1 ? "" : ","); + } + + logging::Capture(logger, severity, logging::Category::onnxruntime, logging_data_type, code_location).Stream() + << "Inserted Q/DQ pair between " + << (src_node ? MakeString("src node (\"", src_node->Name(), "\", index: ", src_node->Index(), ")") + : "input") + << " and " << dst_labels.str() + << " at NodeArg \"" << node_arg_name << "\"."; +} + +// convert this: src_node (or graph input) --+--> dst_node_0 (or graph output) +// | +// +--> dst_node_1 +// | ... +// +--> dst_node_n +// +// to this: src_node (or graph input) -> Q --+--> DQ -> dst_node_0 (or graph output) +// | +// +--> DQ -> dst_node_1 +// | ... +// +--> DQ -> dst_node_n +// Checks that all insertion edges share the same NodeArg. That is, the edges originate from the same source node +// output. If there is no src_node, then all edges should come from the same graph input. +// This function returns an error status if edges are invalid. 
+// +// Assumes that scale_initializer_nodearg and zp_initializer_nodearg_ptr (if not null) are constant initializers. +Status InsertQDQPairs(Graph& graph, gsl::span insertion_edges, + NodeArg& scale_initializer_nodearg, NodeArg* zp_initializer_nodearg_ptr, + const std::string& qdq_domain, const NodeAttributes& q_attrs, const NodeAttributes& dq_attrs, + const logging::Logger& logger) { + ORT_RETURN_IF_ERROR(ValidateQDQInsertionEdges(graph, insertion_edges)); + + const ExtendedGraphEdge& first_edge = insertion_edges[0]; // ValidateQDQInsertionEdges() guarantees at least one edge + + Node* src_node = first_edge.GetMutableNodeAtEnd(graph, ExtendedGraphEdge::End::Source); // nullptr for graph input + const auto& base_name = first_edge.arg_name; auto& base_node_arg = *graph.GetNodeArg(base_name); - LOGS(logger, VERBOSE) << "Inserting Q/DQ pair between " - << (src_node ? MakeString("node (\"", src_node->Name(), "\", index: ", src_node->Index(), ")") - : "input") - << " and " - << (dst_node ? MakeString("node (\"", dst_node->Name(), "\", index: ", dst_node->Index(), ")") - : "output") - << " at NodeArg \"" << base_name << "\"."; + LogQDQInsertion(logger, logging::Severity::kVERBOSE, ORT_WHERE, graph, insertion_edges); - // set up new NodeArgs - auto& pre_q_nodearg = insertion_edge.HasGraphInputOrInitializer() + auto make_q_or_dq_inputs = [](NodeArg& data, NodeArg& scale, NodeArg* zero_point) { + return zero_point ? InlinedVector{&data, &scale, zero_point} + : InlinedVector{&data, &scale}; + }; + + // Create Q node that will be inserted after src_node + auto& pre_q_nodearg = first_edge.HasGraphInputOrInitializer() ? base_node_arg : graph.GetOrCreateNodeArg(graph.GenerateNodeArgName(base_name + "_pre_q"), nullptr); @@ -57,17 +170,6 @@ Status InsertQDQPair(Graph& graph, const ExtendedGraphEdge& insertion_edge, auto& q_to_dq_nodearg = graph.GetOrCreateNodeArg(graph.GenerateNodeArgName(base_name + "_q_to_dq"), nullptr); - auto& post_dq_nodearg = insertion_edge.HasGraphOutput() - ? base_node_arg - : graph.GetOrCreateNodeArg(graph.GenerateNodeArgName(base_name + "_post_dq"), - nullptr); - - // set up new Nodes - auto make_q_or_dq_inputs = [](NodeArg& data, NodeArg& scale, NodeArg* zero_point) { - return zero_point ? 
std::vector{&data, &scale, zero_point} - : std::vector{&data, &scale}; - }; - auto& q_node = graph.AddNode(graph.GenerateNodeName(base_name + "_q"), QDQ::QOpName, "Inserted by QDQPropagationTransformer", @@ -76,40 +178,61 @@ Status InsertQDQPair(Graph& graph, const ExtendedGraphEdge& insertion_edge, zp_initializer_nodearg_ptr), // outputs {&q_to_dq_nodearg}, - nullptr, // attributes + &q_attrs, // attributes qdq_domain); ORT_RETURN_IF_NOT(graph.SetOpSchemaFromRegistryForNode(q_node), "Failed to set op schema for added Q node."); - auto& dq_node = graph.AddNode(graph.GenerateNodeName(base_name + "_dq"), - QDQ::DQOpName, - "Inserted by QDQPropagationTransformer", - // inputs - make_q_or_dq_inputs(q_to_dq_nodearg, scale_initializer_nodearg, - zp_initializer_nodearg_ptr), - // outputs - {&post_dq_nodearg}, - nullptr, // attributes - qdq_domain); - - ORT_RETURN_IF_NOT(graph.SetOpSchemaFromRegistryForNode(dq_node), "Failed to set op schema for added DQ node."); - - // set up edges - if (src_node && dst_node) { - graph.RemoveEdge(src_node->Index(), dst_node->Index(), - insertion_edge.src->arg_idx, insertion_edge.dst->arg_idx); - } - if (src_node) { - src_node->MutableOutputDefs()[insertion_edge.src->arg_idx] = &pre_q_nodearg; - graph.AddEdge(src_node->Index(), q_node.Index(), insertion_edge.src->arg_idx, 0); - } + // Remove original edges between src and dst nodes. + for (const auto& insertion_edge : insertion_edges) { + auto* dst_node = insertion_edge.GetMutableNodeAtEnd(graph, ExtendedGraphEdge::End::Destination); + + if (dst_node) { + graph.RemoveEdge(src_node->Index(), dst_node->Index(), + insertion_edge.src->arg_idx, insertion_edge.dst->arg_idx); + } + } - graph.AddEdge(q_node.Index(), dq_node.Index(), 0, 0); + // Add edge from src to Q node. + src_node->MutableOutputDefs()[first_edge.src->arg_idx] = &pre_q_nodearg; + graph.AddEdge(src_node->Index(), q_node.Index(), first_edge.src->arg_idx, 0); + } - if (dst_node) { - dst_node->MutableInputDefs()[insertion_edge.dst->arg_idx] = &post_dq_nodearg; - graph.AddEdge(dq_node.Index(), dst_node->Index(), 0, insertion_edge.dst->arg_idx); + // Create a DQ node for each dst node and connect remaining edges. + for (size_t edge_idx = 0; edge_idx < insertion_edges.size(); ++edge_idx) { + const auto& insertion_edge = insertion_edges[edge_idx]; + const std::string edge_suffix = edge_idx == 0 ? "" : std::to_string(edge_idx); + auto& post_dq_nodearg = insertion_edge.HasGraphOutput() + ? 
base_node_arg + : graph.GetOrCreateNodeArg(graph.GenerateNodeArgName(MakeString(base_name, + "_post_dq", + edge_suffix)), + nullptr); + + auto& dq_node = graph.AddNode(graph.GenerateNodeName(MakeString(base_name, "_dq", edge_suffix)), + QDQ::DQOpName, + "Inserted by QDQPropagationTransformer", + // inputs + make_q_or_dq_inputs(q_to_dq_nodearg, scale_initializer_nodearg, + zp_initializer_nodearg_ptr), + // outputs + {&post_dq_nodearg}, + &dq_attrs, // attributes + qdq_domain); + + ORT_RETURN_IF_NOT(graph.SetOpSchemaFromRegistryForNode(dq_node), "Failed to set op schema for added DQ node."); + + Node* dst_node = insertion_edge.GetMutableNodeAtEnd(graph, ExtendedGraphEdge::End::Destination); + + // Add edge from Q to DQ + graph.AddEdge(q_node.Index(), dq_node.Index(), 0, 0); + + // Add edge from DQ to dst_node + if (dst_node) { + dst_node->MutableInputDefs()[insertion_edge.dst->arg_idx] = &post_dq_nodearg; + graph.AddEdge(dq_node.Index(), dst_node->Index(), 0, insertion_edge.dst->arg_idx); + } } return Status::OK(); @@ -156,37 +279,39 @@ std::optional GetPreviousPropagationEdge(const Graph& graph, return GetPreviousEdge(graph, *src_node); } -std::optional GetNextEdge(const Graph& graph, const Node& node) { - // for now we can just consider the first output (index 0) +InlinedVector GetNextEdges(const Graph& graph, const Node& node) { + constexpr int node_output_index = 0; // for now we can just consider the first output (index 0) + InlinedVector next_edges; + const auto output_edges = graph_utils::GraphEdge::GetNodeOutputEdges(node, static_cast(node_output_index)); - const auto output_edges = graph_utils::GraphEdge::GetNodeOutputEdges(node, 0); - if (output_edges.empty()) { - // maybe edge to output - return ExtendedGraphEdge::TryCreateFromNodeToOutput(graph, node, 0); + // edges to next nodes + for (const auto& output_edge : output_edges) { + next_edges.push_back(ExtendedGraphEdge::CreateFromValidGraphEdge(output_edge)); } - if (!graph.IsOutput(node.OutputDefs()[0]) && output_edges.size() == 1) { - // single edge to next node - return ExtendedGraphEdge::CreateFromValidGraphEdge(output_edges.front()); + // maybe edge to graph output + auto edge_to_output = ExtendedGraphEdge::TryCreateFromNodeToOutput(graph, node, node_output_index); + if (edge_to_output.has_value()) { + next_edges.push_back(edge_to_output.value()); } - return std::nullopt; + return next_edges; } -std::optional GetNextPropagationEdge(const Graph& graph, - const ExtendedGraphEdge& edge) { +InlinedVector GetNextPropagationEdges(const Graph& graph, + const ExtendedGraphEdge& edge) { if (edge.HasGraphOutput()) { - return std::nullopt; + return {}; } const auto* dst_node = edge.GetNodeAtEnd(graph, ExtendedGraphEdge::End::Destination); ORT_ENFORCE(dst_node != nullptr); if (!CanNodePropagate(*dst_node)) { - return std::nullopt; + return {}; } - return GetNextEdge(graph, *dst_node); + return GetNextEdges(graph, *dst_node); } class GraphConstantInitializerGetter { @@ -228,21 +353,54 @@ Status PropagateDQForward(Graph& graph, gsl::span node_indices, ? 
dq_node.MutableInputDefs()[QDQ::InputIndex::ZERO_POINT_ID] : nullptr; - const auto edge_after_dq = GetNextEdge(graph, dq_node); - if (!edge_after_dq) { + const InlinedVector edges_after_dq = GetNextEdges(graph, dq_node); + if (edges_after_dq.size() != 1) { continue; } - for (auto curr_edge = GetNextPropagationEdge(graph, *edge_after_dq); - curr_edge.has_value(); - curr_edge = GetNextPropagationEdge(graph, *curr_edge)) { - if (const auto* dst_node = curr_edge->GetNodeAtEnd(graph, ExtendedGraphEdge::End::Destination); - dst_node && QDQ::MatchQNode(*dst_node)) { - break; + // Utility function to check if any edge out of a node (e.g., Transpose) ends in a Q node. + auto any_edge_ends_in_q = [](Graph& graph, const InlinedVector& edges) -> bool { + for (const auto& edge : edges) { + const auto* edge_dst_node = edge.GetNodeAtEnd(graph, ExtendedGraphEdge::End::Destination); + if (edge_dst_node && QDQ::MatchQNode(*edge_dst_node)) { + return true; + } + } + return false; + }; + + // Propagate DQ forward in a BFS traversal of NodeArg edges. A NodeArg "edge group" consists of one or more edges + // that all begin at the same source node's output slot and end at a graph output or a destination node. + // Ex: The subgraph below shows a NodeArg edge group (containing 3 edges) that begins at a + // Transpose, ends at two destination nodes, and produces a graph output. + // DQ -> Transpose --+--> Sigmoid -> ... + // | + // +--> Slice -> ... + // | + // +--> graph_output + std::queue> node_arg_edges; + node_arg_edges.push(GetNextPropagationEdges(graph, edges_after_dq[0])); + + while (!node_arg_edges.empty()) { + const InlinedVector curr_edge_group = std::move(node_arg_edges.front()); + node_arg_edges.pop(); + + // Skip if edge group is empty. Also, to keep things simple, we do not yet handle edge groups in which + // one of the destination nodes is already a QuantizeLinear node. Ex: + // DQ -> Transpose --+--> QuantizeLinear -> ... + // | + // +--> Slice -> ... 
+ if (curr_edge_group.empty() || any_edge_ends_in_q(graph, curr_edge_group)) { + continue; } - ORT_RETURN_IF_ERROR(InsertQDQPair(graph, *curr_edge, dq_scale, dq_zero_point, dq_node.Domain(), logger)); + ORT_RETURN_IF_ERROR(InsertQDQPairs(graph, curr_edge_group, dq_scale, dq_zero_point, dq_node.Domain(), + MakeQAttrsFromDQ(dq_node), dq_node.GetAttributes(), logger)); modified = true; + + for (const auto& edge : curr_edge_group) { + node_arg_edges.push(GetNextPropagationEdges(graph, edge)); + } } } @@ -290,7 +448,8 @@ Status PropagateQBackward(Graph& graph, gsl::span node_indices, break; } - ORT_RETURN_IF_ERROR(InsertQDQPair(graph, *curr_edge, q_scale, q_zero_point, q_node.Domain(), logger)); + ORT_RETURN_IF_ERROR(InsertQDQPairs(graph, InlinedVector{*curr_edge}, q_scale, q_zero_point, + q_node.Domain(), q_node.GetAttributes(), MakeDQAttrsFromQ(q_node), logger)); modified = true; } } diff --git a/onnxruntime/test/optimizer/graph_transform_test_builder.cc b/onnxruntime/test/optimizer/graph_transform_test_builder.cc index 2cbfbbb317642..03a71868a3dc1 100644 --- a/onnxruntime/test/optimizer/graph_transform_test_builder.cc +++ b/onnxruntime/test/optimizer/graph_transform_test_builder.cc @@ -246,14 +246,14 @@ Status TestGraphTransformer(const std::function& ORT_RETURN_IF_ERROR(pre_graph_checker(graph)); } #if SAVE_TEST_GRAPH - ORT_RETURN_IF_ERROR(Model::Save(model, "model_original.onnx")); + ORT_RETURN_IF_ERROR(Model::Save(model, ToPathString("model_original.onnx"))); #endif ORT_RETURN_IF_ERROR(graph_transformation_mgr.ApplyTransformers(graph, level, logger)); if (post_graph_checker) { ORT_RETURN_IF_ERROR(post_graph_checker(graph)); } #if SAVE_TEST_GRAPH - ORT_RETURN_IF_ERROR(Model::Save(model, "model_optimized.onnx")); + ORT_RETURN_IF_ERROR(Model::Save(model, ToPathString("model_optimized.onnx"))); #endif }; diff --git a/onnxruntime/test/optimizer/qdq_transformer_test.cc b/onnxruntime/test/optimizer/qdq_transformer_test.cc index 14c5b60d6e0bd..fb85eb4c29bb6 100644 --- a/onnxruntime/test/optimizer/qdq_transformer_test.cc +++ b/onnxruntime/test/optimizer/qdq_transformer_test.cc @@ -12,6 +12,7 @@ #include "core/mlas/inc/mlas.h" #include "core/optimizer/double_qdq_pairs_remover.h" #include "core/optimizer/qdq_transformer/qdq_final_cleanup.h" +#include "core/optimizer/qdq_transformer/qdq_propagation.h" #include "core/optimizer/qdq_transformer/selectors_actions/qdq_selectors.h" #include "core/optimizer/qdq_transformer/selectors_actions/qdq_selector_action_transformer.h" #include "core/optimizer/qdq_transformer/selectors_actions/shared/utils.h" @@ -3084,6 +3085,57 @@ TEST(QDQTransformerTests, QDQPropagation_QBackward) { #endif } +// Test backwards propagation of a QuantizeLinear node that uses the "output_dtype" attribute +// to set the quantization type (i.e., does not have an explicit zero-point input). This tests +// the copying of attributes for QDQ propagation. 
+TEST(QDQTransformerTests, QDQPropagation_QBackward_NoZP_OutputDtypeAttribute) { + auto test_case = [&](ONNX_NAMESPACE::TensorProto_DataType q_output_type) { + auto build_test_case = [&](ModelTestBuilder& builder) { + auto* input_arg = builder.MakeInput({1, 2, 2}, {-2.0f, 0.0f, 1.0f, 2.0f}); + auto* output_arg = builder.MakeOutput(); + + // add Add + auto* const_1_input = builder.MakeScalarInitializer(1.0f); + auto* add_output = builder.MakeIntermediate(); + builder.AddNode("Add", {input_arg, const_1_input}, {add_output}); + + // add Transpose + auto* transpose_output = builder.MakeIntermediate(); + builder.AddNode("Transpose", {add_output}, {transpose_output}); + + // add Q with a "output_dtype" attribute. Omit the zero-point input (defaults to 0). + constexpr float qdq_scale = 1.0f; + Node& q_node = builder.AddQuantizeLinearNode(transpose_output, qdq_scale, output_arg); + q_node.AddAttribute("output_dtype", static_cast(q_output_type)); + }; + + auto check_graph = [&](InferenceSessionWrapper& session) { + const QDQOpKeys qdq_keys = GetQDQOpKeys(false); + std::vector expected_op_types_in_order = { + "Add", + qdq_keys.quantize_linear, + qdq_keys.dequantize_linear, + "Transpose", + qdq_keys.quantize_linear, + }; + + const auto op_types_in_order = GetNodeOpTypesInTopologicalOrder(session.GetGraph(), true); + EXPECT_EQ(op_types_in_order, expected_op_types_in_order); + }; + + TransformerTester(build_test_case, + check_graph, + TransformerLevel::Default, + TransformerLevel::Level1, + 21); // Opset >= 21 supports the "output_dtype" attribute + }; + + test_case(ONNX_NAMESPACE::TensorProto_DataType_UINT8); + test_case(ONNX_NAMESPACE::TensorProto_DataType_INT8); + test_case(ONNX_NAMESPACE::TensorProto_DataType_UINT16); + test_case(ONNX_NAMESPACE::TensorProto_DataType_INT16); +} + TEST(QDQTransformerTests, QDQPropagation_DQForward) { auto test_case = [&](const std::vector& input_shape, size_t maxpool_dim, @@ -3420,6 +3472,122 @@ TEST(QDQTransformerTests, QDQPropagation_DQ_Q) { #endif } +// Test propagating a DQ forward through a chain of Slice and Transpose operators that have multiple consumers. 
+// original model: +// in0 -> DQ -> Slice --+--> slice_out +// | +// +--> Add -> out0 +// | +// +--> Transpose --+--> Pow -> out1 +// | | +// | +--> Pow -> out2 +// | +// +--> Transpose --+--> Pow -> out3 +// | +// +--> Pow -> out4 +// expected model: +// in0 -> DQ -> Slice -> Q --+--> DQ -> slice_out +// | +// +--> DQ -> Add -> out0 +// | +// +--> DQ -> TP -> Q --+--> DQ -> Pow -> out1 +// | | +// | +--> DQ -> Pow -> out2 +// | +// +--> DQ -> TP -> Q --+--> DQ -> Pow -> out3 +// | +// +--> DQ -> Pow -> out4 +TEST(QDQTransformerTests, QDQPropagation_DQForward_SliceMultipleConsumers) { + auto run_test_case = [&](bool slice_has_graph_output) { + auto build_test_case = [&](ModelTestBuilder& builder) { + std::vector input0_shape = {1, 2, 2, 2}; + std::vector input1_shape = {1, 1, 1, 1}; + auto* input0_arg = builder.MakeInput(input0_shape, + std::numeric_limits::min(), + std::numeric_limits::max()); + auto* input1_arg = builder.MakeInput(input1_shape, {0.0f}); + auto* output0_arg = builder.MakeOutput(); + auto* output1_arg = builder.MakeOutput(); + auto* output2_arg = builder.MakeOutput(); + auto* output3_arg = builder.MakeOutput(); + auto* output4_arg = builder.MakeOutput(); + + // DQ + constexpr float qdq_scale = 1.0f; + constexpr uint8_t qdq_zero_point = 128; + auto* dq_output = builder.MakeIntermediate(); + builder.AddDequantizeLinearNode(input0_arg, qdq_scale, qdq_zero_point, dq_output); + + // Slice + auto* slice_output = slice_has_graph_output ? builder.MakeOutput() : builder.MakeIntermediate(); + auto* slice_starts = builder.Make1DInitializer(std::vector{0, 0, 0, 0}); + auto* slice_ends = builder.Make1DInitializer(std::vector{1, 1, 1, 1}); + builder.AddNode("Slice", {dq_output, slice_starts, slice_ends}, {slice_output}); + + // Add + builder.AddNode("Add", {slice_output, input1_arg}, {output0_arg}); + + // Transpose + auto* transpose0_output = builder.MakeIntermediate(); + builder.AddNode("Transpose", {slice_output}, {transpose0_output}); + + // Transpose + auto* transpose1_output = builder.MakeIntermediate(); + builder.AddNode("Transpose", {slice_output}, {transpose1_output}); + + // Pows + auto* pow_exp = builder.MakeScalarInitializer(2.0f); + builder.AddNode("Pow", {transpose0_output, pow_exp}, {output1_arg}); + builder.AddNode("Pow", {transpose0_output, pow_exp}, {output2_arg}); + builder.AddNode("Pow", {transpose1_output, pow_exp}, {output3_arg}); + builder.AddNode("Pow", {transpose1_output, pow_exp}, {output4_arg}); + }; + + auto check_graph = [&](InferenceSessionWrapper& session) { + const QDQOpKeys qdq_keys = GetQDQOpKeys(false); + std::vector expected_op_types_in_order; + expected_op_types_in_order.reserve(20); + expected_op_types_in_order.insert(expected_op_types_in_order.end(), + {qdq_keys.dequantize_linear, + "Slice", + qdq_keys.quantize_linear}); + + if (slice_has_graph_output) { + // Should have a DQ before the graph output generated by the Slice. 
+ expected_op_types_in_order.push_back(qdq_keys.dequantize_linear); + } + + expected_op_types_in_order.insert(expected_op_types_in_order.end(), + {qdq_keys.dequantize_linear, + "Add", + qdq_keys.dequantize_linear, + "Transpose", + qdq_keys.quantize_linear, qdq_keys.dequantize_linear, + "Pow", + qdq_keys.dequantize_linear, + "Pow", + qdq_keys.dequantize_linear, + "Transpose", + qdq_keys.quantize_linear, qdq_keys.dequantize_linear, + "Pow", + qdq_keys.dequantize_linear, + "Pow"}); + + const auto op_types_in_order = GetNodeOpTypesInTopologicalOrder(session.GetGraph(), true); + EXPECT_EQ(op_types_in_order, expected_op_types_in_order); + }; + + TransformerTester(build_test_case, + check_graph, + TransformerLevel::Default, + TransformerLevel::Level1, + 18, 0.0, 0.0, std::make_unique()); + }; + + run_test_case(/*slice_has_graph_output*/ false); + run_test_case(/*slice_has_graph_output*/ true); +} + TEST(QDQTransformerTests, QDQ_Selector_Test) { const ORTCHAR_T* model_file_name = ORT_TSTR("testdata/transform/qdq_conv.onnx"); From 08001d18ac41ee2fe95ce9d4d064c2fb725e583f Mon Sep 17 00:00:00 2001 From: pengwa Date: Thu, 25 Jul 2024 08:25:22 +0800 Subject: [PATCH 13/57] Fix security issue #22016 #22017 #22018 (#21333) ### Description ### Motivation and Context --- .../memory_optimizer/recompute_analysis.cc | 814 +++++++++--------- .../training_api/core/training_api_tests.cc | 3 +- .../orttraining/training_api/checkpoint.cc | 2 +- 3 files changed, 416 insertions(+), 403 deletions(-) diff --git a/orttraining/orttraining/core/optimizer/memory_optimizer/recompute_analysis.cc b/orttraining/orttraining/core/optimizer/memory_optimizer/recompute_analysis.cc index 8d110c692751e..1135ef41cfc47 100644 --- a/orttraining/orttraining/core/optimizer/memory_optimizer/recompute_analysis.cc +++ b/orttraining/orttraining/core/optimizer/memory_optimizer/recompute_analysis.cc @@ -67,410 +67,422 @@ using OpsetToIgnorableIndicesMap = InlinedHashMap; * or not. * 3. Some ops are not supported in older opsets, we need to check whether it is applicable to recompute or not. 
*/ -const InlinedHashMap& GetAllowedRecomputeOps(int probe_op_level) { - static InlinedHashMap> recomputable_op_table_map; - if (recomputable_op_table_map.find(probe_op_level) != recomputable_op_table_map.end()) { - return recomputable_op_table_map.at(probe_op_level); - } +InlinedHashMap> InitializeRecomputableOpTable() { + InlinedHashMap> recomputable_op_table_map; + + constexpr const int basic_op_level = static_cast(ProbeLevel::Basic); + recomputable_op_table_map.insert({basic_op_level, InlinedHashMap()}); + auto& basic_recomputable_op_table = recomputable_op_table_map.at(basic_op_level); + + basic_recomputable_op_table.insert({ + { + utils::GetFullQualifiedOpName("Add", kOnnxDomain), + { + {1, {}}, + {6, {}}, + {7, {}}, + {13, {}}, + {14, {}}, + }, + }, + { + utils::GetFullQualifiedOpName("BatchNormalization", kOnnxDomain), + { + {1, {}}, + {6, {}}, + {7, {}}, + {9, {}}, + {14, {}}, + {15, {}}, + }, + }, + { + utils::GetFullQualifiedOpName("BiasGelu", kMSDomain), + { + {1, {}}, + }, + }, + { + utils::GetFullQualifiedOpName("BiasDropout", kMSDomain), + { + {1, {3, 4}}, // ignore ratio (optional) and training mode (optional) + }, + }, + { + utils::GetFullQualifiedOpName("BitmaskBiasDropout", kMSDomain), + { + {1, {3, 4}}, // ignore ratio (optional) and training mode (optional) + }, + }, + { + utils::GetFullQualifiedOpName("BitmaskDropout", kMSDomain), + { + {1, {1, 2}}, // ignore ratio (optional) and training mode (optional) + }, + }, + { + utils::GetFullQualifiedOpName("Cast", kOnnxDomain), + { + {1, {}}, + {6, {}}, + {9, {}}, + {13, {}}, + {19, {}}, + }, + }, + { + utils::GetFullQualifiedOpName("ConcatTraining", kMSDomain), + { + {1, {}}, + + }, + }, + { + utils::GetFullQualifiedOpName("ConstantOfShape", kOnnxDomain), + { + {9, {0}}, // ignore the `input`, e.g. the shape of the expected output tensor + {20, {0}}, + }, + }, + { + utils::GetFullQualifiedOpName("Cos", kOnnxDomain), + { + {7, {}}, + }, + }, + { + utils::GetFullQualifiedOpName("CumSum", kOnnxDomain), + { + // The axis input is trivial + {11, {1}}, + {14, {1}}, + }, + }, + { + utils::GetFullQualifiedOpName("Dropout", kOnnxDomain), + { + // ONNX Dropout 1, 6, 7, 10 do not have seed attribute, so we remove them from the recompute support. + {12, {1, 2}}, // ignore ratio and training_mode + {13, {1, 2}}, + }, + }, + { + utils::GetFullQualifiedOpName("Div", kOnnxDomain), + { + {1, {}}, + {6, {}}, + {7, {}}, + {13, {}}, + {14, {}}, + }, + }, + { + utils::GetFullQualifiedOpName("Einsum", kOnnxDomain), + { + {12, {}}, + }, + }, + { + utils::GetFullQualifiedOpName("Equal", kOnnxDomain), + { + {1, {}}, + {7, {}}, + {11, {}}, + {13, {}}, + {19, {}}, + }, + }, + { + utils::GetFullQualifiedOpName("Expand", kOnnxDomain), + { + {8, {1}}, // Ignore the shape. 
+ {13, {1}}, + }, + }, + { + utils::GetFullQualifiedOpName("FastGelu", kMSDomain), + { + {1, {}}, + }, + }, + { + utils::GetFullQualifiedOpName("FlattenAndUnpad", kMSDomain), + { + {1, {1}}, // ignore the indices + }, + }, + { + utils::GetFullQualifiedOpName("Gather", kOnnxDomain), + { + {1, {1}}, // ignore the indices + {11, {1}}, + {13, {1}}, + }, + }, + { + utils::GetFullQualifiedOpName("Gelu", kOnnxDomain), + { + {20, {}}, + }, + }, + { + utils::GetFullQualifiedOpName("Gelu", kMSDomain), + { + {1, {}}, + }, + }, + { + utils::GetFullQualifiedOpName("Gemm", kOnnxDomain), + { + {1, {}}, + {6, {}}, + {7, {}}, + {9, {}}, + {11, {}}, + {13, {}}, + }, + }, + { + utils::GetFullQualifiedOpName("Less", kOnnxDomain), + { + {1, {}}, + {7, {}}, + {9, {}}, + {13, {}}, + }, + }, + { + utils::GetFullQualifiedOpName("MemcpyFromHost", kOnnxDomain), + { + {1, {0}}, // Ignore CPU input. + }, + }, + { + utils::GetFullQualifiedOpName("Mul", kOnnxDomain), + { + {1, {}}, + {6, {}}, + {7, {}}, + {13, {}}, + {14, {}}, + }, + }, + { + utils::GetFullQualifiedOpName("Neg", kOnnxDomain), + { + {1, {}}, + {6, {}}, + {13, {}}, + }, + }, + { + utils::GetFullQualifiedOpName("NonZero", kOnnxDomain), + { + {9, {}}, + {13, {}}, + }, + }, + { + utils::GetFullQualifiedOpName("PadAndUnflatten", kMSDomain), + { + {1, {1, 2}}, // ignore the indices and unflatten_dims + }, + }, + { + // Be noted, NOT all PythonOp will be allowed to recompute, there will be further check. + utils::GetFullQualifiedOpName("PythonOp", kMSDomain), + { + {1, {}}, + }, + }, + { + utils::GetFullQualifiedOpName("Range", kOnnxDomain), + { + {11, {0, 1, 2}}, // ignore start, end, delta, because they are scalars. + }, + }, + { + utils::GetFullQualifiedOpName("Reshape", kOnnxDomain), + { + {1, {}}, + {5, {}}, // ignore the shape. 
+ {13, {}}, + {14, {}}, + {19, {}}, + }, + }, + { + utils::GetFullQualifiedOpName("Sin", kOnnxDomain), + { + {7, {}}, + }, + }, + { + utils::GetFullQualifiedOpName("Slice", kOnnxDomain), + { + {1, {}}, + {10, {1, 2, 3, 4}}, // ignore starts, ends, axes (optional) and steps (optional) + {11, {1, 2, 3, 4}}, + {13, {1, 2, 3, 4}}, + }, + }, + { + utils::GetFullQualifiedOpName("Split", kOnnxDomain), + { + {1, {1}}, // ignore split (optional) + {2, {}}, + {11, {}}, + {13, {1}}, // ignore the split (optional) + {18, {1}}, + }, + }, + { + utils::GetFullQualifiedOpName("Squeeze", kOnnxDomain), + { + {1, {}}, + {11, {}}, + {13, {1}}, // ignore the axes (optional) + }, + }, + { + utils::GetFullQualifiedOpName("Sub", kOnnxDomain), + { + {1, {}}, + {6, {}}, + {7, {}}, + {13, {}}, + {14, {}}, + }, + }, + { + utils::GetFullQualifiedOpName("Tile", kOnnxDomain), + { + {1, {1, 2}}, + {6, {1}}, + {13, {1}}, + }, + }, + { + utils::GetFullQualifiedOpName("Transpose", kOnnxDomain), + { + {1, {}}, + {13, {}}, + }, + }, + { + utils::GetFullQualifiedOpName("Trilu", kOnnxDomain), + { + {14, {1}}, // ignore k (optional) + }, + }, + { + utils::GetFullQualifiedOpName("QuickGelu", kMSDomain), + { + {1, {}}, + }, + }, + { + utils::GetFullQualifiedOpName("Unsqueeze", kOnnxDomain), + { + {1, {}}, + {11, {}}, + {13, {1}}, // ignore the axes (optional) + }, + }, + { + utils::GetFullQualifiedOpName("Where", kOnnxDomain), + { + {9, {}}, + {16, {}}, + }, + }, + + }); + + constexpr const int advanced_op_level = static_cast(ProbeLevel::Advanced); + recomputable_op_table_map.insert({advanced_op_level, InlinedHashMap()}); + auto& advanced_recomputable_op_table = recomputable_op_table_map.at(advanced_op_level); + // Append basic_recomputable_op_table to advanced_recomputable_op_table. + advanced_recomputable_op_table.insert(recomputable_op_table_map.at(basic_op_level).begin(), + recomputable_op_table_map.at(basic_op_level).end()); + + advanced_recomputable_op_table.insert({ + { + utils::GetFullQualifiedOpName("BiasSoftmax", kMSDomain), + { + {1, {}}, + }, + }, + { + utils::GetFullQualifiedOpName("BiasSoftmaxDropout", kMSDomain), + { + {1, {2}}, // ignore ratio (optional) + }, + }, + { + utils::GetFullQualifiedOpName("LayerNormalization", kOnnxDomain), + { + // Opset 1 in ONNX official does not have LayerNormalization, + // while our contrib op defined LayerNormalization in opset 1 in ONNX domain. + {1, {}}, + {17, {}}, + }, + }, + { + utils::GetFullQualifiedOpName("MatMul", kOnnxDomain), + { + {1, {}}, + {9, {}}, + {13, {}}, + }, + }, + { + utils::GetFullQualifiedOpName("FusedMatMul", kMSDomain), + { + {1, {}}, + }, + }, + { + utils::GetFullQualifiedOpName("SimplifiedLayerNormalization", kOnnxDomain), + { + // Opset 1 in ONNX official does not have SimplifiedLayerNormalization, + // while our contrib op defined SimplifiedLayerNormalization in opset 1 in ONNX domain. 
+ {1, {}}, + }, + }, + { + utils::GetFullQualifiedOpName("SkipLayerNormalization", kMSDomain), + { + {1, {}}, + }, + }, + { + utils::GetFullQualifiedOpName("SkipSimplifiedLayerNormalization", kMSDomain), + { + {1, {}}, + }, + }, + { + utils::GetFullQualifiedOpName("Softmax", kOnnxDomain), + { + {1, {}}, + {11, {}}, + {13, {}}, + }, + }, + }); + + return recomputable_op_table_map; +} - recomputable_op_table_map.insert({probe_op_level, InlinedHashMap()}); - auto& recomputable_op_table = recomputable_op_table_map.at(probe_op_level); - if (probe_op_level >= static_cast(ProbeLevel::Basic)) { - recomputable_op_table.insert({ - { - utils::GetFullQualifiedOpName("Add", kOnnxDomain), - { - {1, {}}, - {6, {}}, - {7, {}}, - {13, {}}, - {14, {}}, - }, - }, - { - utils::GetFullQualifiedOpName("BatchNormalization", kOnnxDomain), - { - {1, {}}, - {6, {}}, - {7, {}}, - {9, {}}, - {14, {}}, - {15, {}}, - }, - }, - { - utils::GetFullQualifiedOpName("BiasGelu", kMSDomain), - { - {1, {}}, - }, - }, - { - utils::GetFullQualifiedOpName("BiasDropout", kMSDomain), - { - {1, {3, 4}}, // ignore ratio (optional) and training mode (optional) - }, - }, - { - utils::GetFullQualifiedOpName("BitmaskBiasDropout", kMSDomain), - { - {1, {3, 4}}, // ignore ratio (optional) and training mode (optional) - }, - }, - { - utils::GetFullQualifiedOpName("BitmaskDropout", kMSDomain), - { - {1, {1, 2}}, // ignore ratio (optional) and training mode (optional) - }, - }, - { - utils::GetFullQualifiedOpName("Cast", kOnnxDomain), - { - {1, {}}, - {6, {}}, - {9, {}}, - {13, {}}, - {19, {}}, - }, - }, - { - utils::GetFullQualifiedOpName("ConcatTraining", kMSDomain), - { - {1, {}}, - - }, - }, - { - utils::GetFullQualifiedOpName("ConstantOfShape", kOnnxDomain), - { - {9, {0}}, // ignore the `input`, e.g. the shape of the expected output tensor - {20, {0}}, - }, - }, - { - utils::GetFullQualifiedOpName("Cos", kOnnxDomain), - { - {7, {}}, - }, - }, - { - utils::GetFullQualifiedOpName("CumSum", kOnnxDomain), - { - // The axis input is trivial - {11, {1}}, - {14, {1}}, - }, - }, - { - utils::GetFullQualifiedOpName("Dropout", kOnnxDomain), - { - // ONNX Dropout 1, 6, 7, 10 do not have seed attribute, so we remove them from the recompute support. - {12, {1, 2}}, // ignore ratio and training_mode - {13, {1, 2}}, - }, - }, - { - utils::GetFullQualifiedOpName("Div", kOnnxDomain), - { - {1, {}}, - {6, {}}, - {7, {}}, - {13, {}}, - {14, {}}, - }, - }, - { - utils::GetFullQualifiedOpName("Einsum", kOnnxDomain), - { - {12, {}}, - }, - }, - { - utils::GetFullQualifiedOpName("Equal", kOnnxDomain), - { - {1, {}}, - {7, {}}, - {11, {}}, - {13, {}}, - {19, {}}, - }, - }, - { - utils::GetFullQualifiedOpName("Expand", kOnnxDomain), - { - {8, {1}}, // Ignore the shape. 
- {13, {1}}, - }, - }, - { - utils::GetFullQualifiedOpName("FastGelu", kMSDomain), - { - {1, {}}, - }, - }, - { - utils::GetFullQualifiedOpName("FlattenAndUnpad", kMSDomain), - { - {1, {1}}, // ignore the indices - }, - }, - { - utils::GetFullQualifiedOpName("Gather", kOnnxDomain), - { - {1, {1}}, // ignore the indices - {11, {1}}, - {13, {1}}, - }, - }, - { - utils::GetFullQualifiedOpName("Gelu", kOnnxDomain), - { - {20, {}}, - }, - }, - { - utils::GetFullQualifiedOpName("Gelu", kMSDomain), - { - {1, {}}, - }, - }, - { - utils::GetFullQualifiedOpName("Gemm", kOnnxDomain), - { - {1, {}}, - {6, {}}, - {7, {}}, - {9, {}}, - {11, {}}, - {13, {}}, - }, - }, - { - utils::GetFullQualifiedOpName("Less", kOnnxDomain), - { - {1, {}}, - {7, {}}, - {9, {}}, - {13, {}}, - }, - }, - { - utils::GetFullQualifiedOpName("MemcpyFromHost", kOnnxDomain), - { - {1, {0}}, // Ignore CPU input. - }, - }, - { - utils::GetFullQualifiedOpName("Mul", kOnnxDomain), - { - {1, {}}, - {6, {}}, - {7, {}}, - {13, {}}, - {14, {}}, - }, - }, - { - utils::GetFullQualifiedOpName("Neg", kOnnxDomain), - { - {1, {}}, - {6, {}}, - {13, {}}, - }, - }, - { - utils::GetFullQualifiedOpName("NonZero", kOnnxDomain), - { - {9, {}}, - {13, {}}, - }, - }, - { - utils::GetFullQualifiedOpName("PadAndUnflatten", kMSDomain), - { - {1, {1, 2}}, // ignore the indices and unflatten_dims - }, - }, - { - // Be noted, NOT all PythonOp will be allowed to recompute, there will be further check. - utils::GetFullQualifiedOpName("PythonOp", kMSDomain), - { - {1, {}}, - }, - }, - { - utils::GetFullQualifiedOpName("Range", kOnnxDomain), - { - {11, {0, 1, 2}}, // ignore start, end, delta, because they are scalars. - }, - }, - { - utils::GetFullQualifiedOpName("Reshape", kOnnxDomain), - { - {1, {}}, - {5, {}}, // ignore the shape. 
- {13, {}}, - {14, {}}, - {19, {}}, - }, - }, - { - utils::GetFullQualifiedOpName("Sin", kOnnxDomain), - { - {7, {}}, - }, - }, - { - utils::GetFullQualifiedOpName("Slice", kOnnxDomain), - { - {1, {}}, - {10, {1, 2, 3, 4}}, // ignore starts, ends, axes (optional) and steps (optional) - {11, {1, 2, 3, 4}}, - {13, {1, 2, 3, 4}}, - }, - }, - { - utils::GetFullQualifiedOpName("Split", kOnnxDomain), - { - {1, {1}}, // ignore split (optional) - {2, {}}, - {11, {}}, - {13, {1}}, // ignore the split (optional) - {18, {1}}, - }, - }, - { - utils::GetFullQualifiedOpName("Squeeze", kOnnxDomain), - { - {1, {}}, - {11, {}}, - {13, {1}}, // ignore the axes (optional) - }, - }, - { - utils::GetFullQualifiedOpName("Sub", kOnnxDomain), - { - {1, {}}, - {6, {}}, - {7, {}}, - {13, {}}, - {14, {}}, - }, - }, - { - utils::GetFullQualifiedOpName("Tile", kOnnxDomain), - { - {1, {1, 2}}, - {6, {1}}, - {13, {1}}, - }, - }, - { - utils::GetFullQualifiedOpName("Transpose", kOnnxDomain), - { - {1, {}}, - {13, {}}, - }, - }, - { - utils::GetFullQualifiedOpName("Trilu", kOnnxDomain), - { - {14, {1}}, // ignore k (optional) - }, - }, - { - utils::GetFullQualifiedOpName("QuickGelu", kMSDomain), - { - {1, {}}, - }, - }, - { - utils::GetFullQualifiedOpName("Unsqueeze", kOnnxDomain), - { - {1, {}}, - {11, {}}, - {13, {1}}, // ignore the axes (optional) - }, - }, - { - utils::GetFullQualifiedOpName("Where", kOnnxDomain), - { - {9, {}}, - {16, {}}, - }, - }, - - }); - } +const InlinedHashMap& GetAllowedRecomputeOps(int probe_op_level) { + static InlinedHashMap> + recomputable_op_table_map = InitializeRecomputableOpTable(); - if (probe_op_level >= static_cast(ProbeLevel::Advanced)) { - recomputable_op_table.insert({ - { - utils::GetFullQualifiedOpName("BiasSoftmax", kMSDomain), - { - {1, {}}, - }, - }, - { - utils::GetFullQualifiedOpName("BiasSoftmaxDropout", kMSDomain), - { - {1, {2}}, // ignore ratio (optional) - }, - }, - { - utils::GetFullQualifiedOpName("LayerNormalization", kOnnxDomain), - { - // Opset 1 in ONNX official does not have LayerNormalization, - // while our contrib op defined LayerNormalization in opset 1 in ONNX domain. - {1, {}}, - {17, {}}, - }, - }, - { - utils::GetFullQualifiedOpName("MatMul", kOnnxDomain), - { - {1, {}}, - {9, {}}, - {13, {}}, - }, - }, - { - utils::GetFullQualifiedOpName("FusedMatMul", kMSDomain), - { - {1, {}}, - }, - }, - { - utils::GetFullQualifiedOpName("SimplifiedLayerNormalization", kOnnxDomain), - { - // Opset 1 in ONNX official does not have SimplifiedLayerNormalization, - // while our contrib op defined SimplifiedLayerNormalization in opset 1 in ONNX domain. 
- {1, {}}, - }, - }, - { - utils::GetFullQualifiedOpName("SkipLayerNormalization", kMSDomain), - { - {1, {}}, - }, - }, - { - utils::GetFullQualifiedOpName("SkipSimplifiedLayerNormalization", kMSDomain), - { - {1, {}}, - }, - }, - { - utils::GetFullQualifiedOpName("Softmax", kOnnxDomain), - { - {1, {}}, - {11, {}}, - {13, {}}, - }, - }, - }); - } + ORT_ENFORCE(recomputable_op_table_map.find(probe_op_level) != recomputable_op_table_map.end(), + "Cannot get recomputable op table, probe level: ", probe_op_level); - return recomputable_op_table; + return recomputable_op_table_map.at(probe_op_level); } /** diff --git a/orttraining/orttraining/test/training_api/core/training_api_tests.cc b/orttraining/orttraining/test/training_api/core/training_api_tests.cc index 90c97eed0c6d3..be25eefb201da 100644 --- a/orttraining/orttraining/test/training_api/core/training_api_tests.cc +++ b/orttraining/orttraining/test/training_api/core/training_api_tests.cc @@ -542,8 +542,9 @@ TEST(TrainingApiTest, OptimStep) { std::string param_name = "fc2.weight"; // before training, check if optim state is initialized to 0 onnxruntime::training::api::OptimizerCheckpointState& optimizer_states = state.optimizer_checkpoint_state; + std::shared_ptr group0_states = optimizer_states.group_named_optimizer_states["group0"]; onnxruntime::training::api::ParameterOptimizerState& param_state = - optimizer_states.group_named_optimizer_states["group0"]->param_named_optimizer_states.at(param_name); + group0_states->param_named_optimizer_states.at(param_name); OrtValue& moment_1 = param_state.at("momentum0"); std::vector param_vec_before_optimizer_step; diff --git a/orttraining/orttraining/training_api/checkpoint.cc b/orttraining/orttraining/training_api/checkpoint.cc index 56029b34c24d7..cbff1891b8c84 100644 --- a/orttraining/orttraining/training_api/checkpoint.cc +++ b/orttraining/orttraining/training_api/checkpoint.cc @@ -449,7 +449,7 @@ Status FromOptimizerState(const OptimizerCheckpointState& optimizer_state, fbs_optimizer_groups.reserve(optimizer_state.group_named_optimizer_states.size()); for (const auto& group_name : SortedKeys(optimizer_state.group_named_optimizer_states)) { - const std::shared_ptr& group_optimizer_state_ptr = + std::shared_ptr group_optimizer_state_ptr = optimizer_state.group_named_optimizer_states.at(group_name); std::vector> optimizer_states; From ae3ec2e9ac1f1a1dde23407051d409fc8b52e639 Mon Sep 17 00:00:00 2001 From: Justin Chu Date: Wed, 24 Jul 2024 17:48:22 -0700 Subject: [PATCH 14/57] Ignore ruff rule `N813` (#21477) Allow importing camelcase names in lowercase --- .../test/python/onnxruntime_test_python_backend_mlops.py | 3 +-- pyproject.toml | 1 + 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/onnxruntime/test/python/onnxruntime_test_python_backend_mlops.py b/onnxruntime/test/python/onnxruntime_test_python_backend_mlops.py index b5400b487cfc2..c245699e211d4 100644 --- a/onnxruntime/test/python/onnxruntime_test_python_backend_mlops.py +++ b/onnxruntime/test/python/onnxruntime_test_python_backend_mlops.py @@ -1,7 +1,6 @@ # Copyright (c) Microsoft Corporation. All rights reserved. # Licensed under the MIT License. 
-# -*- coding: UTF-8 -*- import unittest import numpy as np @@ -10,7 +9,7 @@ import onnxruntime.backend as backend from onnxruntime import datasets -from onnxruntime.backend.backend import OnnxRuntimeBackend as ort_backend # noqa: N813 +from onnxruntime.backend.backend import OnnxRuntimeBackend as ort_backend def check_list_of_map_to_float(testcase, expected_rows, actual_rows): diff --git a/pyproject.toml b/pyproject.toml index 1c3a719fb544a..6429df2722b2d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -77,6 +77,7 @@ ignore = [ "G004", # FIXME: Enable when the rule can be autofixed "N803", # Argument casing "N812", # Allow import torch.nn.functional as F + "N813", # Allow importing camelcase names in lowercase "N999", # Module names "NPY002", # np.random.Generator may not always fit our use cases "PERF203", # "try-except-in-loop" only affects Python <3.11, and the improvement is minor; can have false positives From ca47f0fdd33ab267d8066edd6441ac4090bbe4aa Mon Sep 17 00:00:00 2001 From: Preetha Veeramalai Date: Wed, 24 Jul 2024 23:45:31 -0700 Subject: [PATCH 15/57] OVEP - PR 1.19 (#21443) ### Description Add OVEP features for 1.19 The PR has, - Added support for EpCtx with ORT Session options for optimized performance. - Added bug fixes - Support for OV 2024.3 --------- Co-authored-by: ubuntu Co-authored-by: vthaniel Co-authored-by: sfatimar Co-authored-by: saurabhkale17 Co-authored-by: Maheshkar --- cmake/onnxruntime_providers_openvino.cmake | 4 +- docs/python/ReadMeOV.rst | 8 ++- .../providers/openvino/backend_manager.cc | 45 ++++++++++----- .../openvino/backends/basic_backend.cc | 28 ++++++---- .../openvino/backends/basic_backend.h | 2 +- .../openvino/onnx_ctx_model_helper.cc | 14 ++--- .../openvino/onnx_ctx_model_helper.h | 3 +- .../openvino/openvino_execution_provider.cc | 3 +- .../openvino/openvino_execution_provider.h | 55 ++++++++++++++----- .../openvino/openvino_provider_factory.cc | 50 ++++++++++++++--- .../core/providers/openvino/ov_interface.cc | 43 +++++++-------- .../core/providers/openvino/ov_interface.h | 21 ++++--- .../openvino/ov_versions/capability.cc | 14 ++--- .../openvino/ov_versions/data_ops.cc | 10 ++-- .../providers/openvino/ov_versions/data_ops.h | 4 +- .../qdq_transformations/qdq_stripping.cc | 36 +++++++++--- .../core/session/provider_bridge_ort.cc | 21 ++++++- .../test/perftest/command_args_parser.cc | 1 - onnxruntime/test/perftest/ort_test_session.cc | 4 ++ onnxruntime/test/providers/checkers.cc | 20 ++++++- .../providers/cpu/rnn/deep_cpu_gru_op_test.cc | 6 +- 21 files changed, 271 insertions(+), 121 deletions(-) diff --git a/cmake/onnxruntime_providers_openvino.cmake b/cmake/onnxruntime_providers_openvino.cmake index d738e29101cfe..5d1a481d40abc 100644 --- a/cmake/onnxruntime_providers_openvino.cmake +++ b/cmake/onnxruntime_providers_openvino.cmake @@ -17,8 +17,8 @@ # Header paths find_package(OpenVINO REQUIRED COMPONENTS Runtime ONNX) - if(OpenVINO_VERSION VERSION_LESS 2023.0) - message(FATAL_ERROR "OpenVINO 2023.0 and newer are supported. Please, latest OpenVINO release") + if(OpenVINO_VERSION VERSION_LESS 2024.0) + message(FATAL_ERROR "OpenVINO 2024.0 and newer are supported. 
Please, use latest OpenVINO release") endif() if (WIN32) diff --git a/docs/python/ReadMeOV.rst b/docs/python/ReadMeOV.rst index 6ef16e1378139..86914699bbf6d 100644 --- a/docs/python/ReadMeOV.rst +++ b/docs/python/ReadMeOV.rst @@ -7,6 +7,7 @@ OpenVINO™ Execution Provider for ONNX Runtime accelerates inference across man - Intel® CPUs - Intel® integrated GPUs - Intel® discrete GPUs + - Intel® integrated NPUs (Windows only) Installation ------------ @@ -15,26 +16,27 @@ Requirements ^^^^^^^^^^^^ - Ubuntu 18.04, 20.04, RHEL(CPU only) or Windows 10 - 64 bit -- Python 3.8 or 3.9 or 3.10 for Linux and only Python3.10 for Windows +- Python 3.9 or 3.10 or 3.11 for Linux and Python 3.10, 3.11 for Windows This package supports: - Intel® CPUs - Intel® integrated GPUs - Intel® discrete GPUs + - Intel® integrated NPUs (Windows only) ``pip3 install onnxruntime-openvino`` Please install OpenVINO™ PyPi Package separately for Windows. For installation instructions on Windows please refer to `OpenVINO™ Execution Provider for ONNX Runtime for Windows `_. -**OpenVINO™ Execution Provider for ONNX Runtime** Linux Wheels comes with pre-built libraries of OpenVINO™ version 2023.0.0 eliminating the need to install OpenVINO™ separately. The OpenVINO™ libraries are prebuilt with CXX11_ABI flag set to 0. +**OpenVINO™ Execution Provider for ONNX Runtime** Linux Wheels comes with pre-built libraries of OpenVINO™ version 2024.1.0 eliminating the need to install OpenVINO™ separately. For more details on build and installation please refer to `Build `_. Usage ^^^^^ -By default, Intel® CPU is used to run inference. However, you can change the default option to either Intel® integrated or discrete GPU. +By default, Intel® CPU is used to run inference. However, you can change the default option to either Intel® integrated GPU, discrete GPU, integrated NPU (Windows only). Invoke `the provider config device type argument `_ to change the hardware on which inferencing is done. For more API calls and environment variables, see `Usage `_. diff --git a/onnxruntime/core/providers/openvino/backend_manager.cc b/onnxruntime/core/providers/openvino/backend_manager.cc index 1c027e39fa5f5..8f3658df0d09d 100644 --- a/onnxruntime/core/providers/openvino/backend_manager.cc +++ b/onnxruntime/core/providers/openvino/backend_manager.cc @@ -28,9 +28,8 @@ BackendManager::BackendManager(const GlobalContext& global_context, const onnxruntime::Node& fused_node, const onnxruntime::GraphViewer& subgraph, const logging::Logger& logger, - EPCtxHandler& ctx_handle) { + EPCtxHandler& ep_ctx_handle_) { global_context_ = global_context; - ep_ctx_handle_ = ctx_handle; openvino_sdk_version_ = std::to_string(global_context_.OpenVINO_Version.at(0)) + "." 
+ std::to_string(global_context_.OpenVINO_Version.at(1)); @@ -147,13 +146,20 @@ Status BackendManager::ExportCompiledBlobAsEPCtxNode(const onnxruntime::GraphVie std::string model_blob_str; auto compiled_model = concrete_backend_->GetOVCompiledModel(); - auto graph_name = global_context_.onnx_model_path_name; - // Remove extension so we can append suffix to form the complete name of output graph - graph_name = [&]() { - size_t dot = graph_name.find_last_of("."); - if (dot == std::string::npos) return graph_name; - return graph_name.substr(0, dot); - }(); + std::string graph_name = ""; + // Epctx file path from SO is mapped to cache_dir variable for OVEP for readability + if (global_context_.cache_dir != "") { + graph_name = global_context_.cache_dir; + } else { + graph_name = global_context_.onnx_model_path_name; + // Remove extension so we can append suffix to form the complete name of output graph + graph_name = [&]() { + size_t dot = graph_name.find_last_of("."); + if (dot == std::string::npos) return graph_name; + return graph_name.substr(0, dot); + }(); + graph_name = graph_name + "-ov_" + GetGlobalContext().device_type + "_blob.onnx"; + } // If embed_mode, then pass on the serialized blob // If not embed_mode, dump the blob here and only pass on the path to the blob if (global_context_.ep_context_embed_mode) { @@ -162,9 +168,19 @@ Status BackendManager::ExportCompiledBlobAsEPCtxNode(const onnxruntime::GraphVie model_blob_str = model_blob_stream.str(); ORT_ENFORCE(model_blob_str.size() != 0); } else { - std::ofstream f(graph_name + ".blob", std::ios::out | std::ios::trunc | std::ios::binary); - compiled_model.export_model(f); - model_blob_str = graph_name + ".blob"; + // Remove extension so we can append suffix to form the complete name of output graph + auto blob_name = [&]() { + size_t dot = graph_name.find_last_of("."); + if (dot == std::string::npos) return graph_name; + return graph_name.substr(0, dot); + }(); + std::ofstream blob_file(blob_name + ".blob", + std::ios::out | std::ios::trunc | std::ios::binary); + if (!blob_file) { + ORT_THROW("Unable to open file for epctx model dump."); + } + compiled_model.export_model(blob_file); + model_blob_str = blob_name + ".blob"; } ORT_RETURN_IF_ERROR(ep_ctx_handle_.ExportEPCtxModel(graph_body_viewer, @@ -172,8 +188,7 @@ Status BackendManager::ExportCompiledBlobAsEPCtxNode(const onnxruntime::GraphVie logger, global_context_.ep_context_embed_mode, model_blob_str, - openvino_sdk_version_, - GetGlobalContext().device_type)); + openvino_sdk_version_)); return Status::OK(); } @@ -248,7 +263,7 @@ static void DumpOpenVINOEPModel(std::string onnx_model_path_name, ONNX_NAMESPACE::ModelProto* model_proto, const onnxruntime::Node& fused_node) { if (openvino_ep::backend_utils::IsDebugEnabled()) { - auto model_name = onnx_model_path_name.empty() ? "unknown.onnx" : onnx_model_path_name; + auto model_name = onnx_model_path_name.empty() ? 
"unknown.onnx" : std::move(onnx_model_path_name); #ifdef _WIN32 size_t slash = model_name.find_last_of("\\"); #else diff --git a/onnxruntime/core/providers/openvino/backends/basic_backend.cc b/onnxruntime/core/providers/openvino/backends/basic_backend.cc index f8046bcb3a06f..d79aa35be6418 100644 --- a/onnxruntime/core/providers/openvino/backends/basic_backend.cc +++ b/onnxruntime/core/providers/openvino/backends/basic_backend.cc @@ -37,7 +37,7 @@ BasicBackend::BasicBackend(const ONNX_NAMESPACE::ModelProto& model_proto, PopulateConfigValue(device_config); // Enable caching - EnableCaching(); + EnableCaching(device_config); // Setting OpenCL queue throttling for GPU EnableGPUThrottling(device_config); @@ -82,26 +82,28 @@ BasicBackend::BasicBackend(const ONNX_NAMESPACE::ModelProto& model_proto, ie_cnn_network_, hw_target, device_config, subgraph_context_.subgraph_name); } #else // !IO_BUFFER_ENABLED + std::string prec_str = (global_context_.precision_str != "ACCURACY") ? global_context_.precision_str : global_context_.model_precision; if (is_ep_ctx_graph_) { // If the blob is held in an EPContext node, then skip FE+Compile // and directly move on to creating a backend with the executable blob exe_network_ = global_context_.ie_core.ImportModel(ep_ctx_handle.GetModelBlobStream(), hw_target, device_config, + global_context_.ep_context_embed_mode, subgraph_context_.subgraph_name); ie_cnn_network_ = exe_network_.Get().get_runtime_model(); - } else if (!subgraph_context_.has_dynamic_input_shape) { + } else if ((!subgraph_context_.has_dynamic_input_shape) && + ((hw_target.find("AUTO") == std::string::npos) || + (global_context_.OpenVINO_Version.at(0) >= 2024 && global_context_.OpenVINO_Version.at(1) > 2))) { + // Optimized OV compile_model API is supported with AUTO from version 2024.3 and above // Inputs with static dimenstions - std::string prec_str = (global_context_.precision_str != "ACCURACY") ? 
global_context_.precision_str : global_context_.model_precision; const std::string model = model_proto.SerializeAsString(); exe_network_ = global_context_.ie_core.CompileModel(model, hw_target, - prec_str, - global_context_.cache_dir, device_config, subgraph_context_.subgraph_name); ie_cnn_network_ = exe_network_.Get().get_runtime_model(); - } else { // Inputs with dynamic dimensions + } else { // For all other types use ov::Model Type ie_cnn_network_ = CreateOVModel(model_proto, global_context_, const_outputs_map_); exe_network_ = global_context_.ie_core.CompileModel( ie_cnn_network_, hw_target, device_config, subgraph_context_.subgraph_name); @@ -173,13 +175,19 @@ void BasicBackend::PopulateConfigValue(ov::AnyMap& device_config) { } } -void BasicBackend::EnableCaching() { +void BasicBackend::EnableCaching(ov::AnyMap& device_config) { // cache_dir argument has no effect when working with an embed-mode EPContext Graph if (is_ep_ctx_graph_) return; - if (!global_context_.cache_dir.empty()) { + if (!global_context_.cache_dir.empty() && !global_context_.export_ep_ctx_blob) { LOGS_DEFAULT(INFO) << log_tag << "Enables Caching"; - global_context_.ie_core.SetCache(global_context_.cache_dir, global_context_.device_type); + if (global_context_.device_type.find("AUTO:GPU") != std::string::npos) { + std::pair device_property; + device_property = std::make_pair("CACHE_DIR", global_context_.cache_dir); + device_config.emplace(ov::device::properties("GPU", device_property)); + } else { + global_context_.ie_core.SetCache(global_context_.cache_dir); + } } } @@ -274,7 +282,7 @@ void BasicBackend::StartAsyncInference(Ort::KernelContext& context, OVInferReque } try { - infer_request->SetTensor(input_name, tensor_ptr); + infer_request->SetTensor(std::move(input_name), tensor_ptr); } catch (const char* msg) { ORT_THROW(msg); } diff --git a/onnxruntime/core/providers/openvino/backends/basic_backend.h b/onnxruntime/core/providers/openvino/backends/basic_backend.h index 5565223f067b8..bcd3161590ba0 100644 --- a/onnxruntime/core/providers/openvino/backends/basic_backend.h +++ b/onnxruntime/core/providers/openvino/backends/basic_backend.h @@ -37,7 +37,7 @@ class BasicBackend : public IBackend { void PopulateCompiledDirectory(std::string, std::string&, std::string&, bool&); bool ValidateSubgraph(std::map>& const_outputs_map); void PopulateConfigValue(ov::AnyMap& device_config); - void EnableCaching(); + void EnableCaching(ov::AnyMap& device_config); void EnableGPUThrottling(ov::AnyMap& device_config); void EnableStreams(); void SetNumThreads(ov::AnyMap& device_config); diff --git a/onnxruntime/core/providers/openvino/onnx_ctx_model_helper.cc b/onnxruntime/core/providers/openvino/onnx_ctx_model_helper.cc index cd1ae6150e1da..e2df9c83f15ae 100644 --- a/onnxruntime/core/providers/openvino/onnx_ctx_model_helper.cc +++ b/onnxruntime/core/providers/openvino/onnx_ctx_model_helper.cc @@ -19,8 +19,7 @@ Status EPCtxHandler::ExportEPCtxModel(const GraphViewer& graph_viewer, const logging::Logger& logger, const bool& ep_context_embed_mode, const std::string& model_blob_str, - const std::string& openvino_sdk_version, - const std::string& device_type) const { + const std::string& openvino_sdk_version) const { auto model_build = graph_viewer.CreateModel(logger); auto& graph_build = model_build->MainGraph(); @@ -77,9 +76,12 @@ Status EPCtxHandler::ExportEPCtxModel(const GraphViewer& graph_viewer, model_proto->set_ir_version(ONNX_NAMESPACE::Version::IR_VERSION); // Finally, dump the model - std::ofstream dump(graph_name + "-ov_" + 
device_type + "_blob.onnx", - std::ios::out | std::ios::trunc | std::ios::binary); - model_proto->SerializeToOstream(dump); + std::ofstream epctx_onnx_model(graph_name, + std::ios::out | std::ios::trunc | std::ios::binary); + if (!epctx_onnx_model) { + ORT_THROW("Unable to create epctx onnx model file "); + } + model_proto->SerializeToOstream(epctx_onnx_model); LOGS_DEFAULT(VERBOSE) << "[OpenVINO EP] Export blob as EPContext Node"; @@ -90,9 +92,7 @@ Status EPCtxHandler::ImportBlobFromEPCtxModel(const GraphViewer& graph_viewer) { auto node = graph_viewer.GetNode(0); auto& attrs = node->GetAttributes(); ORT_ENFORCE(attrs.count(EP_CACHE_CONTEXT) > 0); - model_stream_ = std::make_shared(attrs.at(EP_CACHE_CONTEXT).s()); - LOGS_DEFAULT(VERBOSE) << "[OpenVINO EP] Read blob from EPContext Node"; is_valid_ep_ctx_graph_ = true; diff --git a/onnxruntime/core/providers/openvino/onnx_ctx_model_helper.h b/onnxruntime/core/providers/openvino/onnx_ctx_model_helper.h index b2b9b5bc53d44..610e9fd49c901 100644 --- a/onnxruntime/core/providers/openvino/onnx_ctx_model_helper.h +++ b/onnxruntime/core/providers/openvino/onnx_ctx_model_helper.h @@ -29,8 +29,7 @@ class EPCtxHandler { const logging::Logger& logger, const bool& ep_context_embed_mode, const std::string& model_blob_str, - const std::string& openvino_sdk_version, - const std::string& device_type) const; + const std::string& openvino_sdk_version) const; Status ImportBlobFromEPCtxModel(const GraphViewer& graph_viewer); bool CheckForOVEPCtxNode(const GraphViewer& graph_viewer, std::string openvino_sdk_version) const; bool IsValidOVEPCtxGraph() const { return is_valid_ep_ctx_graph_; } diff --git a/onnxruntime/core/providers/openvino/openvino_execution_provider.cc b/onnxruntime/core/providers/openvino/openvino_execution_provider.cc index 655e1b180388b..5627cb2c122fb 100644 --- a/onnxruntime/core/providers/openvino/openvino_execution_provider.cc +++ b/onnxruntime/core/providers/openvino/openvino_execution_provider.cc @@ -34,6 +34,7 @@ OpenVINOExecutionProvider::OpenVINOExecutionProvider(const OpenVINOExecutionProv global_context_->export_ep_ctx_blob = info.export_ep_ctx_blob_; global_context_->enable_qdq_optimizer = info.enable_qdq_optimizer_; global_context_->disable_cpu_fallback = info.disable_cpu_fallback_; + global_context_->ep_context_embed_mode = info.so_epctx_embed_mode_; // to check if target device is available // using ie_core capability GetAvailableDevices to fetch list of devices plugged in @@ -47,7 +48,7 @@ OpenVINOExecutionProvider::OpenVINOExecutionProvider(const OpenVINOExecutionProv info.device_type_.find("AUTO") != std::string::npos) { device_found = true; } else { - for (std::string device : available_devices) { + for (const std::string& device : available_devices) { if (device.rfind(info.device_type_, 0) == 0) { if (info.device_type_.find("GPU") != std::string::npos && (info.precision_ == "FP32" || info.precision_ == "FP16" || diff --git a/onnxruntime/core/providers/openvino/openvino_execution_provider.h b/onnxruntime/core/providers/openvino/openvino_execution_provider.h index 050fb91c51771..030e5bba71b67 100644 --- a/onnxruntime/core/providers/openvino/openvino_execution_provider.h +++ b/onnxruntime/core/providers/openvino/openvino_execution_provider.h @@ -16,16 +16,23 @@ namespace onnxruntime { +struct OVDevices { + ov::Core core; + std::vector get_ov_devices() const { + return core.get_available_devices(); + } +}; + static void print_build_options() { std::cout << "[ERROR] INVALID DEVICE BUILD TYPE SPECIFIED" << std::endl; std::cout << 
"Specify the keyword HETERO (or) MULTI (or) AUTO followed by the devices in the order of priority " << "you want to build" << std::endl; std::cout << "The different hardware devices that can be added with HETERO/MULTI/AUTO build " - << "are ['CPU','GPU','NPU']" + << "are ['CPU','GPU','NPU','GPU.x'] where x = 0,1,2 and so on" << std::endl; std::cout << "An example of how to specify the HETERO or MULTI or AUTO build type. " - << "Ex: HETERO:GPU,CPU Ex: MULTI:GPU,CPU Ex: AUTO:GPU,CPU" + << "Ex: HETERO:GPU,CPU Ex: MULTI:GPU,CPU Ex: AUTO:GPU,CPU Ex: AUTO:GPU.0,CPU Ex: AUTO:GPU.1,CPU" << std::endl; } @@ -40,7 +47,8 @@ static std::vector split(const std::string& s, char delim) { return result; } -static std::vector parseDevices(const std::string& device_string) { +static std::vector parseDevices(const std::string& device_string, + const std::vector& available_devices) { std::string comma_separated_devices = device_string; if (comma_separated_devices.find(":") != std::string::npos) { comma_separated_devices = comma_separated_devices.substr(comma_separated_devices.find(":") + 1); @@ -50,8 +58,15 @@ static std::vector parseDevices(const std::string& device_string) { print_build_options(); ORT_THROW("Invalid device string: " + device_string); } - std::vector dev_options = {"CPU", "GPU", "NPU"}; - for (std::string dev : devices) { + std::set dev_options = {"CPU", "GPU", "NPU"}; + + for (auto& device : available_devices) { + if (dev_options.find(device) == dev_options.end()) { + auto dev_options_update = dev_options.emplace(device); + } + } + + for (const std::string& dev : devices) { if (!std::count(dev_options.begin(), dev_options.end(), dev)) { print_build_options(); ORT_THROW("Invalid device string: " + device_string); @@ -75,28 +90,42 @@ struct OpenVINOExecutionProviderInfo { bool export_ep_ctx_blob_{false}; bool enable_qdq_optimizer_{false}; bool disable_cpu_fallback_{false}; + bool so_epctx_embed_mode_{true}; OpenVINOExecutionProviderInfo() = delete; - explicit OpenVINOExecutionProviderInfo(std::string dev_type, std::string precision, bool enable_npu_fast_compile, - size_t num_of_threads, std::string cache_dir, std::string model_priority, + explicit OpenVINOExecutionProviderInfo(const std::string& dev_type, const std::string& precision, + bool enable_npu_fast_compile, size_t num_of_threads, + const std::string& cache_dir, const std::string& model_priority, int num_streams, void* context, bool enable_opencl_throttling, bool disable_dynamic_shapes, bool export_ep_ctx_blob, - bool enable_qdq_optimizer, bool disable_cpu_fallback) - : precision_(precision), + bool enable_qdq_optimizer, bool disable_cpu_fallback, + bool so_epctx_embed_mode) + : precision_(std::move(precision)), enable_npu_fast_compile_(enable_npu_fast_compile), num_of_threads_(num_of_threads), cache_dir_(std::move(cache_dir)), - model_priority_(model_priority), + model_priority_(std::move(model_priority)), num_streams_(num_streams), context_(context), enable_opencl_throttling_(enable_opencl_throttling), disable_dynamic_shapes_(disable_dynamic_shapes), export_ep_ctx_blob_(export_ep_ctx_blob), enable_qdq_optimizer_(enable_qdq_optimizer), - disable_cpu_fallback_(disable_cpu_fallback) { + disable_cpu_fallback_(disable_cpu_fallback), + so_epctx_embed_mode_{so_epctx_embed_mode} { std::set ov_supported_device_types = {"CPU", "GPU", "GPU.0", "GPU.1", "NPU"}; + + OVDevices devices; + std::vector available_devices = devices.get_ov_devices(); + + for (auto& device : available_devices) { + if (ov_supported_device_types.find(device) == 
ov_supported_device_types.end()) { + ov_supported_device_types.emplace(device); + } + } + if (dev_type == "") { LOGS_DEFAULT(INFO) << "[OpenVINO-EP]" << "No runtime device selection option provided."; @@ -116,7 +145,7 @@ struct OpenVINOExecutionProviderInfo { dev_type = DEVICE; if (dev_type.find("HETERO") == 0 || dev_type.find("MULTI") == 0 || dev_type.find("AUTO") == 0) { - std::vector devices = parseDevices(dev_type); + std::vector devices = parseDevices(dev_type, available_devices); precision_ = "FP16"; if (devices[0] == "CPU") { precision_ = "FP32"; @@ -127,7 +156,7 @@ struct OpenVINOExecutionProviderInfo { } else if (ov_supported_device_types.find(dev_type) != ov_supported_device_types.end()) { device_type_ = std::move(dev_type); } else if (dev_type.find("HETERO") == 0 || dev_type.find("MULTI") == 0 || dev_type.find("AUTO") == 0) { - std::vector devices = parseDevices(dev_type); + std::vector devices = parseDevices(dev_type, available_devices); device_type_ = dev_type; } else { ORT_THROW("Invalid device string: " + dev_type); diff --git a/onnxruntime/core/providers/openvino/openvino_provider_factory.cc b/onnxruntime/core/providers/openvino/openvino_provider_factory.cc index 45bba431741c5..716a7cd936405 100644 --- a/onnxruntime/core/providers/openvino/openvino_provider_factory.cc +++ b/onnxruntime/core/providers/openvino/openvino_provider_factory.cc @@ -14,7 +14,8 @@ struct OpenVINOProviderFactory : IExecutionProviderFactory { int num_streams, void* context, bool enable_opencl_throttling, bool disable_dynamic_shapes, bool export_ep_ctx_blob, bool enable_qdq_optimizer, - bool disable_cpu_fallback) + bool disable_cpu_fallback, + bool so_epctx_embed_mode) : precision_(precision), enable_npu_fast_compile_(enable_npu_fast_compile), num_of_threads_(num_of_threads), @@ -25,10 +26,12 @@ struct OpenVINOProviderFactory : IExecutionProviderFactory { disable_dynamic_shapes_(disable_dynamic_shapes), export_ep_ctx_blob_(export_ep_ctx_blob), enable_qdq_optimizer_(enable_qdq_optimizer), - disable_cpu_fallback_(disable_cpu_fallback) { + disable_cpu_fallback_(disable_cpu_fallback), + so_epctx_embed_mode_(so_epctx_embed_mode) { device_type_ = (device_type == nullptr) ? "" : device_type; cache_dir_ = (cache_dir == nullptr) ? 
"" : cache_dir; } + ~OpenVINOProviderFactory() override { } @@ -48,13 +51,15 @@ struct OpenVINOProviderFactory : IExecutionProviderFactory { bool export_ep_ctx_blob_; bool enable_qdq_optimizer_; bool disable_cpu_fallback_; + bool so_epctx_embed_mode_; }; std::unique_ptr OpenVINOProviderFactory::CreateProvider() { OpenVINOExecutionProviderInfo info(device_type_, precision_, enable_npu_fast_compile_, num_of_threads_, cache_dir_, model_priority_, num_streams_, context_, enable_opencl_throttling_, disable_dynamic_shapes_, export_ep_ctx_blob_, enable_qdq_optimizer_, - disable_cpu_fallback_); + disable_cpu_fallback_, + so_epctx_embed_mode_); return std::make_unique(info); } @@ -105,6 +110,8 @@ struct OpenVINO_Provider : Provider { bool disable_cpu_fallback = false; + bool so_epctx_embed_mode = true; + if (provider_options_map.find("device_type") != provider_options_map.end()) { device_type = provider_options_map.at("device_type").c_str(); @@ -113,6 +120,14 @@ struct OpenVINO_Provider : Provider { std::set deprecated_device_types = {"CPU_FP32", "GPU_FP32", "GPU.0_FP32", "GPU.1_FP32", "GPU_FP16", "GPU.0_FP16", "GPU.1_FP16"}; + OVDevices devices; + std::vector available_devices = devices.get_ov_devices(); + + for (auto& device : available_devices) { + if (ov_supported_device_types.find(device) == ov_supported_device_types.end()) { + ov_supported_device_types.emplace(device); + } + } if (deprecated_device_types.find(device_type) != deprecated_device_types.end()) { std::string deprecated_device = device_type; int delimit = device_type.find("_"); @@ -128,8 +143,8 @@ struct OpenVINO_Provider : Provider { (device_type.find("MULTI:") == 0) || (device_type.find("AUTO:") == 0))) { ORT_THROW( - "[ERROR] [OpenVINO] You have selcted wrong configuration value for the key 'device_type'. " - "Select from 'CPU', 'GPU', 'GPU.0', 'GPU.1', 'NPU' or from" + "[ERROR] [OpenVINO] You have selected wrong configuration value for the key 'device_type'. " + "Select from 'CPU', 'GPU', 'NPU', 'GPU.x' where x = 0,1,2 and so on or from" " HETERO/MULTI/AUTO options available. \n"); } } @@ -253,9 +268,8 @@ struct OpenVINO_Provider : Provider { } } } - - if (provider_options_map.find("export_ep_ctx_blob") != provider_options_map.end()) { - bool_flag = provider_options_map.at("export_ep_ctx_blob"); + if (provider_options_map.find("so_export_ep_ctx_blob") != provider_options_map.end()) { + bool_flag = provider_options_map.at("so_export_ep_ctx_blob"); if (bool_flag == "true" || bool_flag == "True") export_ep_ctx_blob = true; else if (bool_flag == "false" || bool_flag == "False") @@ -271,6 +285,23 @@ struct OpenVINO_Provider : Provider { disable_cpu_fallback = false; bool_flag = ""; } + if (provider_options_map.find("so_epctx_embed_mode") != provider_options_map.end()) { + bool_flag = provider_options_map.at("so_epctx_embed_mode"); + if (bool_flag == "true" || bool_flag == "True") + so_epctx_embed_mode = true; + else if (bool_flag == "false" || bool_flag == "False") + so_epctx_embed_mode = false; + bool_flag = ""; + } + + if (provider_options_map.find("so_epctx_path") != provider_options_map.end()) { + // The path to dump epctx model is valid only when epctx is enabled. + // Overrides the cache_dir option to dump model cache files from OV. 
+ if (export_ep_ctx_blob) { + cache_dir = provider_options_map.at("so_epctx_path").c_str(); + } + } + return std::make_shared(const_cast(device_type.c_str()), const_cast(precision.c_str()), enable_npu_fast_compile, @@ -283,7 +314,8 @@ struct OpenVINO_Provider : Provider { disable_dynamic_shapes, export_ep_ctx_blob, enable_qdq_optimizer, - disable_cpu_fallback); + disable_cpu_fallback, + so_epctx_embed_mode); } void Initialize() override { diff --git a/onnxruntime/core/providers/openvino/ov_interface.cc b/onnxruntime/core/providers/openvino/ov_interface.cc index 8dd00857b7dd0..7e8681d304abf 100644 --- a/onnxruntime/core/providers/openvino/ov_interface.cc +++ b/onnxruntime/core/providers/openvino/ov_interface.cc @@ -63,7 +63,6 @@ std::shared_ptr OVCore::ReadModel(const std::string& model, const std return FE->convert(inputModel); } else { ORT_THROW(log_tag + "[OpenVINO-EP] Unknown exception while Reading network"); - return NULL; } } catch (const Exception& e) { ORT_THROW(log_tag + "[OpenVINO-EP] Exception while Reading network: " + std::string(e.what())); @@ -73,9 +72,9 @@ std::shared_ptr OVCore::ReadModel(const std::string& model, const std } OVExeNetwork OVCore::CompileModel(std::shared_ptr& ie_cnn_network, - std::string hw_target, - const ov::AnyMap& device_config, - std::string name) { + std::string& hw_target, + ov::AnyMap& device_config, + const std::string& name) { ov::CompiledModel obj; try { obj = oe.compile_model(ie_cnn_network, hw_target, device_config); @@ -92,22 +91,12 @@ OVExeNetwork OVCore::CompileModel(std::shared_ptr& ie_cnn_netwo } OVExeNetwork OVCore::CompileModel(const std::string& onnx_model, - std::string hw_target, - std::string precision, - std::string cache_dir, - const ov::AnyMap& device_config, - std::string name) { + std::string& hw_target, + ov::AnyMap& device_config, + const std::string& name) { ov::CompiledModel obj; try { - if (hw_target == "AUTO:GPU,CPU") { - obj = oe.compile_model(onnx_model, ov::Tensor(), - "AUTO", - ov::device::priorities("GPU", "CPU"), - ov::device::properties("GPU", {ov::cache_dir(cache_dir), - ov::hint::inference_precision(precision)})); - } else { - obj = oe.compile_model(onnx_model, ov::Tensor(), hw_target, device_config); - } + obj = oe.compile_model(onnx_model, ov::Tensor(), hw_target, device_config); #ifndef NDEBUG printDebugInfo(obj); #endif @@ -123,9 +112,19 @@ OVExeNetwork OVCore::CompileModel(const std::string& onnx_model, OVExeNetwork OVCore::ImportModel(std::shared_ptr model_stream, std::string hw_target, const ov::AnyMap& device_config, + bool embed_mode, std::string name) { try { - auto obj = oe.import_model(*model_stream, hw_target, device_config); + ov::CompiledModel obj; + if (embed_mode) { + obj = oe.import_model(*model_stream, hw_target, device_config); + } else { + std::string blob_file_path = (*model_stream).str(); + std::ifstream modelStream(blob_file_path, std::ios_base::binary | std::ios_base::in); + obj = oe.import_model(modelStream, + hw_target, + {}); + } #ifndef NDEBUG printDebugInfo(obj); #endif @@ -138,10 +137,8 @@ OVExeNetwork OVCore::ImportModel(std::shared_ptr model_strea } } -void OVCore::SetCache(std::string cache_dir_path, std::string device_type) { - if (device_type != "AUTO:GPU,CPU") { - oe.set_property(ov::cache_dir(cache_dir_path)); - } +void OVCore::SetCache(const std::string& cache_dir_path) { + oe.set_property(ov::cache_dir(cache_dir_path)); } #ifdef IO_BUFFER_ENABLED diff --git a/onnxruntime/core/providers/openvino/ov_interface.h b/onnxruntime/core/providers/openvino/ov_interface.h index 
af6f252feb2ce..fa22e0f3cb03d 100644 --- a/onnxruntime/core/providers/openvino/ov_interface.h +++ b/onnxruntime/core/providers/openvino/ov_interface.h @@ -40,20 +40,23 @@ class OVCore { ov::Core oe; public: + // OV Interface For Reading Model std::shared_ptr ReadModel(const std::string& model_stream, const std::string& model_path) const; + // OV Interface for Compiling OV Model Type OVExeNetwork CompileModel(std::shared_ptr& ie_cnn_network, - std::string hw_target, - const ov::AnyMap& device_config, - std::string name); + std::string& hw_target, + ov::AnyMap& device_config, + const std::string& name); + // OV Interface for Fast Compile OVExeNetwork CompileModel(const std::string& onnx_model, - std::string hw_target, - std::string precision, - std::string cache_dir, - const ov::AnyMap& device_config, - std::string name); + std::string& hw_target, + ov::AnyMap& device_config, + const std::string& name); + // OV Interface for Import model Stream OVExeNetwork ImportModel(std::shared_ptr model_stream, std::string hw_target, const ov::AnyMap& device_config, + bool embed_mode, std::string name); #ifdef IO_BUFFER_ENABLED OVExeNetwork CompileModel(std::shared_ptr& model, @@ -64,7 +67,7 @@ class OVCore { std::string name); #endif std::vector GetAvailableDevices(); - void SetCache(std::string cache_dir_path, std::string device_type); + void SetCache(const std::string& cache_dir_path); ov::Core& Get() { return oe; } void SetStreams(const std::string& device_type, int num_streams); }; diff --git a/onnxruntime/core/providers/openvino/ov_versions/capability.cc b/onnxruntime/core/providers/openvino/ov_versions/capability.cc index 856b97a0896db..3fcaff4369c89 100644 --- a/onnxruntime/core/providers/openvino/ov_versions/capability.cc +++ b/onnxruntime/core/providers/openvino/ov_versions/capability.cc @@ -35,18 +35,16 @@ GetCapability::GetCapability(const GraphViewer& graph_viewer_param, device_type_ = "CPU"; if (enable_qdq_optimizer) npu_qdq_optimizer_enabled = true; } -#if OPENVINO_VERSION_MAJOR == 2023 && OPENVINO_VERSION_MINOR == 1 - data_ops_ = new DataOps(graph_viewer_, V_2023_1, device_type_, npu_qdq_optimizer_enabled); -#elif OPENVINO_VERSION_MAJOR == 2023 && OPENVINO_VERSION_MINOR == 2 - data_ops_ = new DataOps(graph_viewer_, V_2023_2, device_type_, npu_qdq_optimizer_enabled); -#elif OPENVINO_VERSION_MAJOR == 2023 && OPENVINO_VERSION_MINOR == 3 - data_ops_ = new DataOps(graph_viewer_, V_2023_3, device_type_, npu_qdq_optimizer_enabled); -#elif OPENVINO_VERSION_MAJOR == 2024 && OPENVINO_VERSION_MINOR == 0 +#if OPENVINO_VERSION_MAJOR == 2024 && OPENVINO_VERSION_MINOR == 0 data_ops_ = new DataOps(graph_viewer_, V_2024_0, device_type_, npu_qdq_optimizer_enabled); #elif OPENVINO_VERSION_MAJOR == 2024 && OPENVINO_VERSION_MINOR == 1 data_ops_ = new DataOps(graph_viewer_, V_2024_1, device_type_, npu_qdq_optimizer_enabled); +#elif OPENVINO_VERSION_MAJOR == 2024 && OPENVINO_VERSION_MINOR == 2 + data_ops_ = new DataOps(graph_viewer_, V_2024_2, device_type_, npu_qdq_optimizer_enabled); +#elif OPENVINO_VERSION_MAJOR == 2024 && OPENVINO_VERSION_MINOR == 3 + data_ops_ = new DataOps(graph_viewer_, V_2024_3, device_type_, npu_qdq_optimizer_enabled); #else - data_ops_ = new DataOps(graph_viewer_, V_2024_1, device_type_, npu_qdq_optimizer_enabled); + data_ops_ = new DataOps(graph_viewer_, V_2024_3, device_type_, npu_qdq_optimizer_enabled); #endif } diff --git a/onnxruntime/core/providers/openvino/ov_versions/data_ops.cc b/onnxruntime/core/providers/openvino/ov_versions/data_ops.cc index 38c029faff9d5..d9aa13ec1bba9 
100644 --- a/onnxruntime/core/providers/openvino/ov_versions/data_ops.cc +++ b/onnxruntime/core/providers/openvino/ov_versions/data_ops.cc @@ -142,6 +142,7 @@ std::vector supported_op_mode = { {"GreaterOrEqual", V_2022_1, {"CPU", "GPU"}}, {"GridSample", V_2022_3, {"CPU"}}, {"GridSample", V_2023_0, {"GPU"}}, + {"GRU", V_2024_1, {"CPU", "GPU"}}, {"HardMax", V_2023_1, {"CPU", "GPU"}}, {"Identity", V_2020_4, {"CPU", "GPU"}}, {"If", V_2022_3, {"CPU", "GPU"}}, @@ -155,6 +156,7 @@ std::vector supported_op_mode = { {"LessOrEqual", V_2022_1, {"CPU", "GPU"}}, {"Log", V_2020_4, {"CPU", "GPU"}}, {"LogSoftMax", V_2022_1, {"CPU", "GPU"}}, + {"LogSoftmax", V_2024_1, {"CPU", "GPU"}}, {"Loop", V_2021_4, {"CPU", "GPU"}}, {"LpNormalization", V_2023_1, {"CPU", "GPU"}}, {"LRN", V_2020_4, {"CPU", "GPU"}}, @@ -361,7 +363,7 @@ void DataOps::populate_op_mode_supported() { // populate unsupportedmode_t { - UnsupportedOpMode obj = {{V_2024_1}, + UnsupportedOpMode obj = {{V_2024_1, V_2024_2, V_2024_3}, [this](const Node* node, const InitializedTensorSet&) { // If the Input of ReduceMax op is UINT8, it is rejected (Due to output mismatch) for (size_t i = 0; i < node->InputDefs().size(); i++) { @@ -376,7 +378,7 @@ void DataOps::populate_op_mode_supported() { op_list_.insert({"ReduceMax", obj}); } { - UnsupportedOpMode obj = {{V_2023_1, V_2023_2, V_2023_3, V_2024_0, V_2024_1}, + UnsupportedOpMode obj = {{V_2023_1, V_2023_2, V_2023_3, V_2024_0, V_2024_1, V_2024_2, V_2024_3}, [this](const Node* node, const InitializedTensorSet&) { const auto& input_arg = node->InputDefs()[1]; auto shape = input_arg->Shape(); @@ -393,7 +395,7 @@ void DataOps::populate_op_mode_supported() { op_list_.insert({"Reshape", obj}); } { - UnsupportedOpMode obj = {{V_2023_1, V_2023_2, V_2023_3, V_2024_0, V_2024_1}, + UnsupportedOpMode obj = {{V_2023_1, V_2023_2, V_2023_3, V_2024_0, V_2024_1, V_2024_2, V_2024_3}, [this](const Node* node, const InitializedTensorSet&) { // If the operator is unsqueeze // If axes is an input, then we cannot produce a static graph. 
@@ -408,7 +410,7 @@ void DataOps::populate_op_mode_supported() { op_list_.insert({"Unsqueeze", obj}); } { - UnsupportedOpMode obj = {{V_2023_1, V_2023_2, V_2023_3, V_2024_0, V_2024_1}, + UnsupportedOpMode obj = {{V_2023_1, V_2023_2, V_2023_3, V_2024_0, V_2024_1, V_2024_2, V_2024_3}, [this](const Node* node, const InitializedTensorSet&) { // check for attributes auto& upsample_attr = node->GetAttributes(); diff --git a/onnxruntime/core/providers/openvino/ov_versions/data_ops.h b/onnxruntime/core/providers/openvino/ov_versions/data_ops.h index 7cfb0516b8ccf..4c064b08405c1 100644 --- a/onnxruntime/core/providers/openvino/ov_versions/data_ops.h +++ b/onnxruntime/core/providers/openvino/ov_versions/data_ops.h @@ -28,7 +28,9 @@ enum versionNum { V_2023_2, V_2023_3, V_2024_0, - V_2024_1 + V_2024_1, + V_2024_2, + V_2024_3 }; using VersionNum = enum versionNum; diff --git a/onnxruntime/core/providers/openvino/qdq_transformations/qdq_stripping.cc b/onnxruntime/core/providers/openvino/qdq_transformations/qdq_stripping.cc index c7689a0be7e73..a2b3ed068235b 100644 --- a/onnxruntime/core/providers/openvino/qdq_transformations/qdq_stripping.cc +++ b/onnxruntime/core/providers/openvino/qdq_transformations/qdq_stripping.cc @@ -205,11 +205,11 @@ static bool IsConnectedQAConstantInitializer(const Node* dq_node, const onnxrunt // Check required because in some cases, when a NodeUnit cannot be formed with this standalone DQ // we still need to check if it feeds into a supported Op -static bool DQFeedsASupportedOp(const Node* dq_node, const onnxruntime::GraphViewer& src_graph) { +static bool DQFeedsASupportedOp(const Node* dq_node) { if (!dq_node->GetOutputEdgesCount()) return false; // Only feeds the graph output, and not any node const auto& target_node = *dq_node->OutputNodesBegin(); - const auto op_type = target_node.OpType(); + const auto& op_type = target_node.OpType(); if (op_type == "Conv" || op_type == "MatMul") { // Conv and MatMul always keeps int8 DQs except if the DQ is sandwiched between Softmax and Conv/MatMul @@ -219,8 +219,8 @@ static bool DQFeedsASupportedOp(const Node* dq_node, const onnxruntime::GraphVie return true; } } else if (op_type == "Add") { - // Add keeps all DQs except if it has const inits - return !IsAnyDQAConstantInitializer(&target_node, src_graph); + // Add => keeps all DQs + return true; } return false; } @@ -291,7 +291,7 @@ static bool CheckDQRuleSet(const NodeUnit& node_unit, const onnxruntime::GraphViewer& src_graph, SkipReason& reason) { const auto& target_node = node_unit.GetNode(); - auto op_type = node_unit.OpType(); + const auto& op_type = node_unit.OpType(); // #1 Reverse DQ duplication if (dq_node->Name().find(DuplicateDQ) != std::string::npos) { @@ -337,6 +337,18 @@ static bool CheckDQRuleSet(const NodeUnit& node_unit, } } +static bool CheckQFeedsIntoQuantizedOutput(const NodeUnit& node_unit, + const std::unordered_map graph_op_data_type) { + auto op_of_quantized_layer = node_unit.Outputs(); + for (auto& itr : op_of_quantized_layer) { + auto it = graph_op_data_type.find(itr.node_arg.Name()); + if (it != graph_op_data_type.end() && it->second == "tensor(uint8)") { + return true; + } + } + return false; +} + static bool CheckQRuleSet(const NodeUnit& node_unit, const Node* q_node, const onnxruntime::GraphViewer& src_graph, @@ -345,7 +357,13 @@ static bool CheckQRuleSet(const NodeUnit& node_unit, // This Q should also be uint8 const auto& target_node = node_unit.GetNode(); - auto op_type = node_unit.OpType(); + const auto& op_type = node_unit.OpType(); + + auto op = 
src_graph.GetOutputs(); + std::unordered_map graph_op_data_type; + for (auto& ops : op) { + graph_op_data_type[src_graph.GetNodeArg(ops->Name())->Name()] = ops->Type()->data(); + } // If UInt16 Q, don't keep it if (GetQDQDataType(q_node) == DT_UINT16 || GetQDQDataType(q_node) == DT_INT16) { @@ -359,6 +377,8 @@ static bool CheckQRuleSet(const NodeUnit& node_unit, } else if (op_type == "Add") { // Add keeps all Qs return true; + } else if (CheckQFeedsIntoQuantizedOutput(node_unit, std::move(graph_op_data_type))) { + return true; } else { // Keep Q of an unsupported Op only if the target that succeeds it is a supported Op in this list return IsNextTargetNodeOfQValid(q_node, &target_node, src_graph, {"Conv", "Add", "MatMul"}, false); @@ -469,7 +489,7 @@ static void AddStandaloneNodeUnit(onnxruntime::Graph& dst_graph, const onnxrunti add_identity_op(true); else if (IsConnectedQPresent(src_graph, dst_graph.Nodes(), &node_unit.GetNode(), node_unit.GetNode().InputDefs())) AddNode(initializers_to_keep, src_graph, dst_graph, node_unit.GetNode()); - else if (DQFeedsASupportedOp(&node_unit.GetNode(), src_graph)) + else if (DQFeedsASupportedOp(&node_unit.GetNode())) AddNode(initializers_to_keep, src_graph, dst_graph, node_unit.GetNode()); else add_identity_op(false); @@ -543,7 +563,7 @@ static void AddQDQNodeUnit(onnxruntime::Graph& dst_graph, // Add Node args for inputs for (const auto& node_unit_input : node_unit_inputs) { - auto node_arg_name = node_unit_input.node_arg.Name(); + const auto& node_arg_name = node_unit_input.node_arg.Name(); if (auto dq_node_arg = dq_node_args_to_keep.find(node_arg_name); dq_node_arg != dq_node_args_to_keep.end()) { // Add supported DQ as an input arg for the target node input_args.push_back(dq_node_arg->second); diff --git a/onnxruntime/core/session/provider_bridge_ort.cc b/onnxruntime/core/session/provider_bridge_ort.cc index 1d21933e9cba9..924158a26b927 100644 --- a/onnxruntime/core/session/provider_bridge_ort.cc +++ b/onnxruntime/core/session/provider_bridge_ort.cc @@ -1931,12 +1931,31 @@ void ORTSessionOptionsToOrtOpenVINOProviderOptions(ProviderOptions& ov_options, kOrtSessionOptionsDisableCPUEPFallback, "0") == "1"; if (disable_cpu_fallback) ov_options["disable_cpu_fallback"] = "true"; + + // values from session options will override the providerOptions Value + bool so_epctx_enable = session_options->config_options.GetConfigOrDefault( + kOrtSessionOptionEpContextEnable, "0") == "1"; + if (so_epctx_enable) + ov_options["so_export_ep_ctx_blob"] = "true"; + + std::string so_cache_path = session_options->config_options.GetConfigOrDefault(kOrtSessionOptionEpContextFilePath, "").c_str(); + ov_options["so_epctx_path"] = so_cache_path; + + // Default embedMode is 1. Saving the compiled model contents as a Epctx node attribute + bool so_epctx_embed_mode = session_options->config_options.GetConfigOrDefault( + kOrtSessionOptionEpContextEmbedMode, "1") == "0"; + if (so_epctx_embed_mode) { + // defaults to true + ov_options["so_epctx_embed_mode"] = "false"; + } } std::shared_ptr OpenVINOProviderFactoryCreator::Create(ProviderOptions* provider_options_map, const SessionOptions* session_options) { - if (session_options) + // Append session options applicable for EP to EP Provider options. 
+ if (session_options) { onnxruntime::ORTSessionOptionsToOrtOpenVINOProviderOptions(*provider_options_map, session_options); + } return s_library_openvino.Get().CreateExecutionProviderFactory(provider_options_map); } diff --git a/onnxruntime/test/perftest/command_args_parser.cc b/onnxruntime/test/perftest/command_args_parser.cc index e6d4e0a94abd3..84c3bc16346f3 100644 --- a/onnxruntime/test/perftest/command_args_parser.cc +++ b/onnxruntime/test/perftest/command_args_parser.cc @@ -253,7 +253,6 @@ static bool ParseSessionConfigs(const std::string& configs_string, test_config.machine_config.provider_type_name = onnxruntime::kDnnlExecutionProvider; } else if (!CompareCString(optarg, ORT_TSTR("openvino"))) { test_config.machine_config.provider_type_name = onnxruntime::kOpenVINOExecutionProvider; - test_config.run_config.optimization_level = ORT_DISABLE_ALL; } else if (!CompareCString(optarg, ORT_TSTR("tensorrt"))) { test_config.machine_config.provider_type_name = onnxruntime::kTensorrtExecutionProvider; } else if (!CompareCString(optarg, ORT_TSTR("qnn"))) { diff --git a/onnxruntime/test/perftest/ort_test_session.cc b/onnxruntime/test/perftest/ort_test_session.cc index 72b5da7aaec9b..fc1bdb10d7453 100644 --- a/onnxruntime/test/perftest/ort_test_session.cc +++ b/onnxruntime/test/perftest/ort_test_session.cc @@ -699,6 +699,10 @@ select from 'TF8', 'TF16', 'UINT8', 'FLOAT', 'ITENSOR'. \n)"); std::set deprecated_device_types = {"CPU_FP32", "GPU_FP32", "GPU.0_FP32", "GPU.1_FP32", "GPU_FP16", "GPU.0_FP16", "GPU.1_FP16"}; + size_t num_gpus = 10; + for (size_t i = 0; i <= num_gpus; i++) { + ov_supported_device_types.emplace("GPU." + std::to_string(i)); + } if (ov_supported_device_types.find(value) != ov_supported_device_types.end()) { ov_options[key] = value; } else if (deprecated_device_types.find(value) != deprecated_device_types.end()) { diff --git a/onnxruntime/test/providers/checkers.cc b/onnxruntime/test/providers/checkers.cc index d0e08448ce456..5f332ddcddb8d 100644 --- a/onnxruntime/test/providers/checkers.cc +++ b/onnxruntime/test/providers/checkers.cc @@ -25,7 +25,15 @@ struct DefaultTolerance { static constexpr float relative = 1e-5f; // Allow to have different default absolute tolerance for different providers. 
- static float get_absolute(const std::string& /*provider_type*/) { + static float get_absolute(const std::string& provider_type /*provider_type*/) { + if (provider_type == kOpenVINOExecutionProvider) { +#ifdef OPENVINO_CONFIG_NPU + return 0.005f; +#else + return absolute; +#endif + } + return absolute; } }; @@ -40,7 +48,15 @@ struct DefaultTolerance { static constexpr float relative = 1e-4f; - static float get_absolute(const std::string& /*provider_type*/) { + static float get_absolute(const std::string& provider_type /*provider_type*/) { + if (provider_type == kOpenVINOExecutionProvider) { +#ifdef OPENVINO_CONFIG_NPU + return 0.005f; +#else + return absolute; +#endif + } + return absolute; } }; diff --git a/onnxruntime/test/providers/cpu/rnn/deep_cpu_gru_op_test.cc b/onnxruntime/test/providers/cpu/rnn/deep_cpu_gru_op_test.cc index b05649dafc181..30960e71c577f 100644 --- a/onnxruntime/test/providers/cpu/rnn/deep_cpu_gru_op_test.cc +++ b/onnxruntime/test/providers/cpu/rnn/deep_cpu_gru_op_test.cc @@ -98,8 +98,12 @@ static void RunGruTest(const std::vector& X_data, test.AddOptionalOutputEdge(); } - // TensorRT failed on GRU tests +// TensorRT, OpenVINO failed on GRU tests +#if defined(USE_OPENVINO) + test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider, kOpenVINOExecutionProvider}); +#else test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider}); +#endif } void DefaultActivationsSimpleWeightsNoBias(std::string direction, From 6787cf18a5ee0196b376926b9a7080e925d4756d Mon Sep 17 00:00:00 2001 From: Yueqing Zhang Date: Thu, 25 Jul 2024 07:18:55 -0700 Subject: [PATCH 16/57] [VitisAI] use binary mode for context ep (#21474) ### Description We found that the text format could cause errors. ### Motivation and Context Because the OS could change the string contents, we decided to save it as a binary file.
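For illustration only (not part of the change itself): a minimal, self-contained C++ sketch of the failure mode this patch avoids. On Windows, a text-mode stream translates each 0x0A byte into the 0x0D 0x0A sequence, so an opaque compilation-cache blob written without std::ios::binary can grow and fail to round-trip, while binary mode writes the bytes verbatim. The file names below are hypothetical.

```cpp
// Hypothetical demo: writing the same byte blob in text mode vs. binary mode.
#include <fstream>
#include <string>

int main() {
  const std::string blob("\x01\x0A\x02", 3);  // pretend cache bytes that contain a 0x0A byte

  std::ofstream text_ofs("cache_text.bin", std::ios::trunc);  // text mode: 0x0A may be rewritten as 0x0D 0x0A on Windows
  text_ofs.write(blob.data(), static_cast<std::streamsize>(blob.size()));

  std::ofstream bin_ofs("cache_binary.bin", std::ios::trunc | std::ios::binary);  // binary mode: bytes written as-is
  bin_ofs.write(blob.data(), static_cast<std::streamsize>(blob.size()));
  return 0;
}
```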
--- .../core/providers/vitisai/vitisai_execution_provider.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/onnxruntime/core/providers/vitisai/vitisai_execution_provider.cc b/onnxruntime/core/providers/vitisai/vitisai_execution_provider.cc index 036831df7a9cf..0f0972d96bcee 100644 --- a/onnxruntime/core/providers/vitisai/vitisai_execution_provider.cc +++ b/onnxruntime/core/providers/vitisai/vitisai_execution_provider.cc @@ -100,7 +100,7 @@ void VitisAIExecutionProvider::FulfillEPContextEnablement( auto& ep_ctx_graph = p_ep_ctx_model_->MainGraph(); if (!ep_ctx_embed_mode_) { auto ep_ctx_cache_path_str = GetEPContextCacheFileLocation(ep_ctx_model_file_loc_, model_path_str_); - std::ofstream ep_ctx_cache_ofs(ep_ctx_cache_path_str.c_str(), std::ios::trunc); + std::ofstream ep_ctx_cache_ofs(ep_ctx_cache_path_str.c_str(), std::ios::trunc | std::ios::binary); if (!ep_ctx_cache_ofs.is_open()) { ORT_THROW("Failed to open a file to write EP context cache: ", ep_ctx_cache_path_str.c_str()); } @@ -136,7 +136,7 @@ std::vector> VitisAIExecutionProvider::GetCap info_["cacheDir"] = cache_dir; info_["cacheKey"] = cache_key; LOGS_DEFAULT(VERBOSE) << "Trying getting compilation cache from " << PathToUTF8String(ep_ctx_model_file_loc_); - auto ep_ctx_payload = RetrieveEPContextCache(graph_viewer.GetGraph(), ep_ctx_model_file_loc_, false); + auto ep_ctx_payload = RetrieveEPContextCache(graph_viewer.GetGraph(), ep_ctx_model_file_loc_, true); restore_backend_compilation_cache(cache_dir, cache_key, ep_ctx_payload, graph_viewer.ModelPath().string()); } else { if (fs::exists(ep_ctx_model_file_loc_) && fs::is_regular_file(ep_ctx_model_file_loc_) && ep_ctx_enabled_) { From f3a6e58ae3358da65f1753ca2322e5f4475ae661 Mon Sep 17 00:00:00 2001 From: Sophie Schoenmeyer <107952697+sophies927@users.noreply.github.com> Date: Thu, 25 Jul 2024 09:52:37 -0700 Subject: [PATCH 17/57] Update 05-performance.yml issue template to auto apply label (#21486) Updating Performance issue template so "performance" label is automatically applied ### Description ### Motivation and Context --- .github/ISSUE_TEMPLATE/05-performance.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/ISSUE_TEMPLATE/05-performance.yml b/.github/ISSUE_TEMPLATE/05-performance.yml index 829076a1bd466..da0e6c7ada7a7 100644 --- a/.github/ISSUE_TEMPLATE/05-performance.yml +++ b/.github/ISSUE_TEMPLATE/05-performance.yml @@ -1,6 +1,7 @@ name: Performance description: issues related to performance title: "[Performance] " +labels: ["performance"] body: - type: markdown attributes: From ebcb7075ebd5657069c9b00be4bde0bc814307c6 Mon Sep 17 00:00:00 2001 From: Yifan Li <109183385+yf711@users.noreply.github.com> Date: Thu, 25 Jul 2024 10:17:16 -0700 Subject: [PATCH 18/57] Set CUDA12 as default in GPU packages (#21438) ### Description * Swap cuda version 11.8/12.2 in GPU CIs * Set CUDA12 as default version in yamls of publishing nuget/python/java GPU packages * Suppress warnings as errors of flash_api.cc during ort win-build --- .../cuda/bert/flash_attention/flash_api.cc | 8 ++++++ .../azure-pipelines/linux-gpu-ci-pipeline.yml | 2 +- .../linux-gpu-tensorrt-ci-pipeline.yml | 2 +- ...linux-gpu-tensorrt-daily-perf-pipeline.yml | 8 +++--- .../nuget-cuda-publishing-pipeline.yml | 15 ++++++----- .../github/azure-pipelines/publish-nuget.yml | 26 +++++++++++++++---- .../py-cuda-publishing-pipeline.yml | 2 +- .../stages/java-cuda-publishing-stage.yml | 2 +- .../jobs/download_win_gpu_library.yml | 2 +- .../templates/jobs/set-winenv.yml | 4 +-- 
.../azure-pipelines/win-gpu-ci-pipeline.yml | 13 +++++++--- .../win-gpu-tensorrt-ci-pipeline.yml | 19 ++++++++++++-- .../docker/Dockerfile.manylinux2_28_cuda | 2 +- .../Dockerfile.package_ubi8_cuda_tensorrt10_0 | 6 ++--- .../github/windows/setup_env_cuda.bat | 14 +++++----- .../ci_build/github/windows/setup_env_gpu.bat | 16 ++++++------ .../ci_build/github/windows/setup_env_trt.bat | 8 +++--- 17 files changed, 97 insertions(+), 52 deletions(-) diff --git a/onnxruntime/contrib_ops/cuda/bert/flash_attention/flash_api.cc b/onnxruntime/contrib_ops/cuda/bert/flash_attention/flash_api.cc index 90f0b94cafce8..967c04c52b182 100644 --- a/onnxruntime/contrib_ops/cuda/bert/flash_attention/flash_api.cc +++ b/onnxruntime/contrib_ops/cuda/bert/flash_attention/flash_api.cc @@ -92,6 +92,11 @@ void set_params_fprop(Flash_fwd_params& params, params.softmax_lse_ptr = softmax_lse_d; // Set the dimensions. +#if defined(_MSC_VER) +#pragma warning(push) +#pragma warning(disable : 4267) // Ignore conversion from 'size_t' to 'int', possible loss of data +#pragma warning(disable : 4244) // Ignore conversion from 'double' to 'float', possible loss of data +#endif params.b = batch_size; params.h = num_heads; params.h_k = num_heads_k; @@ -119,6 +124,9 @@ void set_params_fprop(Flash_fwd_params& params, if (window_size_left >= 0 && window_size_right < 0) { window_size_right = seqlen_k; } +#if defined(_MSC_VER) +#pragma warning(pop) +#endif params.window_size_left = window_size_left; params.window_size_right = window_size_right; diff --git a/tools/ci_build/github/azure-pipelines/linux-gpu-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/linux-gpu-ci-pipeline.yml index 30f56f4b18aec..d3e4a2e009598 100644 --- a/tools/ci_build/github/azure-pipelines/linux-gpu-ci-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/linux-gpu-ci-pipeline.yml @@ -30,7 +30,7 @@ parameters: - name: CudaVersion displayName: CUDA version type: string - default: '11.8' + default: '12.2' values: - 11.8 - 12.2 diff --git a/tools/ci_build/github/azure-pipelines/linux-gpu-tensorrt-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/linux-gpu-tensorrt-ci-pipeline.yml index 78e3b166995ec..5c7108861052e 100644 --- a/tools/ci_build/github/azure-pipelines/linux-gpu-tensorrt-ci-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/linux-gpu-tensorrt-ci-pipeline.yml @@ -30,7 +30,7 @@ parameters: - name: CudaVersion displayName: CUDA version type: string - default: '11.8' + default: '12.2' values: - 11.8 - 12.2 diff --git a/tools/ci_build/github/azure-pipelines/linux-gpu-tensorrt-daily-perf-pipeline.yml b/tools/ci_build/github/azure-pipelines/linux-gpu-tensorrt-daily-perf-pipeline.yml index 7cfff805c3b3c..4ab1b4996a1db 100644 --- a/tools/ci_build/github/azure-pipelines/linux-gpu-tensorrt-daily-perf-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/linux-gpu-tensorrt-daily-perf-pipeline.yml @@ -8,14 +8,12 @@ parameters: - name: TrtVersion displayName: TensorRT Version type: string - default: 10.0.cuda_11_8_cudnn_8 + default: 10.2.cuda_12_5_cudnn_9 values: - - 8.4.cuda_11_6_cudnn_8 - - 8.5.cuda_11_8_cudnn_8 - 8.6.cuda_11_8_cudnn_8 - 8.6.cuda_12_3_cudnn_9 - - 10.0.cuda_11_8_cudnn_8 - - 10.0.cuda_12_4_cudnn_9 + - 10.2.cuda_11_8_cudnn_8 + - 10.2.cuda_12_5_cudnn_9 - BIN - name: UseTensorrtOssParser diff --git a/tools/ci_build/github/azure-pipelines/nuget-cuda-publishing-pipeline.yml b/tools/ci_build/github/azure-pipelines/nuget-cuda-publishing-pipeline.yml index 4bfd726f5c58c..aeb250e1e0cbc 100644 --- 
a/tools/ci_build/github/azure-pipelines/nuget-cuda-publishing-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/nuget-cuda-publishing-pipeline.yml @@ -6,6 +6,7 @@ resources: branches: include: - main + - rel-* branch: main parameters: @@ -16,15 +17,15 @@ parameters: variables: - name: ArtifactFeed ${{ if eq(parameters.isReleaseBuild, false) }}: - value: ort-cuda-12-nightly + value: ORT-Nightly ${{ else }}: value: onnxruntime-cuda-12 stages: -- template: stages/nuget-cuda-publishing-stage.yml - parameters: - artifact_feed: $(ArtifactFeed) + - template: stages/nuget-cuda-publishing-stage.yml + parameters: + artifact_feed: $(ArtifactFeed) -- template: stages/java-cuda-publishing-stage.yml - parameters: - artifact_feed: $(ArtifactFeed) + - template: stages/java-cuda-publishing-stage.yml + parameters: + artifact_feed: $(ArtifactFeed) \ No newline at end of file diff --git a/tools/ci_build/github/azure-pipelines/publish-nuget.yml b/tools/ci_build/github/azure-pipelines/publish-nuget.yml index e0c588413415b..206a9464de6ef 100644 --- a/tools/ci_build/github/azure-pipelines/publish-nuget.yml +++ b/tools/ci_build/github/azure-pipelines/publish-nuget.yml @@ -9,10 +9,22 @@ resources: - rel-* branch: main +parameters: + - name: isReleaseBuild + type: boolean + default: false + +variables: + - name: ArtifactFeed + ${{ if eq(parameters.isReleaseBuild, false) }}: + value: ort-cuda-11-nightly + ${{ else }}: + value: onnxruntime-cuda-11 + stages: - template: templates/publish-nuget-steps.yml parameters: - stage_name: 'Publish_NuGet_Packag_And_Report' + stage_name: 'Publish_NuGet_Package_And_Report' include_cpu_ep: true download_artifacts_steps: - download: build @@ -25,7 +37,11 @@ stages: artifact: 'drop-signed-nuget-Training-CPU' - script: move "$(Pipeline.Workspace)\build\drop-signed-nuget-Training-CPU\*" $(Build.BinariesDirectory)\nuget-artifact\final-package - - download: build - displayName: 'Download Pipeline Artifact - Signed NuGet Package' - artifact: 'drop-signed-nuget-GPU' - - script: move "$(Pipeline.Workspace)\build\drop-signed-nuget-GPU\*" $(Build.BinariesDirectory)\nuget-artifact\final-package + # Publish CUDA 11 Nuget/Java pkgs to ADO feed + - template: stages/nuget-cuda-publishing-stage.yml + parameters: + artifact_feed: $(ArtifactFeed) + + - template: stages/java-cuda-publishing-stage.yml + parameters: + artifact_feed: $(ArtifactFeed) \ No newline at end of file diff --git a/tools/ci_build/github/azure-pipelines/py-cuda-publishing-pipeline.yml b/tools/ci_build/github/azure-pipelines/py-cuda-publishing-pipeline.yml index 50e0ca3708d2d..1217163c07132 100644 --- a/tools/ci_build/github/azure-pipelines/py-cuda-publishing-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/py-cuda-publishing-pipeline.yml @@ -16,7 +16,7 @@ parameters: variables: - name: ArtifactFeed ${{ if eq(parameters.isReleaseBuild, false) }}: - value: ort-cuda-12-nightly + value: ORT-Nightly ${{ else }}: value: onnxruntime-cuda-12 diff --git a/tools/ci_build/github/azure-pipelines/stages/java-cuda-publishing-stage.yml b/tools/ci_build/github/azure-pipelines/stages/java-cuda-publishing-stage.yml index 70d92286b3964..946d651b795d4 100644 --- a/tools/ci_build/github/azure-pipelines/stages/java-cuda-publishing-stage.yml +++ b/tools/ci_build/github/azure-pipelines/stages/java-cuda-publishing-stage.yml @@ -8,7 +8,7 @@ stages: jobs: - job: JAR_Publishing_GPU #TD-DO: figure out a way to package nightly jar. 
Currently Java version are set from VERSION_NUMBER file - condition: ${{ eq(parameters.artifact_feed, 'onnxruntime-cuda-12') }} + condition: ${{ or(eq(parameters.artifact_feed, 'onnxruntime-cuda-11'), eq(parameters.artifact_feed, 'onnxruntime-cuda-12')) }} workspace: clean: all pool: 'onnxruntime-Win-CPU-2022' diff --git a/tools/ci_build/github/azure-pipelines/templates/jobs/download_win_gpu_library.yml b/tools/ci_build/github/azure-pipelines/templates/jobs/download_win_gpu_library.yml index de29a3de9fded..6459888a40aea 100644 --- a/tools/ci_build/github/azure-pipelines/templates/jobs/download_win_gpu_library.yml +++ b/tools/ci_build/github/azure-pipelines/templates/jobs/download_win_gpu_library.yml @@ -7,7 +7,7 @@ parameters: default: false - name: CudaVersion type: string - default: '11.8' + default: '12.2' values: - 11.8 - 12.2 diff --git a/tools/ci_build/github/azure-pipelines/templates/jobs/set-winenv.yml b/tools/ci_build/github/azure-pipelines/templates/jobs/set-winenv.yml index 63d521f1e7d9a..fba463b49016a 100644 --- a/tools/ci_build/github/azure-pipelines/templates/jobs/set-winenv.yml +++ b/tools/ci_build/github/azure-pipelines/templates/jobs/set-winenv.yml @@ -9,10 +9,10 @@ parameters: default: false - name: PrimaryCUDAVersion type: string - default: '11.8' + default: '12.2' - name: SecondaryCUDAVersion type: string - default: '12.2' + default: '11.8' steps: - ${{ if eq(parameters.DownloadCUDA, 'true') }}: diff --git a/tools/ci_build/github/azure-pipelines/win-gpu-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/win-gpu-ci-pipeline.yml index 438e51175c5b4..c5262880c4c55 100644 --- a/tools/ci_build/github/azure-pipelines/win-gpu-ci-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/win-gpu-ci-pipeline.yml @@ -28,6 +28,13 @@ pr: #### end trigger #### parameters: +- name: CudaVersion + displayName: CUDA version + type: string + default: '12.2' + values: + - 11.8 + - 12.2 - name: RunOnnxRuntimeTests displayName: Run Tests? 
type: boolean @@ -43,7 +50,7 @@ stages: EnvSetupScript: setup_env_cuda.bat buildArch: x64 additionalBuildFlags: >- - --enable_pybind --build_java --build_nodejs --use_cuda --cuda_home="$(Agent.TempDirectory)\v11.8" + --enable_pybind --build_java --build_nodejs --use_cuda --cuda_home="$(Agent.TempDirectory)\v${{ parameters.CudaVersion }}" --enable_cuda_profiling --enable_transformers_tool_test --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=86 --cmake_extra_defines onnxruntime_BUILD_UNIT_TESTS=ON @@ -65,7 +72,7 @@ stages: EnvSetupScript: setup_env_cuda.bat buildArch: x64 additionalBuildFlags: >- - --enable_pybind --enable_training --use_cuda --cuda_home="$(Agent.TempDirectory)\v11.8" + --enable_pybind --enable_training --use_cuda --cuda_home="$(Agent.TempDirectory)\v${{ parameters.CudaVersion }}" --skip_onnx_tests --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=86 msbuildPlatform: x64 @@ -105,7 +112,7 @@ stages: # note: need to specify `--gen_doc` when creating the build config so it has to be in additionalBuildFlags additionalBuildFlags: >- --gen_doc validate --skip_tests --enable_pybind --use_dml --use_cuda - --cuda_home="$(Agent.TempDirectory)\v11.8" + --cuda_home="$(Agent.TempDirectory)\v${{ parameters.CudaVersion }}" --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=86 --cmake_extra_defines onnxruntime_BUILD_UNIT_TESTS=OFF msbuildPlatform: x64 diff --git a/tools/ci_build/github/azure-pipelines/win-gpu-tensorrt-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/win-gpu-tensorrt-ci-pipeline.yml index 70c0c7d4a04e7..8c9ecdfb90191 100644 --- a/tools/ci_build/github/azure-pipelines/win-gpu-tensorrt-ci-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/win-gpu-tensorrt-ci-pipeline.yml @@ -26,6 +26,21 @@ pr: - 'js/web' - 'onnxruntime/core/providers/js' #### end trigger #### +parameters: +- name: CudaVersion + displayName: CUDA version + type: string + default: '12.2' + values: + - 11.8 + - 12.2 + +variables: + - name: win_trt_folder + ${{ if eq(parameters.CudaVersion, '11.8') }}: + value: TensorRT-10.2.0.19.Windows10.x86_64.cuda-11.8 + ${{ if eq(parameters.CudaVersion, '12.2') }}: + value: TensorRT-10.2.0.19.Windows10.x86_64.cuda-12.5 jobs: - job: 'build' @@ -55,7 +70,7 @@ jobs: WithCache: True Today: $(TODAY) AdditionalKey: "gpu-tensorrt | RelWithDebInfo" - BuildPyArguments: '--config RelWithDebInfo --parallel --use_binskim_compliant_compile_flags --build_dir $(Build.BinariesDirectory) --skip_submodule_sync --build_shared_lib --update --cmake_generator "Visual Studio 17 2022" --build_wheel --enable_onnx_tests --use_tensorrt --tensorrt_home="$(Agent.TempDirectory)\TensorRT-10.2.0.19.Windows10.x86_64.cuda-11.8" --cuda_home="$(Agent.TempDirectory)\v11.8" --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=86' + BuildPyArguments: '--config RelWithDebInfo --parallel --use_binskim_compliant_compile_flags --build_dir $(Build.BinariesDirectory) --skip_submodule_sync --build_shared_lib --update --cmake_generator "Visual Studio 17 2022" --build_wheel --enable_onnx_tests --use_tensorrt --tensorrt_home="$(Agent.TempDirectory)\${{ variables.win_trt_folder }}" --cuda_home="$(Agent.TempDirectory)\v${{ parameters.CudaVersion }}" --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=86' MsbuildArguments: $(MsbuildArguments) BuildArch: 'x64' Platform: 'x64' @@ -75,7 +90,7 @@ jobs: del wheel_filename_file python.exe -m pip install -q --upgrade %WHEEL_FILENAME% set PATH=$(Build.BinariesDirectory)\RelWithDebInfo\RelWithDebInfo;%PATH% - python $(Build.SourcesDirectory)\tools\ci_build\build.py --config RelWithDebInfo 
--use_binskim_compliant_compile_flags --build_dir $(Build.BinariesDirectory) --skip_submodule_sync --build_shared_lib --test --cmake_generator "Visual Studio 17 2022" --build_wheel --enable_onnx_tests --use_tensorrt --tensorrt_home="$(Agent.TempDirectory)\TensorRT-10.2.0.19.Windows10.x86_64.cuda-11.8" --cuda_home="$(Agent.TempDirectory)\v11.8" --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=75 + python $(Build.SourcesDirectory)\tools\ci_build\build.py --config RelWithDebInfo --use_binskim_compliant_compile_flags --build_dir $(Build.BinariesDirectory) --skip_submodule_sync --build_shared_lib --test --cmake_generator "Visual Studio 17 2022" --build_wheel --enable_onnx_tests --use_tensorrt --tensorrt_home="$(Agent.TempDirectory)\${{ variables.win_trt_folder }}" --cuda_home="$(Agent.TempDirectory)\v${{ parameters.CudaVersion }}" --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=86 workingDirectory: '$(Build.BinariesDirectory)\RelWithDebInfo\RelWithDebInfo' displayName: 'Run tests' diff --git a/tools/ci_build/github/linux/docker/Dockerfile.manylinux2_28_cuda b/tools/ci_build/github/linux/docker/Dockerfile.manylinux2_28_cuda index d96b342974273..07885ba65af8a 100644 --- a/tools/ci_build/github/linux/docker/Dockerfile.manylinux2_28_cuda +++ b/tools/ci_build/github/linux/docker/Dockerfile.manylinux2_28_cuda @@ -2,7 +2,7 @@ # Please overwrite BASEIMAGE, TRT_VERSION and other arguments with # --docker-build-args ' --build-arg BASEIMAGE=other_base_image --build-arg TRT_VERSION=other_trt_version etc...' # for other cuda version and TRT version -ARG BASEIMAGE=nvidia/cuda:11.8.0-cudnn8-devel-ubi8 +ARG BASEIMAGE=nvidia/cuda:12.5.1-cudnn-devel-ubi8 FROM $BASEIMAGE ARG TRT_VERSION diff --git a/tools/ci_build/github/linux/docker/Dockerfile.package_ubi8_cuda_tensorrt10_0 b/tools/ci_build/github/linux/docker/Dockerfile.package_ubi8_cuda_tensorrt10_0 index 2d3dc05285e3c..b587a7df554bd 100644 --- a/tools/ci_build/github/linux/docker/Dockerfile.package_ubi8_cuda_tensorrt10_0 +++ b/tools/ci_build/github/linux/docker/Dockerfile.package_ubi8_cuda_tensorrt10_0 @@ -2,11 +2,11 @@ # Copyright (c) Microsoft Corporation. All rights reserved. # Licensed under the MIT License. # -------------------------------------------------------------- -# Dockerfile to Test ONNX Runtime on UBI8 with TensorRT 10.0 and CUDA 11.8 by default +# Dockerfile to Test ONNX Runtime on UBI8 with TensorRT 10 and CUDA 12 by default # Build base image with required system packages -ARG BASEIMAGE=nvidia/cuda:11.8.0-cudnn8-devel-ubi8 -ARG TRT_VERSION=10.2.0.19-1.cuda11.8 +ARG BASEIMAGE=nvidia/cuda:12.5.1-cudnn-devel-ubi8 +ARG TRT_VERSION=10.2.0.19-1.cuda12.4 FROM $BASEIMAGE AS base ARG TRT_VERSION ENV PATH /opt/python/cp38-cp38/bin:/usr/local/nvidia/bin:/usr/local/cuda/bin:/usr/src/tensorrt/bin:${PATH} diff --git a/tools/ci_build/github/windows/setup_env_cuda.bat b/tools/ci_build/github/windows/setup_env_cuda.bat index 2233f7611ab6a..f93938e2a9009 100644 --- a/tools/ci_build/github/windows/setup_env_cuda.bat +++ b/tools/ci_build/github/windows/setup_env_cuda.bat @@ -1,17 +1,17 @@ REM Copyright (c) Microsoft Corporation. All rights reserved. REM Licensed under the MIT License. 
-if exist PATH=%AGENT_TEMPDIRECTORY%\v11.8\ ( -set PATH=%AGENT_TEMPDIRECTORY%\v11.8\bin;%AGENT_TEMPDIRECTORY%\v11.8\extras\CUPTI\lib64;%PATH% +if exist PATH=%AGENT_TEMPDIRECTORY%\v12.2\ ( +set PATH=%AGENT_TEMPDIRECTORY%\v12.2\bin;%AGENT_TEMPDIRECTORY%\v12.2\extras\CUPTI\lib64;%PATH% ) else ( - set PATH=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.8\bin;C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.8\extras\CUPTI\lib64;%PATH% + set PATH=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.2\bin;C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.2\extras\CUPTI\lib64;%PATH% ) -@REM The default version is still cuda v11.8, because set cuda v12.2 after it -if exist PATH=%AGENT_TEMPDIRECTORY%\v12.2\ ( - set PATH=%PATH%;%AGENT_TEMPDIRECTORY%\v12.2\bin;%AGENT_TEMPDIRECTORY%\v12.2\extras\CUPTI\lib64 +@REM The default version is still cuda v12.2, because set cuda v11.8 after it +if exist PATH=%AGENT_TEMPDIRECTORY%\v11.8\ ( + set PATH=%PATH%;%AGENT_TEMPDIRECTORY%\v11.8\bin;%AGENT_TEMPDIRECTORY%\v11.8\extras\CUPTI\lib64 ) else ( - set PATH=%PATH%;C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.2\bin;C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.2\extras\CUPTI\lib64 + set PATH=%PATH%;C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.8\bin;C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.8\extras\CUPTI\lib64 ) set GRADLE_OPTS=-Dorg.gradle.daemon=false diff --git a/tools/ci_build/github/windows/setup_env_gpu.bat b/tools/ci_build/github/windows/setup_env_gpu.bat index 6c59866ea925a..35e4f7e302430 100644 --- a/tools/ci_build/github/windows/setup_env_gpu.bat +++ b/tools/ci_build/github/windows/setup_env_gpu.bat @@ -1,17 +1,17 @@ REM Copyright (c) Microsoft Corporation. All rights reserved. REM Licensed under the MIT License. 
-if exist PATH=%AGENT_TEMPDIRECTORY%\v11.8\ (
-  set PATH=%AGENT_TEMPDIRECTORY%\v11.8\bin;%AGENT_TEMPDIRECTORY%\v11.8\extras\CUPTI\lib64;%PATH%
+if exist PATH=%AGENT_TEMPDIRECTORY%\v12.2\ (
+  set PATH=%AGENT_TEMPDIRECTORY%\v12.2\bin;%AGENT_TEMPDIRECTORY%\v12.2\extras\CUPTI\lib64;%PATH%
 ) else (
-  set PATH=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.8\bin;C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.8\extras\CUPTI\lib64;%PATH%
+  set PATH=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.2\bin;C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.2\extras\CUPTI\lib64;%PATH%
 )
-set PATH=%AGENT_TEMPDIRECTORY%\TensorRT-10.2.0.19.Windows10.x86_64.cuda-11.8\lib;%PATH%
+set PATH=%AGENT_TEMPDIRECTORY%\TensorRT-10.2.0.19.Windows10.x86_64.cuda-12.5\lib;%PATH%

-@REM The default version is still cuda v11.8, because set cuda v12.2 after it
-set PATH=%PATH%;%AGENT_TEMPDIRECTORY%\TensorRT-10.2.0.19.Windows10.x86_64.cuda-12.5\lib
-if exist PATH=%AGENT_TEMPDIRECTORY%\v12.2\ (
-  set PATH=%PATH%;%AGENT_TEMPDIRECTORY%\v12.2\bin;%AGENT_TEMPDIRECTORY%\v12.2\extras\CUPTI\lib64
+@REM The default version is still cuda v12.2, because set cuda v11.8 after it
+set PATH=%PATH%;%AGENT_TEMPDIRECTORY%\TensorRT-10.2.0.19.Windows10.x86_64.cuda-11.8\lib
+if exist PATH=%AGENT_TEMPDIRECTORY%\v11.8\ (
+  set PATH=%PATH%;%AGENT_TEMPDIRECTORY%\v11.8\bin;%AGENT_TEMPDIRECTORY%\v11.8\extras\CUPTI\lib64
 ) else (
   set PATH=%PATH%;C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\\bin;C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\\extras\CUPTI\lib64
 )
diff --git a/tools/ci_build/github/windows/setup_env_trt.bat b/tools/ci_build/github/windows/setup_env_trt.bat
index 249bb98815897..7ec7558edab39 100644
--- a/tools/ci_build/github/windows/setup_env_trt.bat
+++ b/tools/ci_build/github/windows/setup_env_trt.bat
@@ -1,11 +1,11 @@
 REM Copyright (c) Microsoft Corporation. All rights reserved.
 REM Licensed under the MIT License.
-if exist PATH=%AGENT_TEMPDIRECTORY%\v11.8\ (
-  set PATH=%PATH%;%AGENT_TEMPDIRECTORY%\v11.8\bin;%AGENT_TEMPDIRECTORY%\v11.8\extras\CUPTI\lib64
+if exist PATH=%AGENT_TEMPDIRECTORY%\v12.2\ (
+  set PATH=%PATH%;%AGENT_TEMPDIRECTORY%\v12.2\bin;%AGENT_TEMPDIRECTORY%\v12.2\extras\CUPTI\lib64
 ) else (
-  set PATH=%PATH%;C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.8\bin;C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.8\extras\CUPTI\lib64
+  set PATH=%PATH%;C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.2\bin;C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.2\extras\CUPTI\lib64
 )
-set PATH=%AGENT_TEMPDIRECTORY%\TensorRT-10.2.0.19.Windows10.x86_64.cuda-11.8\lib;%PATH%
+set PATH=%AGENT_TEMPDIRECTORY%\TensorRT-10.2.0.19.Windows10.x86_64.cuda-12.5\lib;%PATH%
 set GRADLE_OPTS=-Dorg.gradle.daemon=false
 set CUDA_MODULE_LOADING=LAZY
\ No newline at end of file

From 4167b68abf2715c52439874ad9ddeaebfb3dafcb Mon Sep 17 00:00:00 2001
From: Changming Sun
Date: Thu, 25 Jul 2024 10:58:34 -0700
Subject: [PATCH 19/57] Split ondevice training cpu packaging pipeline to a
 separated pipeline (#21485)

### Description
Right now our "Zip-Nuget-Java-Nodejs Packaging Pipeline" is too big. This OnDevice training part is independent of the others, so it can be split out. Then our NPM Packaging pipeline will not depend on this training stuff.
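Splitting the pipelines also makes the artifact dependencies between stages easier to audit: a stage that downloads another stage's artifact generally has to name that stage in `dependsOn`, which is exactly the kind of declaration discussed under Motivation below. A minimal Azure Pipelines sketch of the pattern (the stage layout here is illustrative only, not the real pipeline definition):

```
stages:
- stage: Linux_C_API_Packaging_CPU_x64
  jobs:
  - job: build
    steps:
    # publish the shared libraries so downstream stages can consume them
    - publish: $(Build.ArtifactStagingDirectory)
      artifact: onnxruntime-linux-x64

- stage: NuGet_Test_Linux_Training_CPU
  # without this dependsOn, the download step below has no guarantee the artifact exists yet
  dependsOn: Linux_C_API_Packaging_CPU_x64
  jobs:
  - job: test
    steps:
    - download: current
      artifact: onnxruntime-linux-x64
```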
### Motivation and Context
Similar to #21235.
Also, this PR fixed a problem: the "NuGet_Test_Linux_Training_CPU" job downloads artifacts from "onnxruntime-linux-x64" to get the custom-op shared libs, but the job forgot to declare that it depends on "Linux_C_API_Packaging_CPU_x64", which produces that artifact. Such problems can be hard to find when a pipeline gets big.
---
 .../c-api-noopenmp-packaging-pipelines.yml    | 11 ----
 .../c-api-training-packaging-pipelines.yml    | 51 +++++++++++++++++++
 ...device-training-cpu-packaging-pipeline.yml |  1 +
 3 files changed, 52 insertions(+), 11 deletions(-)
 create mode 100644 tools/ci_build/github/azure-pipelines/c-api-training-packaging-pipelines.yml

diff --git a/tools/ci_build/github/azure-pipelines/c-api-noopenmp-packaging-pipelines.yml b/tools/ci_build/github/azure-pipelines/c-api-noopenmp-packaging-pipelines.yml
index 2eb7046d80e7a..51b73acd93dc8 100644
--- a/tools/ci_build/github/azure-pipelines/c-api-noopenmp-packaging-pipelines.yml
+++ b/tools/ci_build/github/azure-pipelines/c-api-noopenmp-packaging-pipelines.yml
@@ -112,17 +112,6 @@ stages:
     SpecificArtifact: ${{ parameters.SpecificArtifact }}
     BuildId: ${{ parameters.BuildId }}

-- template: templates/ondevice-training-cpu-packaging-pipeline.yml
-  parameters:
-    RunOnnxRuntimeTests: ${{ parameters.RunOnnxRuntimeTests }}
-    DoCompliance: ${{ parameters.DoCompliance }}
-    DoEsrp: ${{ parameters.DoEsrp }}
-    IsReleaseBuild: ${{ parameters.IsReleaseBuild }}
-    OrtNugetPackageId: 'Microsoft.ML.OnnxRuntime.Training'
-    AdditionalBuildFlags: '--enable_training_apis'
-    AdditionalWinBuildFlags: '--enable_onnx_tests --enable_wcos'
-    BuildVariant: 'default'
-
 - template: stages/java-cuda-packaging-stage.yml
   parameters:
     CudaVersion: 11.8
diff --git a/tools/ci_build/github/azure-pipelines/c-api-training-packaging-pipelines.yml b/tools/ci_build/github/azure-pipelines/c-api-training-packaging-pipelines.yml
new file mode 100644
index 0000000000000..aecece05a0e58
--- /dev/null
+++ b/tools/ci_build/github/azure-pipelines/c-api-training-packaging-pipelines.yml
@@ -0,0 +1,51 @@
+parameters:
+- name: RunOnnxRuntimeTests
+  displayName: Run Tests?
+  type: boolean
+  default: true
+
+- name: DoCompliance
+  displayName: Run Compliance Tasks?
+  type: boolean
+  default: true
+
+- name: DoEsrp
+  displayName: Run code sign tasks? Must be true if you are doing an ONNX Runtime release
+  type: boolean
+  default: true
+
+- name: IsReleaseBuild
+  displayName: Is a release build? Set it to true if you are doing an ONNX Runtime release.
+  type: boolean
+  default: false
+- name: PreReleaseVersionSuffixString
+  displayName: Suffix added to pre-release package version. Only used if IsReleaseBuild is true. Denotes the type of pre-release package.
+  type: string
+  values:
+  - alpha
+  - beta
+  - rc
+  - none
+  default: none
+
+- name: PreReleaseVersionSuffixNumber
+  displayName: Number added to pre-release package version. Only used if IsReleaseBuild is true. Denotes the sequence of a pre-release package.
+ type: number + default: 0 + +stages: +- template: stages/set_packaging_variables_stage.yml + parameters: + IsReleaseBuild: ${{ parameters.IsReleaseBuild }} + PreReleaseVersionSuffixString: ${{ parameters.PreReleaseVersionSuffixString }} + PreReleaseVersionSuffixNumber: ${{ parameters.PreReleaseVersionSuffixNumber }} +- template: templates/ondevice-training-cpu-packaging-pipeline.yml + parameters: + RunOnnxRuntimeTests: ${{ parameters.RunOnnxRuntimeTests }} + DoCompliance: ${{ parameters.DoCompliance }} + DoEsrp: ${{ parameters.DoEsrp }} + IsReleaseBuild: ${{ parameters.IsReleaseBuild }} + OrtNugetPackageId: 'Microsoft.ML.OnnxRuntime.Training' + AdditionalBuildFlags: '--enable_training_apis' + AdditionalWinBuildFlags: '--enable_onnx_tests --enable_wcos' + BuildVariant: 'default' \ No newline at end of file diff --git a/tools/ci_build/github/azure-pipelines/templates/ondevice-training-cpu-packaging-pipeline.yml b/tools/ci_build/github/azure-pipelines/templates/ondevice-training-cpu-packaging-pipeline.yml index fb9ff65fe8534..022f85cc0a463 100644 --- a/tools/ci_build/github/azure-pipelines/templates/ondevice-training-cpu-packaging-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/templates/ondevice-training-cpu-packaging-pipeline.yml @@ -317,3 +317,4 @@ stages: ArtifactSuffix: 'Training-CPU' StageSuffix: 'Training_CPU' NativePackagePrefix: 'onnxruntime-training' + CustomOpArtifactName: 'onnxruntime-training-linux-x64' From c23517859eb67a0a03f9777e9c741b9ebaabd6eb Mon Sep 17 00:00:00 2001 From: Hector Li Date: Thu, 25 Jul 2024 11:44:10 -0700 Subject: [PATCH 20/57] Qnn batchnorm support input with rank 2 (#21469) ### Description Qnn BatchNorm support input with rank 2 Update Quantization script to quantize BatchNorm bias using int32 --------- Co-authored-by: Justin Chu --- .../selectors_actions/qdq_selectors.cc | 2 +- .../opbuilder/batch_norm_op_builder.cc | 21 +++- .../builder/opbuilder/expand_op_builder.cc | 2 +- .../core/providers/qnn/builder/qnn_utils.cc | 23 ++++- .../core/providers/qnn/builder/qnn_utils.h | 5 +- .../tools/quantization/operators/norm.py | 2 +- .../python/tools/quantization/registry.py | 1 + .../test/providers/qnn/batch_norm_htp_test.cc | 99 ++++++++++++------- .../test/providers/qnn/qnn_test_utils.h | 6 +- 9 files changed, 111 insertions(+), 50 deletions(-) diff --git a/onnxruntime/core/optimizer/qdq_transformer/selectors_actions/qdq_selectors.cc b/onnxruntime/core/optimizer/qdq_transformer/selectors_actions/qdq_selectors.cc index 6e93445c7c5c7..e271ae8df3356 100644 --- a/onnxruntime/core/optimizer/qdq_transformer/selectors_actions/qdq_selectors.cc +++ b/onnxruntime/core/optimizer/qdq_transformer/selectors_actions/qdq_selectors.cc @@ -632,7 +632,7 @@ bool BatchNormalizationNodeGroupSelector::Check(const GraphViewer& graph_viewer, const Node& node, const std::vector& dq_nodes, const std::vector& q_nodes) const { - if (!CheckQDQNodes(graph_viewer, node, dq_nodes, q_nodes)) { + if (!CheckQDQNodes(graph_viewer, node, dq_nodes, q_nodes, 3)) { return false; } diff --git a/onnxruntime/core/providers/qnn/builder/opbuilder/batch_norm_op_builder.cc b/onnxruntime/core/providers/qnn/builder/opbuilder/batch_norm_op_builder.cc index 16a058854a743..07abcf1c7bf84 100644 --- a/onnxruntime/core/providers/qnn/builder/opbuilder/batch_norm_op_builder.cc +++ b/onnxruntime/core/providers/qnn/builder/opbuilder/batch_norm_op_builder.cc @@ -392,15 +392,23 @@ class BatchNormOpBuilder : public BaseOpBuilder { const double rmin, QnnQuantParamsWrapper& quant_param, std::vector& raw_tensor) 
const { + bool symmetric = false; if (info.quant_param.IsQuantized()) { - raw_tensor.resize(double_tensor.size()); + size_t data_size = double_tensor.size(); + // QNN BatchNorm int32 bias requires symmetric quantizated + if (info.qnn_data_type == QNN_DATATYPE_SFIXED_POINT_32) { + data_size *= sizeof(int32_t); + symmetric = true; + } + raw_tensor.resize(data_size); float scale = 0.0f; - int zero_point = 0; + int32_t zero_point = 0; ORT_RETURN_IF_ERROR(utils::GetQuantParams(static_cast(rmin), static_cast(rmax), info.qnn_data_type, scale, - zero_point)); + zero_point, + symmetric)); quant_param = QnnQuantParamsWrapper(scale, zero_point); for (size_t i = 0; i < double_tensor.size(); ++i) { // onnx only supports 8 bits quantization @@ -411,6 +419,10 @@ class BatchNormOpBuilder : public BaseOpBuilder { } else if (info.qnn_data_type == QNN_DATATYPE_SFIXED_POINT_8) { int8_t quant_value = static_cast(quant_value_int); raw_tensor[i] = *reinterpret_cast(&quant_value); + } else if (info.qnn_data_type == QNN_DATATYPE_SFIXED_POINT_32) { + int32_t quant_value = static_cast(quant_value_int); + size_t pos = i * sizeof(int32_t); + std::memcpy(&raw_tensor[pos], reinterpret_cast(&quant_value), sizeof(int32_t)); } else { // TODO(adrianlizarraga): Should support 16-bit quantization as well. ORT_RETURN_IF(true, "Qnn Data Type: %d not supported yet.", info.qnn_data_type); @@ -444,8 +456,7 @@ Status BatchNormOpBuilder::IsOpSupported(QnnModelWrapper& qnn_model_wrapper, ORT_RETURN_IF_NOT(qnn_model_wrapper.GetOnnxShape(inputs[0].node_arg, input_shape), "Cannot get shape of input 0."); const size_t input_rank = input_shape.size(); - ORT_RETURN_IF(input_rank <= 2 || input_rank > 4, - "QNN BatchNorm only supports input ranks of size 3 or 4."); + ORT_RETURN_IF(input_rank > 4, "QNN BatchNorm only supports input ranks of size <= 4."); const uint32_t num_channels = input_shape[1]; diff --git a/onnxruntime/core/providers/qnn/builder/opbuilder/expand_op_builder.cc b/onnxruntime/core/providers/qnn/builder/opbuilder/expand_op_builder.cc index d0f6ce9effd9e..64f676aaa9875 100644 --- a/onnxruntime/core/providers/qnn/builder/opbuilder/expand_op_builder.cc +++ b/onnxruntime/core/providers/qnn/builder/opbuilder/expand_op_builder.cc @@ -79,7 +79,7 @@ Status ExpandOpBuilder::ProcessInputs(QnnModelWrapper& qnn_model_wrapper, if (is_quantized_tensor) { ORT_RETURN_IF_ERROR(utils::GetQnnDataType(true, type_proto, qnn_data_type)); float scale = 0.0f; - int zero_point = 0; + int32_t zero_point = 0; float rmax = 1.0f; float rmin = 1.0f; ORT_RETURN_IF_ERROR(utils::GetQuantParams(rmin, diff --git a/onnxruntime/core/providers/qnn/builder/qnn_utils.cc b/onnxruntime/core/providers/qnn/builder/qnn_utils.cc index c2e500b8980ad..d6c93a8f226e8 100644 --- a/onnxruntime/core/providers/qnn/builder/qnn_utils.cc +++ b/onnxruntime/core/providers/qnn/builder/qnn_utils.cc @@ -509,6 +509,9 @@ Status GetQminQmax(const Qnn_DataType_t qnn_data_type, } else if (qnn_data_type == QNN_DATATYPE_UFIXED_POINT_16) { qmin = static_cast(std::numeric_limits::min()); qmax = static_cast(std::numeric_limits::max()); + } else if (qnn_data_type == QNN_DATATYPE_SFIXED_POINT_32) { + qmin = static_cast(std::numeric_limits::min()); + qmax = static_cast(std::numeric_limits::max()); } else { ORT_RETURN_IF(true, "Qnn Data Type: %d not supported yet.", qnn_data_type); } @@ -519,15 +522,27 @@ Status GetQuantParams(float rmin, float rmax, const Qnn_DataType_t qnn_data_type, float& scale, - int& zero_point) { + int32_t& zero_point, + bool symmetric) { std::tie(rmin, rmax) = 
CheckMinMax(rmin, rmax); + if (symmetric) { + float abs_max = std::max(abs(rmax), abs(rmin)); + rmax = abs_max; + rmin = -abs_max; + } + float qmin = 0.0f; float qmax = 255.0f; ORT_RETURN_IF_ERROR(GetQminQmax(qnn_data_type, qmin, qmax)); scale = (rmax - rmin) / (qmax - qmin); - const float initial_zero_point = qmin - (rmin / scale); - zero_point = static_cast(RoundHalfToEven(Saturate(qmax, qmin, initial_zero_point))); + float initial_zero_point = 0.0f; + if (symmetric) { + initial_zero_point = std::round(rmin + rmax) / 2; + } else { + initial_zero_point = qmin - (rmin / scale); + } + zero_point = static_cast(RoundHalfToEven(Saturate(qmax, qmin, initial_zero_point))); // To match QNN quantization definition zero_point = 0 - zero_point; return Status::OK(); @@ -541,7 +556,7 @@ double Dequantize(int32_t offset, float scale, const double quant_value) { Status Quantize(const double double_value, const float scale, - const int zero_point, + const int32_t zero_point, const Qnn_DataType_t qnn_data_type, int& quant_value) { int qmin = 0; diff --git a/onnxruntime/core/providers/qnn/builder/qnn_utils.h b/onnxruntime/core/providers/qnn/builder/qnn_utils.h index 2392040d284b7..aa4a27460563f 100644 --- a/onnxruntime/core/providers/qnn/builder/qnn_utils.h +++ b/onnxruntime/core/providers/qnn/builder/qnn_utils.h @@ -93,13 +93,14 @@ Status GetQuantParams(float rmin, float rmax, const Qnn_DataType_t qnn_data_type, float& scale, - int& zero_point); + int32_t& zero_point, + bool symmetric = false); double Dequantize(int32_t offset, float scale, const double quant_value); Status Quantize(const double double_value, const float scale, - const int zero_point, + const int32_t zero_point, const Qnn_DataType_t qnn_data_type, int& quant_value); diff --git a/onnxruntime/python/tools/quantization/operators/norm.py b/onnxruntime/python/tools/quantization/operators/norm.py index 8c4c6c78582ac..10d96cc49855e 100644 --- a/onnxruntime/python/tools/quantization/operators/norm.py +++ b/onnxruntime/python/tools/quantization/operators/norm.py @@ -12,7 +12,7 @@ def __init__(self, onnx_quantizer, onnx_node): def quantize(self): node = self.node - assert node.op_type == "InstanceNormalization" or node.op_type == "LayerNormalization" + assert node.op_type in {"InstanceNormalization", "LayerNormalization", "BatchNormalization"} # Input self.quantizer.quantize_activation_tensor(node.input[0]) diff --git a/onnxruntime/python/tools/quantization/registry.py b/onnxruntime/python/tools/quantization/registry.py index b00e830a2a366..caac829126e38 100644 --- a/onnxruntime/python/tools/quantization/registry.py +++ b/onnxruntime/python/tools/quantization/registry.py @@ -82,6 +82,7 @@ "Where": QDQWhere, "InstanceNormalization": QDQNormalization, "LayerNormalization": QDQNormalization, + "BatchNormalization": QDQNormalization, } diff --git a/onnxruntime/test/providers/qnn/batch_norm_htp_test.cc b/onnxruntime/test/providers/qnn/batch_norm_htp_test.cc index 036c5760ed560..0a39413a4ec1b 100644 --- a/onnxruntime/test/providers/qnn/batch_norm_htp_test.cc +++ b/onnxruntime/test/providers/qnn/batch_norm_htp_test.cc @@ -80,8 +80,7 @@ template static GetTestModelFn BuildBatchNormTestCase(const TestInputDef& input_def, const TestInputDef& scale_def, const TestInputDef& bias_def) { - ORT_ENFORCE(input_def.IsRawData()); // Need raw data to compute mean and variance inputs. - ORT_ENFORCE(input_def.GetShape().size() > 2); // Need at least rank 3 data for convenience. + ORT_ENFORCE(input_def.IsRawData()); // Need raw data to compute mean and variance inputs. 
return [input_def, scale_def, bias_def](ModelTestBuilder& builder) { const auto& input_shape = input_def.GetShape(); @@ -103,45 +102,39 @@ static GetTestModelFn BuildBatchNormTestCase(const TestInputDef& inp }; } -template +template GetTestQDQModelFn BuildQDQBatchNormTestCase(const TestInputDef& input_def, const TestInputDef& scale_def, const TestInputDef& bias_def) { - ORT_ENFORCE(input_def.IsRawData()); // Need raw data to compute mean and variance inputs. - ORT_ENFORCE(input_def.GetShape().size() > 2); // Need at least rank 3 data for convenience. + ORT_ENFORCE(input_def.IsRawData()); // Need raw data to compute mean and variance inputs. return [input_def, scale_def, bias_def](ModelTestBuilder& builder, std::vector>& output_qparams) { const auto& input_shape = input_def.GetShape(); const auto& input_data = input_def.GetRawData(); const int64_t num_channels = input_shape[1]; - + bool symmetric = sizeof(InputQType) == sizeof(uint16_t); NodeArg* input = MakeTestInput(builder, input_def); - QuantParams input_qparams = GetTestInputQuantParams(input_def); + QuantParams input_qparams = GetTestInputQuantParams(input_def, symmetric); NodeArg* input_qdq = AddQDQNodePair(builder, input, input_qparams.scale, input_qparams.zero_point); NodeArg* scale = MakeTestInput(builder, scale_def); QuantParams scale_qparams = GetTestInputQuantParams(scale_def); NodeArg* scale_qdq = AddQDQNodePair(builder, scale, scale_qparams.scale, scale_qparams.zero_point); - NodeArg* bias = MakeTestInput(builder, bias_def); - QuantParams bias_qparams = GetTestInputQuantParams(bias_def); - NodeArg* bias_qdq = AddQDQNodePair(builder, bias, bias_qparams.scale, bias_qparams.zero_point); + NodeArg* bias_qdq; + // bias (as int32) => DQ => + bias_qdq = MakeTestQDQBiasInput(builder, bias_def, input_qparams.scale * scale_qparams.scale, true); std::vector mean_vals(num_channels); std::vector var_vals(num_channels); ComputeChannelMeanAndVar(input_data, input_shape, mean_vals, var_vals); NodeArg* mean = builder.MakeInitializer({num_channels}, mean_vals); - QuantParams mean_qparams = GetDataQuantParams(mean_vals); - NodeArg* mean_qdq = AddQDQNodePair(builder, mean, mean_qparams.scale, mean_qparams.zero_point); - NodeArg* var = builder.MakeInitializer({num_channels}, var_vals); - QuantParams var_qparams = GetDataQuantParams(var_vals); - NodeArg* var_qdq = AddQDQNodePair(builder, var, var_qparams.scale, var_qparams.zero_point); auto* batchnorm_output = builder.MakeIntermediate(); - builder.AddNode("BatchNormalization", {input_qdq, scale_qdq, bias_qdq, mean_qdq, var_qdq}, + builder.AddNode("BatchNormalization", {input_qdq, scale_qdq, bias_qdq, mean, var}, {batchnorm_output}); AddQDQNodePairWithOutputAsGraphOutput(builder, batchnorm_output, output_qparams[0].scale, output_qparams[0].zero_point); @@ -155,6 +148,7 @@ GetTestQDQModelFn BuildQDQBatchNormTestCase(const TestInputDef static void RunBatchNormQDQTest(const TestInputDef& input_def, const TestInputDef& scale_def, const TestInputDef& bias_def, @@ -169,9 +163,9 @@ static void RunBatchNormQDQTest(const TestInputDef& input_def, // Runs model with DQ-> InstanceNorm -> Q and compares the outputs of the CPU and QNN EPs. 
TestQDQModelAccuracy(BuildBatchNormTestCase(input_def, scale_def, bias_def), - BuildQDQBatchNormTestCase(input_def, scale_def, bias_def), + BuildQDQBatchNormTestCase(input_def, scale_def, bias_def), provider_options, - 11, + 21, expected_ep_assignment, tolerance); } @@ -199,31 +193,69 @@ static void RunBatchNormFP16Test(const TestInputDef& input_def, expected_ep_assignment); } +// BatchNor QDQ model, input with rank 2. +TEST_F(QnnHTPBackendTests, BatchNormRank2) { + constexpr int64_t num_channels = 2; + + RunBatchNormQDQTest(TestInputDef({4, num_channels}, false, + {-8.0f, -6.0f, -4.0f, -2.0f, 0.0f, 1.1f, 3.3f, 8.0f}), // Input data + TestInputDef({num_channels}, true, {1.0f, 2.0f}), // Scale initializer + TestInputDef({num_channels}, true, {1.1f, 2.1f}), // Bias initializer + ExpectedEPNodeAssignment::All); +} + // TODO: FIX TRANSLATION!!! // Check that QNN compiles DQ -> BatchNormalization -> Q as a single unit. // Use an input of rank 3. +// Accuracy issue with Linux simulator, not sure with Android device +// Inaccuracy detected for output 'output_0', element 1 +// output_range=4.8666362762451172, tolerance=0.40000000596046448%. +// Expected val (f32@CPU_EP): 1.0999999046325684 +// qdq@QNN_EP val: -0.17176364362239838 (err: 1.2717635631561279, err/output_range: 26.132291793823242%) +// qdq@CPU_EP val: 1.1069211959838867 (err: 0.0069212913513183594, err/output_range: 0.14221921563148499%) +// abs(qdq@QNN_EP - qdq@CPU_EP) / output_range = 25.990072250366211% +// +// Inaccuracy detected for output 'output_0', element 2 +// output_range=4.8666362762451172, tolerance=0.40000000596046448%. +// Expected val (f32@CPU_EP): 2.3247356414794922 +// qdq@QNN_EP val: -0.17176364362239838 (err: 2.4964993000030518, err/output_range: 51.298248291015625%) +// qdq@CPU_EP val: 2.3474364280700684 (err: 0.022700786590576172, err/output_range: 0.46645742654800415%) +#if defined(_WIN32) TEST_F(QnnHTPBackendTests, BatchNorm1D) { constexpr int64_t num_channels = 2; - RunBatchNormQDQTest(TestInputDef({1, num_channels, 3}, false, {-5.0f, -4.0f, -3.0f, 0.0f, 2.0f, 5.0f}), // Input data - TestInputDef({num_channels}, true, {1.0f, 2.0f}), // Scale initializer - TestInputDef({num_channels}, true, {1.1f, 2.1f}), // Bias initializer - ExpectedEPNodeAssignment::All); + RunBatchNormQDQTest(TestInputDef({1, num_channels, 3}, false, + {-5.0f, -4.0f, -3.0f, 0.0f, 2.0f, 5.0f}), // Input data + TestInputDef({num_channels}, true, {1.0f, 2.0f}), // Scale initializer + TestInputDef({num_channels}, true, {1.1f, 2.1f}), // Bias initializer + ExpectedEPNodeAssignment::All); +} +#endif + +// Check that QNN compiles DQ -> BatchNormalization -> Q as a single unit. +// Use an input of rank 4. +TEST_F(QnnHTPBackendTests, BatchNorm2D_a8w8) { + constexpr int64_t num_channels = 2; + std::vector input_data = {-8.0f, -6.0f, -4.0f, -2.0f, 0.0f, 1.1f, 3.3f, 8.0f, + -7.0f, -5.0f, -3.0f, -1.0f, 0.0f, 2.1f, 4.3f, 7.0f}; + + RunBatchNormQDQTest(TestInputDef({2, num_channels, 2, 2}, false, input_data), // Input data + TestInputDef({num_channels}, true, {1.0f, 2.0f}), // Scale initializer + TestInputDef({num_channels}, true, {1.1f, 2.1f}), // Bias initializer + ExpectedEPNodeAssignment::All); } // Check that QNN compiles DQ -> BatchNormalization -> Q as a single unit. // Use an input of rank 4. 
-TEST_F(QnnHTPBackendTests, BatchNorm2D) { +TEST_F(QnnHTPBackendTests, BatchNorm2D_a16w8) { constexpr int64_t num_channels = 2; std::vector input_data = {-8.0f, -6.0f, -4.0f, -2.0f, 0.0f, 1.1f, 3.3f, 8.0f, -7.0f, -5.0f, -3.0f, -1.0f, 0.0f, 2.1f, 4.3f, 7.0f}; - RunBatchNormQDQTest(TestInputDef({2, num_channels, 2, 2}, false, input_data), // Input data - TestInputDef({num_channels}, true, {1.0f, 2.0f}), // Scale initializer - TestInputDef({num_channels}, true, {1.1f, 2.1f}), // Bias initializer - ExpectedEPNodeAssignment::All, - // Require a slightly increased tolerance on Windows ARM64 (from 0.4% to 0.6%). - QDQTolerance(0.006f)); + RunBatchNormQDQTest(TestInputDef({2, num_channels, 2, 2}, false, input_data), // Input data + TestInputDef({num_channels}, true, {1.0f, 2.0f}), // Scale initializer + TestInputDef({num_channels}, true, {1.1f, 2.1f}), // Bias initializer + ExpectedEPNodeAssignment::All); } // Test FP16 BatchNormalization on the HTP backend. @@ -272,10 +304,11 @@ TEST_F(QnnHTPBackendTests, BatchNorm_FP32_as_FP16) { TEST_F(QnnHTPBackendTests, BatchNorm3D) { constexpr int64_t num_channels = 2; constexpr int64_t num_elems = 1 * num_channels * 3 * 4 * 5; - RunBatchNormQDQTest(TestInputDef({1, num_channels, 3, 4, 5}, false, std::vector(num_elems)), // Input data (all zeros) - TestInputDef({num_channels}, true, {1.0f, 2.0f}), // Scale initializer - TestInputDef({num_channels}, true, {1.1f, 2.1f}), // Bias initializer - ExpectedEPNodeAssignment::None); + RunBatchNormQDQTest(TestInputDef({1, num_channels, 3, 4, 5}, false, + std::vector(num_elems)), // Input data (all zeros) + TestInputDef({num_channels}, true, {1.0f, 2.0f}), // Scale initializer + TestInputDef({num_channels}, true, {1.1f, 2.1f}), // Bias initializer + ExpectedEPNodeAssignment::None); } #endif // defined(__aarch64__) || defined(_M_ARM64) || defined(__linux__) diff --git a/onnxruntime/test/providers/qnn/qnn_test_utils.h b/onnxruntime/test/providers/qnn/qnn_test_utils.h index eb03270dc8461..3a6753e9b6131 100644 --- a/onnxruntime/test/providers/qnn/qnn_test_utils.h +++ b/onnxruntime/test/providers/qnn/qnn_test_utils.h @@ -42,7 +42,7 @@ struct QuantParams { symmetric); } - static QuantParams Compute(float rmin, float rmax, QType qmin, QType qmax, bool symmetric = false) { + static QuantParams Compute(float rmin, float rmax, float qmin, float qmax, bool symmetric = false) { // Ensure a minimum range of 0.0001 (required by QNN) rmax = std::max(rmax, rmin + 0.0001f); @@ -56,8 +56,8 @@ struct QuantParams { rmin = -abs_max; } - float qmin_flt = static_cast(qmin); - float qmax_flt = static_cast(qmax); + float qmin_flt = qmin; + float qmax_flt = qmax; const float scale = (rmax - rmin) / (qmax_flt - qmin_flt); float initial_zero_point = 0.0f; From 3cdf4b917b4c679f3f4152145f36c7705b12d2c3 Mon Sep 17 00:00:00 2001 From: Scott McKay Date: Fri, 26 Jul 2024 07:36:23 +1000 Subject: [PATCH 21/57] Fix Android CI Pipeline code coverage failure (#21504) ### Description Current failure is due to a version mismatch. Use llvm-cov from the Android NDK instead of the system gcov so that the version is correct. Also comment out publishing to the Azure dashboard to simplify the setup. The CI prints out the stats for review by developers. 
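In practice the change boils down to exporting GCOV so the coverage tooling runs the NDK's `llvm-cov` in gcov mode instead of the host `gcov` before the report is generated; the updated pipeline step looks like this (same commands as in the diff below, path follows the standard linux-x86_64 NDK prebuilt layout):

```
  - script: |
      export GCOV="$ANDROID_NDK_HOME/toolchains/llvm/prebuilt/linux-x86_64/bin/llvm-cov gcov"
      python3 -m pip install gcovr
      python3 tools/ci_build/coverage.py --build_dir build_nnapi --android_sdk_path $ANDROID_HOME
    displayName: Retrieve runtime code coverage files from the emulator and analyze
```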
### Motivation and Context Fix CI pipeline --- ...ndroid-x86_64-crosscompile-ci-pipeline.yml | 32 +++++++++---------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/tools/ci_build/github/azure-pipelines/android-x86_64-crosscompile-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/android-x86_64-crosscompile-ci-pipeline.yml index 10d9a9a24d88a..bcfe4cde9ce50 100644 --- a/tools/ci_build/github/azure-pipelines/android-x86_64-crosscompile-ci-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/android-x86_64-crosscompile-ci-pipeline.yml @@ -174,10 +174,10 @@ stages: - template: templates/clean-agent-build-directory-step.yml -- stage: MASTER_BUILD_STAGE - # The below jobs only run on master build. +- stage: MAIN_BUILD_STAGE + # The below jobs only run on build of main branch. # because coverage report is hard to support in cross machines. - displayName: NNAPI MASTER BUILD&TEST + displayName: NNAPI MAIN BUILD&TEST dependsOn: [] condition: in(variables['Build.Reason'], 'IndividualCI', 'BatchedCI') jobs: @@ -225,29 +225,29 @@ stages: --code_coverage displayName: NNAPI EP, Build, Test, CodeCoverage on Android Emulator + # We need to use llvm-cov from the NDK. - script: | - python3 -m pip install gcovr && \ - python3 tools/ci_build/coverage.py \ - --build_dir build_nnapi \ - --android_sdk_path $ANDROID_HOME + export GCOV="$ANDROID_NDK_HOME/toolchains/llvm/prebuilt/linux-x86_64/bin/llvm-cov gcov" + python3 -m pip install gcovr + python3 tools/ci_build/coverage.py --build_dir build_nnapi --android_sdk_path $ANDROID_HOME displayName: Retrieve runtime code coverage files from the emulator and analyze - script: cat '$(Build.SourcesDirectory)/build_nnapi/Debug/coverage_rpt.txt' displayName: Print coverage report + # - task: AzureCLI@2 + # displayName: 'Post Android Code Coverage To DashBoard' + # inputs: + # azureSubscription: AIInfraBuild + # scriptType: bash + # scriptPath: $(Build.SourcesDirectory)/tools/ci_build/github/linux/upload_code_coverage_data.sh + # arguments: '"$(Build.SourcesDirectory)/build_nnapi/Debug/coverage_rpt.txt" "https://dev.azure.com/onnxruntime/onnxruntime/_build/results?buildId=$(Build.BuildId)" arm android nnapi' + # workingDirectory: '$(Build.BinariesDirectory)' + - script: /bin/bash tools/ci_build/github/linux/ort_minimal/nnapi_minimal_build_minimal_ort_and_run_tests.sh $(pwd) # Build Minimal ORT with NNAPI and reduced Ops, run unit tests on Android Emulator displayName: Build Minimal ORT with NNAPI and run tests - - task: AzureCLI@2 - displayName: 'Post Android Code Coverage To DashBoard' - inputs: - azureSubscription: AIInfraBuild - scriptType: bash - scriptPath: $(Build.SourcesDirectory)/tools/ci_build/github/linux/upload_code_coverage_data.sh - arguments: '"$(Build.SourcesDirectory)/build_nnapi/Debug/coverage_rpt.txt" "https://dev.azure.com/onnxruntime/onnxruntime/_build/results?buildId=$(Build.BuildId)" arm android nnapi' - workingDirectory: '$(Build.BinariesDirectory)' - - template: templates/use-android-emulator.yml parameters: stop: true From b0e1f7f7988952166ec867600d9eb92fde0be157 Mon Sep 17 00:00:00 2001 From: Scott McKay Date: Fri, 26 Jul 2024 08:29:33 +1000 Subject: [PATCH 22/57] CoreML: Aggregated changes to add all required ops for priority model (#21472) ### Description Add these changes to one PR to simplify checkin - Add Concat (#21423) - Add DepthToSpace (#21426) - Add LeakyRelu (#21453) - Add test scripts (#21427) - Add ability to set coreml flags from python (#21434) Other changes - updated partitioning utils to support dropping 
constant initializers from a ComputeCapability's inputs. - noticed that the list of inputs to the coreml model was unexpectedly long due to this - we copy constant initializers to a CoreML model so don't need the originals, and if they remain as inputs ORT can't free them as they appear to be in use. ### Motivation and Context --- .lintrunner.toml | 1 + include/onnxruntime/core/graph/graph.h | 24 ++-- onnxruntime/core/graph/graph.cc | 60 +++++---- .../builders/impl/activation_op_builder.cc | 13 +- .../coreml/builders/impl/builder_utils.cc | 24 +++- .../coreml/builders/impl/builder_utils.h | 20 +++ .../coreml/builders/impl/concat_op_builder.cc | 85 ++++++++---- .../builders/impl/depthtospace_op_builder.cc | 124 +++++++++++++++--- .../builders/impl/gridsample_op_builder.cc | 4 +- .../coreml/builders/op_builder_factory.cc | 23 ++-- .../coreml/coreml_execution_provider.cc | 4 +- .../DebugMLProgram.md | 2 + .../mlprogram_test_scripts/concat_test.py | 33 +++++ .../convtranspose_test.py | 42 ++++++ .../depthtospace_test.py | 51 +++++++ .../coreml/mlprogram_test_scripts/div_test.py | 103 +++++++++++++++ .../dump_mlprogram_model.py | 0 .../mlprogram_test_scripts/gridsample_test.py | 114 ++++++++++++++++ .../mlprogram_test_scripts/resize_test.py | 51 +++++++ .../core/providers/partitioning_utils.cc | 39 +++--- .../core/providers/partitioning_utils.h | 25 ++-- .../providers/qnn/qnn_execution_provider.cc | 5 +- onnxruntime/core/session/inference_session.cc | 5 + .../python/onnxruntime_pybind_state.cc | 29 +++- .../test/optimizer/qdq_transformer_test.cc | 3 +- .../cpu/tensor/space_depth_ops_test.cc | 31 +++++ .../apple/coreml_supported_mlprogram_ops.md | 5 +- 27 files changed, 783 insertions(+), 137 deletions(-) rename onnxruntime/core/providers/coreml/{ => mlprogram_test_scripts}/DebugMLProgram.md (97%) create mode 100644 onnxruntime/core/providers/coreml/mlprogram_test_scripts/concat_test.py create mode 100644 onnxruntime/core/providers/coreml/mlprogram_test_scripts/convtranspose_test.py create mode 100644 onnxruntime/core/providers/coreml/mlprogram_test_scripts/depthtospace_test.py create mode 100644 onnxruntime/core/providers/coreml/mlprogram_test_scripts/div_test.py rename onnxruntime/core/providers/coreml/{ => mlprogram_test_scripts}/dump_mlprogram_model.py (100%) create mode 100644 onnxruntime/core/providers/coreml/mlprogram_test_scripts/gridsample_test.py create mode 100644 onnxruntime/core/providers/coreml/mlprogram_test_scripts/resize_test.py diff --git a/.lintrunner.toml b/.lintrunner.toml index e6d06b34726fe..e1b24b2955b03 100644 --- a/.lintrunner.toml +++ b/.lintrunner.toml @@ -137,6 +137,7 @@ exclude_patterns = [ 'onnxruntime/core/mickey/gemm/**', # CUTLASS based libs recommends NO automatic code formatting 'winml/lib/Api.Image/shaders/**', # Contains data chunks 'onnxruntime/contrib_ops/cuda/bert/flash_attention/flash_fwd_launch_template.h', # Bool Switches hang Clang + 'onnxruntime/core/providers/coreml/mlprogram_test_scripts/**', # test scripts only ] command = [ 'python', diff --git a/include/onnxruntime/core/graph/graph.h b/include/onnxruntime/core/graph/graph.h index 9289e14c17dd1..c51f38553c3b4 100644 --- a/include/onnxruntime/core/graph/graph.h +++ b/include/onnxruntime/core/graph/graph.h @@ -1408,6 +1408,11 @@ class Graph { // NOLINT(clang-analyzer-optin.performance.Padding): preserve exi RuntimeOptimizationRecordContainer& MutableRuntimeOptimizations() { return runtime_optimizations_; } + + // We don't run Graph::Resolve() on an ORT format model, but a compiling EP may copy 
initializers to its + // compiled model during partitioning, leaving them unused in the ORT Graph. To allow the memory to be freed + // we need to manually run the cleanup that would usually happen as part of Graph::Resolve. + Status RemovedUnusedInitializersOrtFormat(); #endif // !defined(ORT_MINIMAL_BUILD) || defined(ORT_EXTENDED_MINIMAL_BUILD) // This friendship relationship should only be used to call Graph::Graph and @@ -1541,12 +1546,6 @@ class Graph { // NOLINT(clang-analyzer-optin.performance.Padding): preserve exi common::Status PerformTypeAndShapeInferencing(const ResolveOptions& options); - // Recursively find all subgraphs including nested subgraphs - void FindAllSubgraphs(std::vector& subgraphs); - - // Iterate this Graph instance and all subgraphs, calling the provided function for each. - common::Status ForThisAndAllSubgraphs(const std::vector& subgraphs, std::function func); - common::Status InferAndVerifyTypeMatch(Node& node, const ONNX_NAMESPACE::OpSchema& op, const ResolveOptions& options); // perform type and shape inferencing on the subgraph and Resolve to validate @@ -1576,9 +1575,6 @@ class Graph { // NOLINT(clang-analyzer-optin.performance.Padding): preserve exi // Implementation for initializer replacement Status ReplaceInitializedTensorImpl(ONNX_NAMESPACE::TensorProto new_initializer, bool is_external); - // Clear all unused initializers and NodeArgs - void CleanUnusedInitializersAndNodeArgs(const std::unordered_set* initializer_names_to_preserve = nullptr); - std::vector CreateNodeArgs(const google::protobuf::RepeatedPtrField& names, const ArgNameToTypeMap& name_to_type_map); @@ -1587,6 +1583,16 @@ class Graph { // NOLINT(clang-analyzer-optin.performance.Padding): preserve exi #endif // !defined(ORT_MINIMAL_BUILD) #if !defined(ORT_MINIMAL_BUILD) || defined(ORT_EXTENDED_MINIMAL_BUILD) + + // Recursively find all subgraphs including nested subgraphs + void FindAllSubgraphs(std::vector& subgraphs); + + // Iterate this Graph instance and all subgraphs, calling the provided function for each. 
+ common::Status ForThisAndAllSubgraphs(const std::vector& subgraphs, std::function func); + + // Clear all unused initializers and NodeArgs + void CleanUnusedInitializersAndNodeArgs(const std::unordered_set* initializer_names_to_preserve = nullptr); + Status PopulateNodeArgToProducerConsumerLookupsFromNodes(); template diff --git a/onnxruntime/core/graph/graph.cc b/onnxruntime/core/graph/graph.cc index 442a0db933d65..e950d68947b91 100644 --- a/onnxruntime/core/graph/graph.cc +++ b/onnxruntime/core/graph/graph.cc @@ -3254,27 +3254,6 @@ Status Graph::PerformTypeAndShapeInferencing(const ResolveOptions& options) { return Status::OK(); } -void Graph::FindAllSubgraphs(std::vector& subgraphs) { - for (auto& node : Nodes()) { - for (auto& subgraph : node.MutableSubgraphs()) { - subgraphs.push_back(subgraph.get()); - subgraph->FindAllSubgraphs(subgraphs); - } - } -} - -Status Graph::ForThisAndAllSubgraphs(const std::vector& subgraphs, std::function func) { - auto status = func(*this); - ORT_RETURN_IF_ERROR(status); - - for (auto& subgraph : subgraphs) { - status = func(*subgraph); - ORT_RETURN_IF_ERROR(status); - } - - return status; -} - Status Graph::Resolve(const ResolveOptions& options) { if (parent_graph_) { // Resolve must start at the top level graph in-order to handle outer scope @@ -3387,6 +3366,39 @@ void Graph::AddInitializedTensor(const TensorProto& tensor) { ORT_IGNORE_RETURN_VALUE(GetOrCreateNodeArg(tensor.name(), &t)); } } + +void Graph::FindAllSubgraphs(std::vector& subgraphs) { + for (auto& node : Nodes()) { + for (auto& subgraph : node.MutableSubgraphs()) { + subgraphs.push_back(subgraph.get()); + subgraph->FindAllSubgraphs(subgraphs); + } + } +} + +Status Graph::ForThisAndAllSubgraphs(const std::vector& subgraphs, std::function func) { + auto status = func(*this); + ORT_RETURN_IF_ERROR(status); + + for (auto& subgraph : subgraphs) { + status = func(*subgraph); + ORT_RETURN_IF_ERROR(status); + } + + return status; +} + +Status Graph::RemovedUnusedInitializersOrtFormat() { + std::vector all_subgraphs; + FindAllSubgraphs(all_subgraphs); + auto cleanup_func = [](Graph& graph) { + graph.CleanUnusedInitializersAndNodeArgs(nullptr); + return Status::OK(); + }; + + auto result = ForThisAndAllSubgraphs(all_subgraphs, cleanup_func); + return result; +} #endif // !defined(ORT_MINIMAL_BUILD) || defined(ORT_EXTENDED_MINIMAL_BUILD) const std::string& Graph::Name() const noexcept { @@ -4122,6 +4134,9 @@ void Graph::ToGraphProtoInternal(ONNX_NAMESPACE::GraphProto& graph_proto) const } } +#endif // !defined(ORT_MINIMAL_BUILD) + +#if !defined(ORT_MINIMAL_BUILD) || defined(ORT_EXTENDED_MINIMAL_BUILD) void Graph::CleanUnusedInitializersAndNodeArgs(const std::unordered_set* initializer_names_to_preserve) { // Node Args being used std::unordered_set used_args; @@ -4253,8 +4268,7 @@ void Graph::CleanUnusedInitializersAndNodeArgs(const std::unordered_set op = model_builder.CreateOperation(node, coreml_op_type); AddOperationInput(*op, "x", node.InputDefs()[0]->Name()); + + if (add_alpha) { + NodeAttrHelper helper(node); + const auto alpha = helper.Get("alpha", 0.01f); + AddOperationInput(*op, "alpha", model_builder.AddScalarConstant(op->type(), "alpha", alpha)); + } + AddOperationOutput(*op, *node.OutputDefs()[0]); model_builder.AddOperation(std::move(op)); @@ -198,7 +209,7 @@ bool ActivationOpBuilder::IsOpSupportedImpl(const Node& node, const OpBuilderInp #if defined(COREML_ENABLE_MLPROGRAM) if (input_params.create_mlprogram) { - if (op_type == "PRelu" || op_type == "LeakyRelu") { + if (op_type == 
"PRelu") { // TODO: ML Program supports this so should be easy to enable return false; } } else diff --git a/onnxruntime/core/providers/coreml/builders/impl/builder_utils.cc b/onnxruntime/core/providers/coreml/builders/impl/builder_utils.cc index ebb3f97895f06..e02186d3aee89 100644 --- a/onnxruntime/core/providers/coreml/builders/impl/builder_utils.cc +++ b/onnxruntime/core/providers/coreml/builders/impl/builder_utils.cc @@ -309,11 +309,33 @@ COREML_SPEC::MILSpec::NamedValueType CreateNamedTensorValueType(const NodeArg& n void AddOperationInput(MILSpec::Operation& op, std::string_view input_name, std::string_view value_name) { MILSpec::Argument arg; - arg.mutable_arguments()->Add()->set_name(std::string(value_name)); + arg.mutable_arguments()->Add()->set_name(value_name.data(), value_name.size()); (*op.mutable_inputs())[input_name] = std::move(arg); } +void AddOperationVariadicInput(MILSpec::Operation& op, std::string_view input_name, + const std::vector& value_names) { + MILSpec::Argument arg; + for (const auto& value : value_names) { + arg.mutable_arguments()->Add()->set_name(value.data(), value.size()); + } + + (*op.mutable_inputs())[input_name] = std::move(arg); +} + +void AddIntermediateOperationOutput(COREML_SPEC::MILSpec::Operation& op, std::string_view output_name, + int32_t element_type, std::optional> shape) { + auto& outputs = *op.mutable_outputs(); + auto& output_arg = *outputs.Add(); + output_arg.set_name(output_name.data(), output_name.size()); + + MILSpec::ValueType& value = *output_arg.mutable_type(); + MILSpec::TensorType& tensor_type = *value.mutable_tensortype(); + + SetTensorTypeInfo(tensor_type, OnnxDataTypeToMILSpec(element_type), shape, /*convert_scalar*/ true); +} + void AddOperationOutput(COREML_SPEC::MILSpec::Operation& op, const NodeArg& output, std::optional override_element_type) { auto& outputs = *op.mutable_outputs(); diff --git a/onnxruntime/core/providers/coreml/builders/impl/builder_utils.h b/onnxruntime/core/providers/coreml/builders/impl/builder_utils.h index f012e6af0d718..475ce79b0a812 100644 --- a/onnxruntime/core/providers/coreml/builders/impl/builder_utils.h +++ b/onnxruntime/core/providers/coreml/builders/impl/builder_utils.h @@ -129,6 +129,26 @@ COREML_SPEC::MILSpec::NamedValueType CreateNamedTensorValueType(const NodeArg& n void AddOperationInput(COREML_SPEC::MILSpec::Operation& op, std::string_view input_name, std::string_view value_name); +/// +/// Add a variadic input argument to a MILSpec::Operation +/// +/// Operation to update. +/// The input name defined by the spec for the operation. +/// The input value names. +void AddOperationVariadicInput(COREML_SPEC::MILSpec::Operation& op, std::string_view input_name, + const std::vector& value_names); + +/// Add an output to a MILSpec::Operation for an intermediate operation when the implementation is composed of +/// multiple MLProgram operations. In this case we don't have a NodeArg for the output. +/// +/// Operation to update. +/// Name of the intermediate output. Create using ModelBuilder::GetUniqueName. +/// onnx::TensorProto_DataType element type of the output. +/// int32_t as that is what TensorShapeProto uses to store the value. +/// Shape of the output if known. +void AddIntermediateOperationOutput(COREML_SPEC::MILSpec::Operation& op, std::string_view output_name, + int32_t element_type, std::optional> shape); + /// /// Add an output to a MILSpec::Operation. Name, data type and shape are used from the NodeArg. 
/// diff --git a/onnxruntime/core/providers/coreml/builders/impl/concat_op_builder.cc b/onnxruntime/core/providers/coreml/builders/impl/concat_op_builder.cc index 34193318a0264..9ea0030290abd 100644 --- a/onnxruntime/core/providers/coreml/builders/impl/concat_op_builder.cc +++ b/onnxruntime/core/providers/coreml/builders/impl/concat_op_builder.cc @@ -4,6 +4,7 @@ #include "core/providers/common.h" #include "core/providers/coreml/builders/helper.h" #include "core/providers/coreml/builders/impl/base_op_builder.h" +#include "core/providers/coreml/builders/impl/builder_utils.h" #include "core/providers/coreml/builders/model_builder.h" #include "core/providers/coreml/builders/op_builder_factory.h" #include "core/providers/coreml/shape_utils.h" @@ -18,27 +19,51 @@ class ConcatOpBuilder : public BaseOpBuilder { bool IsOpSupportedImpl(const Node& node, const OpBuilderInputParams& input_params, const logging::Logger& logger) const override; + + bool SupportsMLProgram() const override { return true; } }; Status ConcatOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, const Node& node, const logging::Logger& logger) const { - std::unique_ptr layer = model_builder.CreateNNLayer(node); - - layer->mutable_concat()->set_sequenceconcat(false); - - for (const auto* input : node.InputDefs()) { - LOGS(logger, VERBOSE) << "input name " << input->Name(); - *layer->mutable_input()->Add() = input->Name(); +#if defined(COREML_ENABLE_MLPROGRAM) + if (model_builder.CreateMLProgram()) { + using namespace CoreML::Specification::MILSpec; // NOLINT + + NodeAttrHelper helper(node); + const auto axis = helper.GetInt64("axis"); // required + const auto interleave = false; + + std::unique_ptr op = model_builder.CreateOperation(node, "concat"); + std::vector input_names; + for (const auto* input : node.InputDefs()) { + input_names.emplace_back(input->Name()); + } + AddOperationVariadicInput(*op, "values", input_names); + AddOperationInput(*op, "axis", model_builder.AddScalarConstant(op->type(), "axis", *axis)); + AddOperationInput(*op, "interleave", model_builder.AddScalarConstant(op->type(), "interleave", interleave)); + AddOperationOutput(*op, *node.OutputDefs()[0]); + model_builder.AddOperation(std::move(op)); + } else // NOLINT +#endif // defined(COREML_ENABLE_MLPROGRAM) + { + std::unique_ptr layer = model_builder.CreateNNLayer(node); + + layer->mutable_concat()->set_sequenceconcat(false); + + for (const auto* input : node.InputDefs()) { + LOGS(logger, VERBOSE) << "input name " << input->Name(); + *layer->mutable_input()->Add() = input->Name(); + } + + *layer->mutable_output()->Add() = node.OutputDefs()[0]->Name(); + + model_builder.AddLayer(std::move(layer)); } - - *layer->mutable_output()->Add() = node.OutputDefs()[0]->Name(); - - model_builder.AddLayer(std::move(layer)); return Status::OK(); } -bool ConcatOpBuilder::IsOpSupportedImpl(const Node& node, const OpBuilderInputParams& /* input_params */, +bool ConcatOpBuilder::IsOpSupportedImpl(const Node& node, const OpBuilderInputParams& input_params, const logging::Logger& logger) const { const auto& input_defs = node.InputDefs(); if (input_defs.size() < 2) { @@ -50,23 +75,25 @@ bool ConcatOpBuilder::IsOpSupportedImpl(const Node& node, const OpBuilderInputPa if (!GetShape(*input_defs[0], input_shape, logger)) return false; - auto rank = input_shape.size(); - if (rank != 4) { - // For some reason, the concat in CoreML running on 3d tensor will concat on wrong axis - // Instead of concat on axis 0, it will concat on axis 1 - // Disable Concat support for 3d 
tensor for now - // TODO, add ExpandDims and Squeeze, 3d -ExpandDims-> 4d -> Concat -Squeeze-> 3d - LOGS(logger, VERBOSE) << "Concat only support 4d shape for now, input is " - << rank << "d shape"; - return false; - } - - NodeAttrHelper helper(node); - auto axis = static_cast(HandleNegativeAxis(helper.Get("axis", 1), rank)); - if (rank != axis + 3) { - LOGS(logger, VERBOSE) << "Concat only support axis to be -3, actual axis: " << axis - << ", actual rank: " << rank; - return false; + if (!input_params.create_mlprogram) { + auto rank = input_shape.size(); + if (rank != 4) { + // For some reason, the concat in CoreML running on 3d tensor will concat on wrong axis + // Instead of concat on axis 0, it will concat on axis 1 + // Disable Concat support for 3d tensor for now + // TODO: add ExpandDims and Squeeze, 3d -ExpandDims-> 4d -> Concat -Squeeze-> 3d + LOGS(logger, VERBOSE) << "Concat only support 4d shape for now, input is " + << rank << "d shape"; + return false; + } + + NodeAttrHelper helper(node); + auto axis = static_cast(HandleNegativeAxis(helper.Get("axis", 1), rank)); + if (rank != axis + 3) { + LOGS(logger, VERBOSE) << "Concat only support axis to be -3, actual axis: " << axis + << ", actual rank: " << rank; + return false; + } } return true; diff --git a/onnxruntime/core/providers/coreml/builders/impl/depthtospace_op_builder.cc b/onnxruntime/core/providers/coreml/builders/impl/depthtospace_op_builder.cc index 1eba312b2577b..bec2461ffbc52 100644 --- a/onnxruntime/core/providers/coreml/builders/impl/depthtospace_op_builder.cc +++ b/onnxruntime/core/providers/coreml/builders/impl/depthtospace_op_builder.cc @@ -4,6 +4,7 @@ #include "core/common/safeint.h" #include "core/providers/coreml/builders/helper.h" #include "core/providers/coreml/builders/impl/base_op_builder.h" +#include "core/providers/coreml/builders/impl/builder_utils.h" #include "core/providers/coreml/builders/model_builder.h" #include "core/providers/coreml/builders/op_builder_factory.h" #include "core/providers/coreml/shape_utils.h" @@ -18,52 +19,133 @@ class DepthToSpaceOpBuilder : public BaseOpBuilder { bool IsOpSupportedImpl(const Node& node, const OpBuilderInputParams& input_params, const logging::Logger& logger) const override; + + bool SupportsMLProgram() const override { return true; } }; Status DepthToSpaceOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, const Node& node, - const logging::Logger& /* logger */) const { - std::unique_ptr layer = model_builder.CreateNNLayer(node); - + [[maybe_unused]] const logging::Logger& logger) const { const auto& input_defs = node.InputDefs(); const auto& output_defs = node.OutputDefs(); const auto& input_name = input_defs[0]->Name(); - const auto& output_name = output_defs[0]->Name(); - uint64_t blocksize = SafeInt(node.GetAttributes().at("blocksize").i()); + NodeAttrHelper helper(node); + int64_t blocksize = *helper.GetInt64("blocksize"); // required attribute + +#if defined(COREML_ENABLE_MLPROGRAM) + if (model_builder.CreateMLProgram()) { + using namespace CoreML::Specification::MILSpec; // NOLINT + + const auto mode = helper.Get("mode", "DCR"); + + if (mode == "DCR") { + // DCR is directly supported + // https://apple.github.io/coremltools/source/coremltools.converters.mil.mil.ops.defs.html#coremltools.converters.mil.mil.ops.defs.iOS15.tensor_transformation.depth_to_space + // Validated with depth_to_space.py. 
+ auto op = model_builder.CreateOperation(node, "depth_to_space"); + AddOperationInput(*op, "x", input_name); + AddOperationInput(*op, "block_size", model_builder.AddScalarConstant(op->type(), "blocksize", blocksize)); + AddOperationOutput(*op, *output_defs[0]); + model_builder.AddOperation(std::move(op)); + } else { + // CRD is manual. there may be a perf cost from the Reshape's (typically that happens on CPU) but if the input + // is a fixed size hopefully CoreML is smart enough to handle that aspect during model compilation instead + // of execution. + + // https://github.com/onnx/onnx/blob/main/docs/Operators.md#depthtospace + // b, c, h, w = x.shape + // tmp = np.reshape(x, [b, c // (blocksize ** 2), blocksize, blocksize, h, w]) + // tmp = np.transpose(tmp, [0, 1, 4, 2, 5, 3]) + // y = np.reshape(tmp, [b, c // (blocksize ** 2), h * blocksize, w * blocksize]) + // + // CoreML has a 5D limit, so we merge the batch dim into the channel dim as that doesn't change the data + // movement. + // First reshape is to [b * c // (blocksize ** 2), blocksize, blocksize, h, w] + // Transpose is to [0, 3, 1, 4, 2] + + // we checked shape was static in IsOpSupportedImpl so this should never fail + std::vector input_shape; + ORT_RETURN_IF_NOT(GetStaticShape(*input_defs[0], input_shape, logger), "Failed to get input shape"); + const int32_t elem_type = static_cast(ONNX_NAMESPACE::TensorProto_DataType_FLOAT); + + // reshape to [b * c // (blocksize ** 2), blocksize, blocksize, h, w] + auto reshape1 = model_builder.CreateOperation(node, "reshape", "pre"); + std::vector shape1 = {input_shape[0] * input_shape[1] / (blocksize * blocksize), + blocksize, blocksize, input_shape[2], input_shape[3]}; + AddOperationInput(*reshape1, "x", input_name); + AddOperationInput(*reshape1, "shape", model_builder.AddConstant(reshape1->type(), "shape", shape1)); + const auto& reshape1_output = model_builder.GetUniqueName(node, "reshape1"); + AddIntermediateOperationOutput(*reshape1, reshape1_output, elem_type, shape1); + + // transpose to [0, 3, 1, 4, 2] + auto transpose = model_builder.CreateOperation(node, "transpose"); + std::vector perm = {0, 3, 1, 4, 2}; + std::vector shape2 = {shape1[0], shape1[3], shape1[1], shape1[4], shape1[2]}; + AddOperationInput(*transpose, "x", reshape1_output); + AddOperationInput(*transpose, "perm", model_builder.AddConstant(transpose->type(), "perm", perm)); + const auto& transpose_output = model_builder.GetUniqueName(node, "transpose"); + AddIntermediateOperationOutput(*transpose, transpose_output, elem_type, shape2); + + // reshape to [b, c // (blocksize ** 2), h * blocksize, w * blocksize] + auto reshape2 = model_builder.CreateOperation(node, "reshape", "post"); + std::vector shape3 = {input_shape[0], + input_shape[1] / (blocksize * blocksize), + input_shape[2] * blocksize, + input_shape[3] * blocksize}; + AddOperationInput(*reshape2, "x", transpose_output); + AddOperationInput(*reshape2, "shape", model_builder.AddConstant(reshape2->type(), "shape", shape3)); + + AddOperationOutput(*reshape2, *output_defs[0]); + + model_builder.AddOperation(std::move(reshape1)); + model_builder.AddOperation(std::move(transpose)); + model_builder.AddOperation(std::move(reshape2)); + } + } else // NOLINT +#endif // if defined(COREML_ENABLE_MLPROGRAM) + { + const auto& output_name = output_defs[0]->Name(); + std::unique_ptr layer = model_builder.CreateNNLayer(node); - auto* coreml_depthtospace = layer->mutable_reorganizedata(); - coreml_depthtospace->set_blocksize(blocksize); - 
coreml_depthtospace->set_mode(CoreML::Specification::ReorganizeDataLayerParams_ReorganizationType:: - ReorganizeDataLayerParams_ReorganizationType_DEPTH_TO_SPACE); + auto* coreml_depthtospace = layer->mutable_reorganizedata(); + coreml_depthtospace->set_blocksize(static_cast(blocksize)); + coreml_depthtospace->set_mode(CoreML::Specification::ReorganizeDataLayerParams_ReorganizationType:: + ReorganizeDataLayerParams_ReorganizationType_DEPTH_TO_SPACE); - *layer->mutable_input()->Add() = input_name; - *layer->mutable_output()->Add() = output_name; + *layer->mutable_input()->Add() = input_name; + *layer->mutable_output()->Add() = output_name; + + model_builder.AddLayer(std::move(layer)); + } - model_builder.AddLayer(std::move(layer)); return Status::OK(); } -bool DepthToSpaceOpBuilder::IsOpSupportedImpl(const Node& node, const OpBuilderInputParams& /*input_params*/, +bool DepthToSpaceOpBuilder::IsOpSupportedImpl(const Node& node, const OpBuilderInputParams& input_params, const logging::Logger& logger) const { const auto& input_defs = node.InputDefs(); std::vector input_shape; if (!GetShape(*input_defs[0], input_shape, logger)) { + LOGS(logger, VERBOSE) << "DepthToSpace: no input shape"; return false; } - const auto input_rank = input_shape.size(); - if (input_rank < 4) { - LOGS(logger, VERBOSE) << "DepthToSpace does not support input shape of " << input_rank << "d shape."; - } + // ONNX and CoreML both require 4D input so no need to check the shape here. NodeAttrHelper helper(node); - if (node.SinceVersion() >= 11) { - // For now, only DCR mode DepthToSpace is supported - const auto mode = helper.Get("mode", "DCR"); + const auto mode = helper.Get("mode", "DCR"); + + if (input_params.create_mlprogram) { + if (mode == "CRD" && !IsStaticShape(input_shape)) { + // we need to manually implement the logic with a Reshape, so we need to know the shape to do that + LOGS(logger, VERBOSE) << "DepthToSpace: CRD mode requires static shape"; + return false; + } + } else { if (mode != "DCR") { - LOGS(logger, VERBOSE) << "The mode: " << mode << "of DepthToSpace is not supported in CoreML EP for now."; + LOGS(logger, VERBOSE) << "DepthToSpace: " << mode << " mode is not supported"; return false; } } diff --git a/onnxruntime/core/providers/coreml/builders/impl/gridsample_op_builder.cc b/onnxruntime/core/providers/coreml/builders/impl/gridsample_op_builder.cc index bfc665e0ac716..9caec290ea5a2 100644 --- a/onnxruntime/core/providers/coreml/builders/impl/gridsample_op_builder.cc +++ b/onnxruntime/core/providers/coreml/builders/impl/gridsample_op_builder.cc @@ -19,8 +19,8 @@ std::string_view GetMode(const NodeAttrHelper& helper) { // opset 20+ uses linear, nearest, cubic // bilinear is what CoreML uses, so prefer that // bicubic/cubic isn't supported - - const auto& mode = helper.Get("mode", "linear"); + static const std::string default_mode = "linear"; // static in case we ever return the default as a string_view + const auto& mode = helper.Get("mode", default_mode); if (mode == "linear") { return "bilinear"; } diff --git a/onnxruntime/core/providers/coreml/builders/op_builder_factory.cc b/onnxruntime/core/providers/coreml/builders/op_builder_factory.cc index 535712f096010..b0006b24e7d75 100644 --- a/onnxruntime/core/providers/coreml/builders/op_builder_factory.cc +++ b/onnxruntime/core/providers/coreml/builders/op_builder_factory.cc @@ -15,28 +15,28 @@ namespace coreml { static OpBuilderRegistrations CreateOpBuilderRegistrations() { OpBuilderRegistrations op_registrations; + // Activations + 
CreateActivationOpBuilder("Sigmoid", op_registrations); + CreateActivationOpBuilder("Tanh", op_registrations); + CreateActivationOpBuilder("Relu", op_registrations); + CreateActivationOpBuilder("PRelu", op_registrations); + CreateActivationOpBuilder("LeakyRelu", op_registrations); + // Unary ops - CreateUnaryOpBuilder("Sqrt", op_registrations); CreateUnaryOpBuilder("Reciprocal", op_registrations); + CreateUnaryOpBuilder("Sqrt", op_registrations); // Binary elementwise ops CreateBinaryOpBuilder("Add", op_registrations); + CreateBinaryOpBuilder("Div", op_registrations); CreateBinaryOpBuilder("Mul", op_registrations); CreateBinaryOpBuilder("Pow", op_registrations); CreateBinaryOpBuilder("Sub", op_registrations); - CreateBinaryOpBuilder("Div", op_registrations); - - // Activations - CreateActivationOpBuilder("Sigmoid", op_registrations); - CreateActivationOpBuilder("Tanh", op_registrations); - CreateActivationOpBuilder("Relu", op_registrations); - CreateActivationOpBuilder("PRelu", op_registrations); - CreateActivationOpBuilder("LeakyRelu", op_registrations); // Pooling ops + CreatePoolOpBuilder("AveragePool", op_registrations); CreatePoolOpBuilder("GlobalAveragePool", op_registrations); CreatePoolOpBuilder("GlobalMaxPool", op_registrations); - CreatePoolOpBuilder("AveragePool", op_registrations); CreatePoolOpBuilder("MaxPool", op_registrations); // Reduction ops @@ -54,6 +54,7 @@ static OpBuilderRegistrations CreateOpBuilderRegistrations() { CreateFlattenOpBuilder("Flatten", op_registrations); CreateGatherOpBuilder("Gather", op_registrations); CreateGemmOpBuilder("Gemm", op_registrations); + CreateGridSampleOpBuilder("GridSample", op_registrations); CreateLRNOpBuilder("LRN", op_registrations); CreateGemmOpBuilder("MatMul", op_registrations); CreatePadOpBuilder("Pad", op_registrations); @@ -66,8 +67,6 @@ static OpBuilderRegistrations CreateOpBuilderRegistrations() { CreateSqueezeOpBuilder("Squeeze", op_registrations); CreateTransposeOpBuilder("Transpose", op_registrations); - CreateGridSampleOpBuilder("GridSample", op_registrations); - return op_registrations; } diff --git a/onnxruntime/core/providers/coreml/coreml_execution_provider.cc b/onnxruntime/core/providers/coreml/coreml_execution_provider.cc index a92fef81ac395..f2cd4d01174d3 100644 --- a/onnxruntime/core/providers/coreml/coreml_execution_provider.cc +++ b/onnxruntime/core/providers/coreml/coreml_execution_provider.cc @@ -83,7 +83,9 @@ CoreMLExecutionProvider::GetCapability(const onnxruntime::GraphViewer& graph_vie }; result = utils::CreateSupportedPartitions(graph_viewer, supported_nodes, {}, - gen_metadef_name, COREML, kCoreMLExecutionProvider); + gen_metadef_name, COREML, kCoreMLExecutionProvider, + nullptr, + /*drop_constant_initializers*/ true); const auto num_of_partitions = result.size(); const auto num_of_supported_nodes = std::transform_reduce( diff --git a/onnxruntime/core/providers/coreml/DebugMLProgram.md b/onnxruntime/core/providers/coreml/mlprogram_test_scripts/DebugMLProgram.md similarity index 97% rename from onnxruntime/core/providers/coreml/DebugMLProgram.md rename to onnxruntime/core/providers/coreml/mlprogram_test_scripts/DebugMLProgram.md index e41a515594303..b7a54466ab8dd 100644 --- a/onnxruntime/core/providers/coreml/DebugMLProgram.md +++ b/onnxruntime/core/providers/coreml/mlprogram_test_scripts/DebugMLProgram.md @@ -25,6 +25,8 @@ https://apple.github.io/coremltools/docs-guides/source/model-intermediate-langua Usage is reasonably intuitive. The below example defines a model with 2 inputs and a matmul operator. 
The model is printed, and run with randomly generated inputs. The output from doing so is printed. +There are additional test scripts in this directory for different operators. + ```python import numpy as np import coremltools as ct diff --git a/onnxruntime/core/providers/coreml/mlprogram_test_scripts/concat_test.py b/onnxruntime/core/providers/coreml/mlprogram_test_scripts/concat_test.py new file mode 100644 index 0000000000000..430a2b3fa3ed0 --- /dev/null +++ b/onnxruntime/core/providers/coreml/mlprogram_test_scripts/concat_test.py @@ -0,0 +1,33 @@ +import coremltools as ct +import numpy as np +from coremltools.converters.mil import Builder as mb + +target = ct.target.iOS15 + +a_shape = (1, 1, 3, 3) + + +@mb.program( + input_specs=[mb.TensorSpec(shape=a_shape), mb.TensorSpec(shape=a_shape), mb.TensorSpec(shape=a_shape)], + opset_version=target, +) +def prog(x, y, z): + axis = mb.const(val=1) + interleave = mb.const(val=False) + z = mb.concat(values=(x, y, z), axis=axis, interleave=interleave) + return z + + +print(prog) + +# Convert to ML program +m = ct.convert(prog, minimum_deployment_target=target, compute_precision=ct.precision.FLOAT32) + +x = np.random.rand(*a_shape) +y = np.random.rand(*a_shape) +z = np.random.rand(*a_shape) + +# spec = m.get_spec() +# print(spec) + +print(m.predict({"x": x, "y": y, "z": z})) diff --git a/onnxruntime/core/providers/coreml/mlprogram_test_scripts/convtranspose_test.py b/onnxruntime/core/providers/coreml/mlprogram_test_scripts/convtranspose_test.py new file mode 100644 index 0000000000000..2c8cbc4948a6b --- /dev/null +++ b/onnxruntime/core/providers/coreml/mlprogram_test_scripts/convtranspose_test.py @@ -0,0 +1,42 @@ +import coremltools as ct +import numpy as np +from coremltools.converters.mil import Builder as mb + +target = ct.target.iOS15 + +x_shape = (1, 3, 4, 4) +w_shape = (3, 3, 3, 3) + + +@mb.program(input_specs=[mb.TensorSpec(shape=x_shape)], opset_version=target) +def prog(x): + weight = mb.const(name="weight", val=np.ones(w_shape, dtype=np.float32)) + output_shape = mb.const(name="output_shape", val=np.array([1, 3, 4, 4])) + # pad = mb.const(val=np.zeros((4), dtype=np.int32)) + strides = mb.const(name="strides", val=np.ones((2), dtype=np.int32)) + dilations = mb.const(name="dilations", val=np.ones((2), dtype=np.int32)) + z = mb.conv_transpose( + x=x, weight=weight, strides=strides, dilations=dilations, output_shape=output_shape + ) # , pad=pad + + return z + + +print(prog) + +# Convert to ML program +m = ct.convert(prog, minimum_deployment_target=target, compute_precision=ct.precision.FLOAT32) + +# spec = m.get_spec() +# print(spec) + +m.save("ConvTranspose.mlpackage") +# construct MLModel with compute_units=ComputeUnit.CPU and run predict +m_cpu = ct.models.MLModel("ConvTranspose.mlpackage", compute_units=ct.ComputeUnit.CPU_ONLY) +m_all = ct.models.MLModel("ConvTranspose.mlpackage", compute_units=ct.ComputeUnit.ALL) + +x = np.ones(x_shape, dtype=np.float32) +print("CPU_ONLY") +print(m_cpu.predict({"x": x})) +print("ALL") +print(m_all.predict({"x": x})) diff --git a/onnxruntime/core/providers/coreml/mlprogram_test_scripts/depthtospace_test.py b/onnxruntime/core/providers/coreml/mlprogram_test_scripts/depthtospace_test.py new file mode 100644 index 0000000000000..593d9e8bbf66a --- /dev/null +++ b/onnxruntime/core/providers/coreml/mlprogram_test_scripts/depthtospace_test.py @@ -0,0 +1,51 @@ +import coremltools as ct +import numpy as np +from coremltools.converters.mil import Builder as mb + +target = ct.target.iOS15 + +# replicate example 
from https://github.com/onnx/onnx/blob/main/docs/Operators.md#depthtospace +# to prove CoreML mode is DCR +x_shape = (1, 8, 2, 3) + + +@mb.program(input_specs=[mb.TensorSpec(shape=x_shape)], opset_version=target) +def prog(x): + block_size = mb.const(name="block_size", val=2) + z = mb.depth_to_space(x=x, block_size=block_size) + return z + + +print(prog) + +# Convert to ML program +m = ct.convert(prog, minimum_deployment_target=target, compute_precision=ct.precision.FLOAT32) + +# spec = m.get_spec() +# print(spec) + +m.save("DepthToSpace.mlpackage") + +# also check for differences between CPU_ONLY and ALL +m_cpu = ct.models.MLModel("DepthToSpace.mlpackage", compute_units=ct.ComputeUnit.CPU_ONLY) +m_all = ct.models.MLModel("DepthToSpace.mlpackage", compute_units=ct.ComputeUnit.ALL) + +x = np.array( + [ + [ + [[0.0, 1.0, 2.0], [3.0, 4.0, 5.0]], + [[9.0, 10.0, 11.0], [12.0, 13.0, 14.0]], + [[18.0, 19.0, 20.0], [21.0, 22.0, 23.0]], + [[27.0, 28.0, 29.0], [30.0, 31.0, 32.0]], + [[36.0, 37.0, 38.0], [39.0, 40.0, 41.0]], + [[45.0, 46.0, 47.0], [48.0, 49.0, 50.0]], + [[54.0, 55.0, 56.0], [57.0, 58.0, 59.0]], + [[63.0, 64.0, 65.0], [66.0, 67.0, 68.0]], + ] + ] +).astype(np.float32) + +print("CPU_ONLY") +print(m_cpu.predict({"x": x})) +print("ALL") +print(m_all.predict({"x": x})) diff --git a/onnxruntime/core/providers/coreml/mlprogram_test_scripts/div_test.py b/onnxruntime/core/providers/coreml/mlprogram_test_scripts/div_test.py new file mode 100644 index 0000000000000..a0423511598ff --- /dev/null +++ b/onnxruntime/core/providers/coreml/mlprogram_test_scripts/div_test.py @@ -0,0 +1,103 @@ +import coremltools as ct +import numpy as np +from coremltools.converters.mil import Builder as mb +from coremltools.models import datatypes +from coremltools.models.neural_network import NeuralNetworkBuilder +from coremltools.models.utils import save_spec + +input_dim = (1,) +output_dim = (1,) + + +def mlprogram(): + target = ct.target.iOS15 + + @mb.program(input_specs=[mb.TensorSpec(shape=input_dim), mb.TensorSpec(shape=input_dim)], opset_version=target) + def prog(x, y): + return mb.real_div(x=x, y=y) + + # print(prog) + + # Convert to ML program + m = ct.convert(prog, minimum_deployment_target=target) + + x = np.array([2], dtype=np.float32) + y = np.array([2047], dtype=np.float32) + + # spec = m.get_spec() + # print(spec) + + print(m.predict({"x": x, "y": y})) + + +# implement Div with coremltools approach of x * (1/y) +def nn(): + input_features = [("x", datatypes.Array(*input_dim)), ("y_inv", datatypes.Array(*input_dim))] + output_features = [("final", datatypes.Array(*output_dim))] + + # Build a simple neural network with 1 inner product layer + builder = NeuralNetworkBuilder(input_features, output_features) + builder.add_elementwise( + name="x_multiply_inverse_of_y", + input_names=["x", "y_inv"], + output_name="final", + mode="MULTIPLY", + ) + + save_spec(builder.spec, "network.mlmodel") + m = ct.models.MLModel("network.mlmodel") + + x = np.array([2], dtype=np.float32) + y = np.array([1 / 2047], dtype=np.float32) + print(m.predict({"x": x, "y_inv": y})) + + +def nn_scale(): + input_features = [ + ("x", datatypes.Array(*input_dim)), + ("y_inv", datatypes.Array(*input_dim)), + ("z", datatypes.Array(*input_dim)), + ] + output_features = [("final", datatypes.Array(*output_dim))] + + builder = NeuralNetworkBuilder(input_features, output_features) + + builder.add_elementwise( + name="div_implemented_as_x_multiply_inverse_of_y", + input_names=["x", "y_inv"], + output_name="div_result", + mode="MULTIPLY", + ) + + 
builder.add_elementwise( + name="apply_scaling_factor", + input_names=["div_result", "z"], + output_name="final", + mode="MULTIPLY", + ) + + from coremltools.models.utils import save_spec + + save_spec(builder.spec, "network.mlmodel") + m = ct.models.MLModel("network.mlmodel") + + a = 2 + b = 2047 + # scaling factor to test working around coremltools inaccuracy. + # weirdly even a scaling factor of 1 fixes the problem from https://github.com/microsoft/onnxruntime/issues/21170 + c = 1000 + + x = np.array([a], dtype=np.float32) + y = np.array([1 / b / c], dtype=np.float32) + z = np.array([c], dtype=np.float32) + print(m.predict({"x": x, "y_inv": y, "z": z})) + + +print("NN") +nn() + +print("\nNN with scaling") +nn_scale() + +print("\nML Program") +mlprogram() diff --git a/onnxruntime/core/providers/coreml/dump_mlprogram_model.py b/onnxruntime/core/providers/coreml/mlprogram_test_scripts/dump_mlprogram_model.py similarity index 100% rename from onnxruntime/core/providers/coreml/dump_mlprogram_model.py rename to onnxruntime/core/providers/coreml/mlprogram_test_scripts/dump_mlprogram_model.py diff --git a/onnxruntime/core/providers/coreml/mlprogram_test_scripts/gridsample_test.py b/onnxruntime/core/providers/coreml/mlprogram_test_scripts/gridsample_test.py new file mode 100644 index 0000000000000..5ce79c204c00c --- /dev/null +++ b/onnxruntime/core/providers/coreml/mlprogram_test_scripts/gridsample_test.py @@ -0,0 +1,114 @@ +import coremltools as ct +import numpy as np +from coremltools.converters.mil import Builder as mb + +target = ct.target.iOS15 + +x_shape = (2, 2, 3, 2) +grid_shape = (2, 3, 2, 2) + + +@mb.program(input_specs=[mb.TensorSpec(shape=x_shape), mb.TensorSpec(shape=grid_shape)], opset_version=target) +def prog(x, grid): + sampling = mb.const(name="sampling_mode", val="bilinear") + padding_mode = mb.const(name="pmode", val="reflection") + pad = mb.const(name="pval", val=np.float32(0)) + coord_mode = mb.const(name="coord_mode", val="normalized_minus_one_to_one") + align_corners = mb.const(name="align_corners", val=False) + z = mb.resample( + x=x, + coordinates=grid, + sampling_mode=sampling, + padding_mode=padding_mode, + padding_value=pad, + coordinates_mode=coord_mode, + align_corners=align_corners, + ) + + return z + + +# print(prog) + +# Convert to ML program +m = ct.convert(prog, minimum_deployment_target=target, compute_precision=ct.precision.FLOAT32) + +# spec = m.get_spec() +# print(spec) + +m.save("GridSample.mlpackage") +# construct MLModel with compute_units=ComputeUnit.CPU and run predict +m_cpu = ct.models.MLModel("GridSample.mlpackage", compute_units=ct.ComputeUnit.CPU_ONLY) +m_all = ct.models.MLModel("GridSample.mlpackage", compute_units=ct.ComputeUnit.ALL) + +# GridSampleTest.test_grid_sample_20_4D_bilinear_reflection_no_align_corners +# ORT produces different output for this test. 
ORT output is generated by pytorch +x = ( + np.array( + [ + -0.173652, + -1.513725, + -0.704586, + -1.952375, + -0.699404, + -0.806298, + 1.640852, + -0.138969, + -0.695411, + -1.352111, + 0.568797, + -0.564294, + -0.056468, + 0.641604, + -0.438370, + 0.450167, + -1.091401, + 1.669729, + -0.908544, + 0.244467, + 0.172109, + 1.156741, + -0.617128, + 1.155460, + ] + ) + .astype(np.float32) + .reshape(x_shape) +) + +grid = ( + np.array( + [ + 0.252250, + -0.151452, + 0.824706, + -0.588292, + -0.591147, + -0.155082, + -0.732938, + 0.457493, + -0.439559, + 0.492330, + 0.696447, + 0.700722, + -0.220298, + 0.654884, + -0.635434, + -1.195619, + -0.114204, + -0.870080, + -0.929674, + 0.305035, + 1.025429, + -0.472240, + -0.067881, + -0.869393, + ] + ) + .astype(np.float32) + .reshape(grid_shape) +) + + +print(m_cpu.predict({"x": x, "grid": grid})) +print(m_all.predict({"x": x, "grid": grid})) diff --git a/onnxruntime/core/providers/coreml/mlprogram_test_scripts/resize_test.py b/onnxruntime/core/providers/coreml/mlprogram_test_scripts/resize_test.py new file mode 100644 index 0000000000000..f83dc6ddfe02f --- /dev/null +++ b/onnxruntime/core/providers/coreml/mlprogram_test_scripts/resize_test.py @@ -0,0 +1,51 @@ +import coremltools as ct +import numpy as np +from coremltools.converters.mil import Builder as mb + +target = ct.target.iOS15 + +x_shape = (1, 1, 3, 6) + +use_scale = False # set this to test upsample vs resize + + +@mb.program(input_specs=[mb.TensorSpec(shape=x_shape)], opset_version=target) +def prog(x): + global use_scale # noqa + + if use_scale: + align = mb.const(val=False) + scale_h = mb.const(val=float(1 / 3)) + scale_w = mb.const(val=float(1 / 3)) + z = mb.upsample_bilinear(x=x, scale_factor_height=scale_h, scale_factor_width=scale_w, align_corners=align) + else: + size_h = mb.const(val=1) + size_w = mb.const(val=2) + sampling_mode = mb.const(val="UNALIGN_CORNERS") + z = mb.resize_bilinear(x=x, target_size_height=size_h, target_size_width=size_w, sampling_mode=sampling_mode) + + return z + + +print(prog) + +# Convert to ML program +m = ct.convert(prog, minimum_deployment_target=target, compute_precision=ct.precision.FLOAT32) + +x = np.array( + [ + [ + [ + [1, 2, 3, 4, 5, 6], + [7, 8, 9, 10, 11, 12], + [13, 14, 15, 16, 17, 18], + ] + ] + ], + dtype=np.float32, +) + +# spec = m.get_spec() +# print(spec) + +print(m.predict({"x": x})) diff --git a/onnxruntime/core/providers/partitioning_utils.cc b/onnxruntime/core/providers/partitioning_utils.cc index c45f5cd0848dd..83c08f3dbd25e 100644 --- a/onnxruntime/core/providers/partitioning_utils.cc +++ b/onnxruntime/core/providers/partitioning_utils.cc @@ -88,8 +88,6 @@ It is required to ensure we do not break up a QDQ node unit during partitioning. @param graph_viewer GraphViewer that IExecutionProvider::GetCapability is called with. @param is_node_supported_fn Callback to check whether a node is supported. @param on_group_closed_fn Callback to indicate a completed partition node group. -@param debug_output Print diagnostic output about the partitions and reasons for partition breaks. - No-op in a release build. @return The partition node groups. 
*/ std::vector> CreateSupportedPartitionNodeGroups( @@ -97,12 +95,7 @@ std::vector> CreateSupportedPartitionNodeGroups( const IsNodeSupportedFn& is_node_supported_fn, const OnGroupClosedFn& on_group_closed_fn, const std::string& execution_provider_type, - const std::unordered_map* node_unit_map, - bool debug_output) { -#ifdef NDEBUG - ORT_UNUSED_PARAMETER(debug_output); -#endif - + const std::unordered_map* node_unit_map) { ORT_ENFORCE(is_node_supported_fn, "Node support test is required."); /* @@ -146,12 +139,10 @@ std::vector> CreateSupportedPartitionNodeGroups( auto close_group = [&]() { if (!supported_group.empty()) { #ifndef NDEBUG - if (debug_output) { - LOGS_DEFAULT(VERBOSE) << "New partition node group.\n" - << "Unsupported nodes on group border: " - << NodeGroupDebugString(nodes_to_process_with_next_group, true) << "\n" - << "Nodes in group: " << NodeGroupDebugString(supported_group); - } + LOGS_DEFAULT(VERBOSE) << "New partition node group.\n" + << "Unsupported nodes on group border: " + << NodeGroupDebugString(nodes_to_process_with_next_group, true) << "\n" + << "Nodes in group: " << NodeGroupDebugString(supported_group); #endif // if no on_group_closed_fn callback was given, keep the partition @@ -163,7 +154,7 @@ std::vector> CreateSupportedPartitionNodeGroups( } #ifndef NDEBUG else { - LOGS_DEFAULT_IF(debug_output, VERBOSE) << "Discarded partition node group."; + LOGS_DEFAULT(VERBOSE) << "Discarded partition node group."; } #endif @@ -291,7 +282,8 @@ InlinedHashSet CreateExcludedNodeSet(const GraphViewer& graph_viewe std::unique_ptr MakeComputeCapability(const GraphViewer& graph_viewer, const std::vector& group, const GenerateMetadefNameFn& generate_metadef_name, - const std::string& execution_provider_name) { + const std::string& execution_provider_name, + bool drop_constant_initializers) { std::unordered_set node_set; node_set.reserve(group.size()); node_set.insert(group.cbegin(), group.cend()); @@ -354,6 +346,10 @@ std::unique_ptr MakeComputeCapability(const GraphViewer& grap meta_def->status = ONNX_NAMESPACE::EXPERIMENTAL; for (const auto& input : ordered_subgraph_inputs) { + if (drop_constant_initializers && graph_viewer.IsConstantInitializer(input->Name(), true)) { + continue; + } + meta_def->inputs.push_back(input->Name()); } @@ -374,13 +370,12 @@ CreateSupportedPartitions(const GraphViewer& graph_viewer, const std::string& execution_provider_name, const std::string& execution_provider_type, const std::unordered_map* node_unit_map, - bool debug_output) { + bool drop_constant_initializers) { const auto groups = CreateSupportedPartitionNodeGroups(graph_viewer, is_node_supported_fn, on_partition_closed_fn, execution_provider_type, - node_unit_map, - debug_output); + node_unit_map); std::vector> partitions{}; partitions.reserve(groups.size()); @@ -390,7 +385,7 @@ CreateSupportedPartitions(const GraphViewer& graph_viewer, std::back_inserter(partitions), [&](const auto& supported_partition) { return MakeComputeCapability(graph_viewer, supported_partition, generate_metadef_name_fn, - execution_provider_name); + execution_provider_name, drop_constant_initializers); }); return partitions; @@ -404,7 +399,7 @@ CreateSupportedPartitions(const GraphViewer& graph_viewer, const std::string& execution_provider_name, const std::string& execution_provider_type, const std::unordered_map* node_unit_map, - bool debug_output) { + bool drop_constant_initializers) { const auto excluded_nodes = CreateExcludedNodeSet(graph_viewer, stop_ops); const bool check_excluded_nodes = 
!excluded_nodes.empty(); @@ -419,7 +414,7 @@ CreateSupportedPartitions(const GraphViewer& graph_viewer, execution_provider_name, execution_provider_type, node_unit_map, - debug_output); + drop_constant_initializers); } } // namespace utils diff --git a/onnxruntime/core/providers/partitioning_utils.h b/onnxruntime/core/providers/partitioning_utils.h index c3f6b104e3f6a..235a88cfdb8a5 100644 --- a/onnxruntime/core/providers/partitioning_utils.h +++ b/onnxruntime/core/providers/partitioning_utils.h @@ -62,9 +62,10 @@ Create the supported partitions for the execution provider. @param execution_provider_type ExecutionProviderType of the EP creating this ComputeCapability instance. @param node_unit_map Map of each Node in the graph_viewer to its NodeUnit. Provide if EP handles QDQ format models. Should be created by EP calling GetAllNodeUnits. -@param debug_output Print diagnostic output about the partitions and reasons for partition breaks. - No-op in a release build. - +@param drop_constant_initializer Drop constant initializers from input to a ComputeCapability. + Set to true if constant initializers have been copied into a compiled model to allow + ORT to free the initializer. If the initializer remains as an input it will appear to + still be in-use. @returns ComputeCapability instances for all partitions assigned to the execution provider. */ std::vector> @@ -74,8 +75,8 @@ CreateSupportedPartitions(const GraphViewer& graph_viewer, const GenerateMetadefNameFn& generate_metadef_name_fn, const std::string& execution_provider_name, const std::string& execution_provider_type, - const std::unordered_map* node_unit_map = nullptr, - bool debug_output = false); + const std::unordered_map* node_unit_map, + bool drop_constant_initializers = false); /** Create the supported partitions for the execution provider. @@ -88,9 +89,10 @@ Create the supported partitions for the execution provider. @param execution_provider_type ExecutionProviderType of the EP creating this ComputeCapability instance. @param node_unit_map Map of each Node in the graph_viewer to its NodeUnit. Provide if EP handles QDQ format models. Should be created by EP calling GetAllNodeUnits. -@param debug_output Print diagnostic output about the partitions and reasons for partition breaks. - No-op in a release build. - +@param drop_constant_initializer Drop constant initializers from input to a ComputeCapability. + Set to true if constant initializers have been copied into a compiled model to allow + ORT to free the initializer. If the initializer remains as an input it will appear to + still be in-use. @returns ComputeCapability instances for all partitions assigned to the execution provider. */ std::vector> @@ -100,8 +102,8 @@ CreateSupportedPartitions(const GraphViewer& graph_viewer, const GenerateMetadefNameFn& generate_metadef_name, const std::string& execution_provider_name, const std::string& execution_provider_type, - const std::unordered_map* node_unit_map = nullptr, - bool debug_output = false); + const std::unordered_map* node_unit_map, + bool drop_constant_initializers = false); /** Create a ComputeCapability instance from the group of nodes. @@ -120,7 +122,8 @@ Will automatically determine the inputs and outputs required. 
std::unique_ptr MakeComputeCapability(const GraphViewer& graph_viewer, const std::vector& group, const GenerateMetadefNameFn& generate_metadef_name, - const std::string& execution_provider_name); + const std::string& execution_provider_name, + bool drop_constant_initializers); /** Create the set of nodes to exclude based on a set of stop ops. diff --git a/onnxruntime/core/providers/qnn/qnn_execution_provider.cc b/onnxruntime/core/providers/qnn/qnn_execution_provider.cc index 0ddaa97694217..539b456cb657f 100644 --- a/onnxruntime/core/providers/qnn/qnn_execution_provider.cc +++ b/onnxruntime/core/providers/qnn/qnn_execution_provider.cc @@ -565,7 +565,8 @@ static void PartitionCtxModel(const onnxruntime::GraphViewer& graph_viewer, supported_groups.begin(), supported_groups.end(), std::back_inserter(result), [&](const auto& supported_partition) { - return utils::MakeComputeCapability(graph_viewer, supported_partition, gen_metadef_name, QNN); + return utils::MakeComputeCapability(graph_viewer, supported_partition, gen_metadef_name, QNN, + /*drop_constant_initializers*/ false); // TODO: could this be set to true? }); const size_t num_of_partitions = result.size(); @@ -660,7 +661,7 @@ QNNExecutionProvider::GetCapability(const onnxruntime::GraphViewer& graph_viewer // Create partitions from supported nodes. std::vector> partitions = utils::CreateSupportedPartitions( - graph_viewer, supported_nodes, {}, gen_metadef_name, QNN, kQnnExecutionProvider, &node_unit_map, true); + graph_viewer, supported_nodes, {}, gen_metadef_name, QNN, kQnnExecutionProvider, &node_unit_map); // Filter out partitions that consist of a single QuantizeLinear or DequantizeLinear node. // We also count the number of supported nodes in all valid partitions. diff --git a/onnxruntime/core/session/inference_session.cc b/onnxruntime/core/session/inference_session.cc index cc3a9943ca0a3..5ad2f08467792 100644 --- a/onnxruntime/core/session/inference_session.cc +++ b/onnxruntime/core/session/inference_session.cc @@ -1603,6 +1603,11 @@ Status PartitionOrtFormatModel(onnxruntime::Graph& graph, logger, GraphPartitioner::Mode::kOrtFormatLoad)); +#if !defined(ORT_MINIMAL_BUILD) || defined(ORT_EXTENDED_MINIMAL_BUILD) + // a compiling EP (e.g. CoreML) may copy initializers to its own memory. run the cleanup of unused initializers + // so that they can be freed. 
+ ORT_RETURN_IF_ERROR(graph.RemovedUnusedInitializersOrtFormat()); +#endif return Status::OK(); } diff --git a/onnxruntime/python/onnxruntime_pybind_state.cc b/onnxruntime/python/onnxruntime_pybind_state.cc index 679ccce7fb07a..ffcd339c0ca3a 100644 --- a/onnxruntime/python/onnxruntime_pybind_state.cc +++ b/onnxruntime/python/onnxruntime_pybind_state.cc @@ -40,6 +40,10 @@ #include // for CUDNN_MAJOR #endif +#if defined(USE_COREML) +#include "core/providers/coreml/coreml_provider_factory.h" +#endif + #include // Explicitly provide a definition for the static const var 'GPU' in the OrtDevice struct, @@ -1161,7 +1165,30 @@ std::unique_ptr CreateExecutionProviderInstance( #if !defined(__APPLE__) LOGS_DEFAULT(WARNING) << "CoreML execution provider can only be used to generate ORT format model in this build."; #endif - return onnxruntime::CoreMLProviderFactoryCreator::Create(0)->CreateProvider(); + uint32_t coreml_flags = 0; + + const auto it = provider_options_map.find(type); + if (it != provider_options_map.end()) { + const ProviderOptions& options = it->second; + auto flags = options.find("flags"); + if (flags != options.end()) { + const auto& flags_str = flags->second; + + if (flags_str.find("COREML_FLAG_USE_CPU_ONLY") != std::string::npos) { + coreml_flags |= COREMLFlags::COREML_FLAG_USE_CPU_ONLY; + } + + if (flags_str.find("COREML_FLAG_ONLY_ALLOW_STATIC_INPUT_SHAPES") != std::string::npos) { + coreml_flags |= COREMLFlags::COREML_FLAG_ONLY_ALLOW_STATIC_INPUT_SHAPES; + } + + if (flags_str.find("COREML_FLAG_CREATE_MLPROGRAM") != std::string::npos) { + coreml_flags |= COREMLFlags::COREML_FLAG_CREATE_MLPROGRAM; + } + } + } + + return onnxruntime::CoreMLProviderFactoryCreator::Create(coreml_flags)->CreateProvider(); #endif } else if (type == kXnnpackExecutionProvider) { #if defined(USE_XNNPACK) diff --git a/onnxruntime/test/optimizer/qdq_transformer_test.cc b/onnxruntime/test/optimizer/qdq_transformer_test.cc index fb85eb4c29bb6..367b4a65e3b7b 100644 --- a/onnxruntime/test/optimizer/qdq_transformer_test.cc +++ b/onnxruntime/test/optimizer/qdq_transformer_test.cc @@ -3693,7 +3693,8 @@ TEST(QDQTransformerTests, QDQ_Selector_Test) { const auto compute_capability = utils::MakeComputeCapability( whole_graph_viewer, nodes, []() { return "sub_graph"; }, - "Test Provider"); + "Test Provider", + /*drop_constant_initializers*/ false); const GraphViewer partial_graph_viewer(graph, *compute_capability->sub_graph); ASSERT_EQ(3, partial_graph_viewer.NumberOfNodes()); diff --git a/onnxruntime/test/providers/cpu/tensor/space_depth_ops_test.cc b/onnxruntime/test/providers/cpu/tensor/space_depth_ops_test.cc index 5222380d9ca56..a0c1d675f506f 100644 --- a/onnxruntime/test/providers/cpu/tensor/space_depth_ops_test.cc +++ b/onnxruntime/test/providers/cpu/tensor/space_depth_ops_test.cc @@ -373,5 +373,36 @@ TEST(TensorOpTest, DepthToSpaceTest_5) { test.Run(); } +TEST(TensorOpTest, DepthToSpaceTest_CRD_Batched) { + OpTester test("DepthToSpace", 11); // create an opset 11 model with attribute present = "CRD" mode + constexpr int64_t blocksize = 2; + test.AddAttribute("blocksize", blocksize); + test.AddAttribute("mode", "CRD"); + + constexpr int64_t N = 2, C = 4, H = 2, W = 3; + std::vector X = {0., 1., 2., + 3., 4., 5., + 9., 10., 11., + 12., 13., 14., + 18., 19., 20., + 21., 22., 23., + 27., 28., 29., + 30., 31., 32.}; + + // append same data but in reverse order so we can tell if the batch output is wrong + X.insert(X.end(), X.rbegin(), X.rend()); + + test.AddInput("input", {N, C, H, W}, X); + + std::vector result = 
{0., 9., 1., 10., 2., 11., + 18., 27., 19., 28., 20., 29., + 3., 12., 4., 13., 5., 14., + 21., 30., 22., 31., 23., 32.}; + result.insert(result.end(), result.rbegin(), result.rend()); + + test.AddOutput("output", {2, 1, 4, 6}, result); + test.Run(); +} + } // namespace test } // namespace onnxruntime diff --git a/tools/ci_build/github/apple/coreml_supported_mlprogram_ops.md b/tools/ci_build/github/apple/coreml_supported_mlprogram_ops.md index 5609033fc3e35..d2a961f17bd6a 100644 --- a/tools/ci_build/github/apple/coreml_supported_mlprogram_ops.md +++ b/tools/ci_build/github/apple/coreml_supported_mlprogram_ops.md @@ -6,13 +6,16 @@ Keep in sync with doco generated from /docs/execution-providers/CoreML-Execution |ai.onnx:Add|| |ai.onnx:AveragePool|Only 2D Pool is supported currently. 3D and 5D support can be added if needed.| |ai.onnx:Clip|| +|ai.onnx:Concat|| |ai.onnx:Conv|Only 1D/2D Conv is supported.
Bias if provided must be constant.|
+|ai.onnx:ConvTranspose|Weight and bias must be constant.<br/>padding_type of SAME_UPPER/SAME_LOWER is not supported.<br/>kernel_shape must have default values.<br/>output_shape is not supported.<br/>output_padding must have default values.|
+|ai.onnx.DepthToSpace|If 'mode' is 'CRD' the input must have a fixed shape.|
 |ai.onnx:Div||
 |ai.onnx:Gemm|Input B must be constant.|
 |ai.onnx:GlobalAveragePool|Only 2D Pool is supported currently. 3D and 5D support can be added if needed.|
 |ai.onnx:GlobalMaxPool|Only 2D Pool is supported currently. 3D and 5D support can be added if needed.|
+|ai.onnx:GridSample|4D input.<br/>'mode' of 'linear' or 'zeros'.<br/>
(mode==linear && padding_mode==reflection && align_corners==0) is not supported.| +|ai.onnx.LeakyRelu|| |ai.onnx:MatMul|Only support for transA == 0, alpha == 1.0 and beta == 1.0 is currently implemented.| |ai.onnx:MaxPool|Only 2D Pool is supported currently. 3D and 5D support can be added if needed.| |ai.onnx:Mul|| @@ -24,4 +27,4 @@ Keep in sync with doco generated from /docs/execution-providers/CoreML-Execution |ai.onnx:Sub|| |ai.onnx:Sigmoid|| |ai:onnx:Tanh|| -|ai:onnx:Transpose|| +|ai.onnx:Transpose|| From c464ab3acabfd276ca545db7eb364316e0158067 Mon Sep 17 00:00:00 2001 From: Justin Chu Date: Thu, 25 Jul 2024 15:57:30 -0700 Subject: [PATCH 23/57] Allow cpplint to always be green (#21491) Allow cpplint to always be green since it is optional. Also changed the workflow name to reflect that. --- .github/workflows/lint.yml | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml index 3965fe063b148..2edbe2d814533 100644 --- a/.github/workflows/lint.yml +++ b/.github/workflows/lint.yml @@ -73,7 +73,7 @@ jobs: checkout_path: ${{ github.workspace }} lint-cpp: - name: Lint C++ + name: Optional Lint C++ runs-on: ubuntu-latest steps: - uses: actions/checkout@master @@ -89,10 +89,11 @@ jobs: - name: Generate ONNX protobuf files run: cmake --build build/Debug --config Debug --target onnx_proto - uses: reviewdog/action-cpplint@master + continue-on-error: true with: github_token: ${{ secrets.github_token }} reporter: github-pr-check - level: warning + level: info flags: --linelength=120 --exclude=java/src/main/native/*.c --exclude=onnxruntime/core/mlas/inc/* From e5302b23c43b690592a818da95b4a31059e59e9e Mon Sep 17 00:00:00 2001 From: Scott McKay Date: Fri, 26 Jul 2024 10:00:28 +1000 Subject: [PATCH 24/57] Fix SkipLayerNormFusion incorrectly setting modified every time it runs (#21502) ### Description Current behavior forces all L2 optimizers to loop until they hit the max number of iterations. Only update modified if the graph was modified. ### Motivation and Context Fix unnecessary loops of L2 optimizers during model loading. --- onnxruntime/core/optimizer/skip_layer_norm_fusion.cc | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/onnxruntime/core/optimizer/skip_layer_norm_fusion.cc b/onnxruntime/core/optimizer/skip_layer_norm_fusion.cc index cf70a7d821d72..655364357999a 100644 --- a/onnxruntime/core/optimizer/skip_layer_norm_fusion.cc +++ b/onnxruntime/core/optimizer/skip_layer_norm_fusion.cc @@ -168,7 +168,8 @@ Note: This fusion doesn't consider the following case: LayerNormalization */ -Status SkipLayerNormFusion::ApplyImpl(Graph& graph, bool& modified, int graph_level, const logging::Logger& logger) const { +Status SkipLayerNormFusion::ApplyImpl(Graph& graph, bool& modified, int graph_level, + const logging::Logger& logger) const { GraphViewer graph_viewer(graph); const auto& node_topology_list = graph_viewer.GetNodesInTopologicalOrder(); InlinedVector> nodes_to_remove; @@ -299,12 +300,15 @@ Status SkipLayerNormFusion::ApplyImpl(Graph& graph, bool& modified, int graph_le // Assign provider to this new node. Provider should be same as the provider for old node. 
skip_layer_norm_node.SetExecutionProviderType(ln_node.GetExecutionProviderType()); } + for (const auto& node : nodes_to_remove) { graph_utils::RemoveNodeOutputEdges(graph, node); graph.RemoveNode(node.get().Index()); } - modified = true; + if (!nodes_to_remove.empty()) { + modified = true; + } return Status::OK(); } From 166809425ed3179ca66ba18383ab4664cdc33cde Mon Sep 17 00:00:00 2001 From: aamajumder <150728138+aamajumder@users.noreply.github.com> Date: Thu, 25 Jul 2024 17:06:30 -0700 Subject: [PATCH 25/57] [DML EP] Register ReduceMin-20 (#20477) ### Description This PR registers the ReduceMin-20 operator to the DML EP. ### Motivation and Context --- docs/OperatorKernels.md | 3 ++- .../src/Operators/OperatorRegistration.cpp | 1 + .../providers/dml/OperatorAuthorHelper/OperatorVersions.h | 1 + .../test/testdata/onnx_backend_test_series_filters.jsonc | 5 ++++- 4 files changed, 8 insertions(+), 2 deletions(-) diff --git a/docs/OperatorKernels.md b/docs/OperatorKernels.md index ed944b5a6df79..211c53d0fecc8 100644 --- a/docs/OperatorKernels.md +++ b/docs/OperatorKernels.md @@ -1178,7 +1178,8 @@ Do not modify directly.* |||13+|**T** = tensor(float), tensor(float16)| |||11+|**T** = tensor(float), tensor(float16)| |||1+|**T** = tensor(float), tensor(float16)| -|ReduceMin|*in* data:**T**
*in* axes:**tensor(int64)**<br/> *out* reduced:**T**<br/><br/>or<br/><br/>*in* data:**T**<br/> *out* reduced:**T**|18+|**T** = tensor(float), tensor(float16), tensor(int32), tensor(int64), tensor(uint32), tensor(uint64)|
+|ReduceMin|*in* data:**T**<br/> *in* axes:**tensor(int64)**<br/> *out* reduced:**T**<br/><br/>or<br/><br/>*in* data:**T**<br/>
*out* reduced:**T**|20+|**T** = tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| +|||18+|**T** = tensor(float), tensor(float16), tensor(int32), tensor(int64), tensor(uint32), tensor(uint64)| |||13+|**T** = tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| |||12+|**T** = tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| |||11+|**T** = tensor(float), tensor(float16)| diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/OperatorRegistration.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/OperatorRegistration.cpp index 27605a6ad8e8c..cf8f0a4b2db83 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/OperatorRegistration.cpp +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/OperatorRegistration.cpp @@ -977,6 +977,7 @@ constexpr static OperatorRegistrationInformation operatorRegistrationInformation {REG_INFO( 12, ReduceMin, typeNameListDefault, supportedTypeListFloat16to32Ints8to64, DmlGraphSupport::Supported)}, {REG_INFO( 13, ReduceMin, typeNameListDefault, supportedTypeListFloat16to32Ints8to64, DmlGraphSupport::Supported)}, {REG_INFO( 18, ReduceMin, typeNameListDefault, supportedTypeListFloat16to32Ints32to64, DmlGraphSupport::Supported, requiredConstantCpuInputs(1))}, + {REG_INFO( 20, ReduceMin, typeNameListDefault, supportedTypeListAllScalars, DmlGraphSupport::Supported, requiredConstantCpuInputs(1))}, {REG_INFO( 7, ArgMax, typeNameListDefault, supportedTypeListArgMinMax, DmlGraphSupport::Supported)}, {REG_INFO( 11, ArgMax, typeNameListDefault, supportedTypeListArgMinMax, DmlGraphSupport::Supported)}, {REG_INFO( 12, ArgMax, typeNameListDefault, supportedTypeListArgMinMax, DmlGraphSupport::Supported)}, diff --git a/onnxruntime/core/providers/dml/OperatorAuthorHelper/OperatorVersions.h b/onnxruntime/core/providers/dml/OperatorAuthorHelper/OperatorVersions.h index cd188761b22f7..f45c2b08db94d 100644 --- a/onnxruntime/core/providers/dml/OperatorAuthorHelper/OperatorVersions.h +++ b/onnxruntime/core/providers/dml/OperatorAuthorHelper/OperatorVersions.h @@ -434,6 +434,7 @@ namespace OperatorHelper static const int sc_sinceVer_IsNaN = 20; static const int sc_sinceVer_IsInf = 20; static const int sc_sinceVer_ReduceMax = 20; + static const int sc_sinceVer_ReduceMin = 20; } namespace MsftOperatorSet1 diff --git a/onnxruntime/test/testdata/onnx_backend_test_series_filters.jsonc b/onnxruntime/test/testdata/onnx_backend_test_series_filters.jsonc index 1885a213bdf32..4b14d50127aa9 100644 --- a/onnxruntime/test/testdata/onnx_backend_test_series_filters.jsonc +++ b/onnxruntime/test/testdata/onnx_backend_test_series_filters.jsonc @@ -720,7 +720,10 @@ "^test_constantofshape_int_zeros", "^test_reduce_log_sum_empty_set_cpu", "^test_reduce_log_sum_exp_empty_set_cpu", - "^test_reduce_prod_empty_set_cpu" + "^test_reduce_prod_empty_set_cpu", + //Bug: DML EP does not execute operators with an empty input tensor + //TODO: Resolve as a graph implementation that returns a constant inf tensor with appropriate strides + "^test_reduce_min_empty_set_cpu" ], // ORT first supported opset 7, so models with nodes that require versions prior to opset 7 are not supported 
"tests_with_pre_opset7_dependencies": [ From b6b29309a529b28e94787edc359f75c9a2207486 Mon Sep 17 00:00:00 2001 From: Wanming Lin Date: Fri, 26 Jul 2024 08:07:01 +0800 Subject: [PATCH 26/57] [WebNN EP] Update argMax/argMin to adapt to latest spec (#21452) WebNN spec recently changes the definition of argMax/argMin: - Remove selectLastIndex option, let backends decide to select the last index or not. - Move axes option to axis input --- js/web/docs/webnn-operators.md | 4 ++-- .../builders/impl/argmax_min_op_builder.cc | 23 +++---------------- 2 files changed, 5 insertions(+), 22 deletions(-) diff --git a/js/web/docs/webnn-operators.md b/js/web/docs/webnn-operators.md index 8d077846fa6a4..75652899b5e5e 100644 --- a/js/web/docs/webnn-operators.md +++ b/js/web/docs/webnn-operators.md @@ -13,8 +13,8 @@ operators and the supported opset domain/versions in **WebNN EP** by ONNX Runtim |:------:|:------:|:------:|:-:|:-:|:------| | Abs | ai.onnx(7-12, 13+) | abs | ✓ | ✓ | | | Add | ai.onnx(7-12, 13, 14+) | add | ✓ | ✓ | | -| ArgMax | ai.onnx(7-10, 11, 12, 13+) | argMax | ✓ | ✓ | WebNN CPU backend only supports 'select_last_index' value is 0 | -| ArgMin | ai.onnx(7-10, 11, 12, 13+) | argMin | ✓ | ✓ | WebNN CPU backend only supports 'select_last_index' value is 0 | +| ArgMax | ai.onnx(7-10, 11, 12, 13+) | argMax | ✓ | ✓ | | +| ArgMin | ai.onnx(7-10, 11, 12, 13+) | argMin | ✓ | ✓ | | | AveragePool | ai.onnx(7-9, 10, 11, 12-18, 19+) | averagePool2d | ✓ | ✓ | Only supports 4-D input, 2-D 'kernel_shape', 'count_include_pad' value is 0 | | BatchNormalization | ai.onnx(7-8, 9-13, 14, 15+) | batchNormalization | ✓ | ✓ | Only supports 'training_mode' value is 0, one output | | Cast | ai.onnx(7-8, 9-12, 13-18, 19-20, 21+) | cast | ✓ | ✓ | WebNN CPU backend doesn't support casting to uint64 data type | diff --git a/onnxruntime/core/providers/webnn/builders/impl/argmax_min_op_builder.cc b/onnxruntime/core/providers/webnn/builders/impl/argmax_min_op_builder.cc index 1330a3e354871..1ae63a644a287 100644 --- a/onnxruntime/core/providers/webnn/builders/impl/argmax_min_op_builder.cc +++ b/onnxruntime/core/providers/webnn/builders/impl/argmax_min_op_builder.cc @@ -40,28 +40,20 @@ Status ArgMaxMinOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, NodeAttrHelper helper(node); int64_t axis = helper.Get("axis", 0); const auto keep_dims = helper.Get("keepdims", 1); - const auto select_last_index = helper.Get("select_last_index", 0); axis = HandleNegativeAxis(axis, input_rank); - emscripten::val axes = emscripten::val::array(); - axes.call("push", static_cast(axis)); emscripten::val options = emscripten::val::object(); - options.set("axes", axes); options.set("keepDimensions", keep_dims == 1); - options.set("selectLastIndex", select_last_index == 1); - // TODO: use WebNN's opSupportLimits API to check the backend's supported output data types. - // If the backend doesn't support int64 output, we should use default int32 output data type - // then do a type casting (int32 -> int64) for the output. Refer to the CoreML EP for how to - // support int64 output. + // TODO(Honry): check whether int64 output data type is supported by WebNN opSupportLimits() API. 
options.set("outputDataType", "int64"); emscripten::val output = emscripten::val::object(); const auto& op_type = node.OpType(); if (op_type == "ArgMax") { - output = model_builder.GetBuilder().call("argMax", input, options); + output = model_builder.GetBuilder().call("argMax", input, narrow(axis), options); } else if (op_type == "ArgMin") { - output = model_builder.GetBuilder().call("argMin", input, options); + output = model_builder.GetBuilder().call("argMin", input, narrow(axis), options); } else { return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "ArgMaxMinOpBuilder, unknown op: ", op_type); } @@ -81,15 +73,6 @@ bool ArgMaxMinOpBuilder::IsOpSupportedImpl(const InitializedTensorSet& /* initia if (!GetShape(*input_defs[0], input_shape, logger)) return false; - // WebNN CPU backend only supports select_last_index = 0. - if (device_type == WebnnDeviceType::CPU) { - NodeAttrHelper helper(node); - const auto select_last_index = helper.Get("select_last_index", 0); - if (select_last_index) { - LOGS(logger, VERBOSE) << "ArgMax/ArgMin with select_last_index = 1 is not supported on WebNN CPU backend."; - return false; - } - } return true; } From 278f0f5cd2ab18abb48f381fc3fee127ebe960d2 Mon Sep 17 00:00:00 2001 From: Prathik Rao Date: Fri, 26 Jul 2024 02:23:35 -0700 Subject: [PATCH 27/57] disables qnn in ort training cpu pipeline (#21510) ### Description `enable_windows_arm64_qnn` and `enable_windows_x64_qnn` are true by default but unnecessary for training. This change explicitly sets these parameters to false for training pipeline. ### Motivation and Context ORT 1.19 Release Preparation --- .../azure-pipelines/orttraining-py-packaging-pipeline-cpu.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tools/ci_build/github/azure-pipelines/orttraining-py-packaging-pipeline-cpu.yml b/tools/ci_build/github/azure-pipelines/orttraining-py-packaging-pipeline-cpu.yml index 5fa80bf7ff6d5..1fa88318b8c03 100644 --- a/tools/ci_build/github/azure-pipelines/orttraining-py-packaging-pipeline-cpu.yml +++ b/tools/ci_build/github/azure-pipelines/orttraining-py-packaging-pipeline-cpu.yml @@ -22,3 +22,5 @@ stages: enable_windows_gpu: false enable_mac_cpu: true enable_linux_arm: false + enable_windows_arm64_qnn: false + enable_windows_x64_qnn: false From bbbaef3fa60f1ef1920af5104459251b42390135 Mon Sep 17 00:00:00 2001 From: Justin Chu Date: Fri, 26 Jul 2024 08:46:54 -0700 Subject: [PATCH 28/57] Update text formatting in generate_cgmanifest.py (#21489) The only place where I manually fixed I forgot a format string --- cgmanifests/generate_cgmanifest.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cgmanifests/generate_cgmanifest.py b/cgmanifests/generate_cgmanifest.py index 52bd3f58645f2..b2e8f6816a2ef 100644 --- a/cgmanifests/generate_cgmanifest.py +++ b/cgmanifests/generate_cgmanifest.py @@ -73,7 +73,7 @@ def add_github_dep(name, parsed_url): return # Make a REST call to convert to tag to a git commit url = f"https://api.github.com/repos/{org_name}/{repo_name}/git/refs/tags/{tag}" - print("requesting {url} ...") + print(f"requesting {url} ...") res = requests.get(url, auth=(args.username, args.token)) response_json = res.json() tag_object = response_json["object"] From 7db7c4e5c80eeecd75dd66a9fa691ac32c3a8a98 Mon Sep 17 00:00:00 2001 From: Jian Chen Date: Fri, 26 Jul 2024 14:54:45 -0700 Subject: [PATCH 29/57] Separating all GPU stages into different Pipelines (#21521) ### Description Separating all GPU stages into different Pipelines --- .../win-gpu-cuda-ci-pipeline.yml | 64 
+++++++++++++++++++ .../win-gpu-dml-ci-pipeline.yml | 52 +++++++++++++++ .../win-gpu-doc-gen-ci-pipeline.yml | 61 ++++++++++++++++++ .../win-gpu-training-ci-pipeline.yml | 63 ++++++++++++++++++ tools/ci_build/set-trigger-rules.py | 5 +- 5 files changed, 244 insertions(+), 1 deletion(-) create mode 100644 tools/ci_build/github/azure-pipelines/win-gpu-cuda-ci-pipeline.yml create mode 100644 tools/ci_build/github/azure-pipelines/win-gpu-dml-ci-pipeline.yml create mode 100644 tools/ci_build/github/azure-pipelines/win-gpu-doc-gen-ci-pipeline.yml create mode 100644 tools/ci_build/github/azure-pipelines/win-gpu-training-ci-pipeline.yml diff --git a/tools/ci_build/github/azure-pipelines/win-gpu-cuda-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/win-gpu-cuda-ci-pipeline.yml new file mode 100644 index 0000000000000..78e1624b5d123 --- /dev/null +++ b/tools/ci_build/github/azure-pipelines/win-gpu-cuda-ci-pipeline.yml @@ -0,0 +1,64 @@ +##### start trigger Don't edit it manually, Please do edit set-trigger-rules.py #### +trigger: + branches: + include: + - main + - rel-* + paths: + exclude: + - docs/** + - README.md + - CONTRIBUTING.md + - BUILD.md + - 'js/web' + - 'onnxruntime/core/providers/js' +pr: + branches: + include: + - main + - rel-* + paths: + exclude: + - docs/** + - README.md + - CONTRIBUTING.md + - BUILD.md + - 'js/web' + - 'onnxruntime/core/providers/js' +#### end trigger #### + +parameters: +- name: CudaVersion + displayName: CUDA version + type: string + default: '12.2' + values: + - 11.8 + - 12.2 +- name: RunOnnxRuntimeTests + displayName: Run Tests? + type: boolean + default: true + +stages: +- stage: cuda + dependsOn: [] + jobs: + - template: templates/jobs/win-ci-vs-2022-job.yml + parameters: + BuildConfig: 'RelWithDebInfo' + EnvSetupScript: setup_env_cuda.bat + buildArch: x64 + additionalBuildFlags: >- + --enable_pybind --build_java --build_nodejs --use_cuda --cuda_home="$(Agent.TempDirectory)\v${{ parameters.CudaVersion }}" + --enable_cuda_profiling --enable_transformers_tool_test + --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=86 + --cmake_extra_defines onnxruntime_BUILD_UNIT_TESTS=ON + --cmake_extra_defines onnxruntime_ENABLE_CUDA_EP_INTERNAL_TESTS=ON + msbuildPlatform: x64 + isX86: false + job_name_suffix: x64_RelWithDebInfo + RunOnnxRuntimeTests: ${{ parameters.RunOnnxRuntimeTests }} + ORT_EP_NAME: CUDA + WITH_CACHE: true + MachinePool: onnxruntime-Win2022-GPU-A10 \ No newline at end of file diff --git a/tools/ci_build/github/azure-pipelines/win-gpu-dml-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/win-gpu-dml-ci-pipeline.yml new file mode 100644 index 0000000000000..904979f39ca31 --- /dev/null +++ b/tools/ci_build/github/azure-pipelines/win-gpu-dml-ci-pipeline.yml @@ -0,0 +1,52 @@ +##### start trigger Don't edit it manually, Please do edit set-trigger-rules.py #### +trigger: + branches: + include: + - main + - rel-* + paths: + exclude: + - docs/** + - README.md + - CONTRIBUTING.md + - BUILD.md + - 'js/web' + - 'onnxruntime/core/providers/js' +pr: + branches: + include: + - main + - rel-* + paths: + exclude: + - docs/** + - README.md + - CONTRIBUTING.md + - BUILD.md + - 'js/web' + - 'onnxruntime/core/providers/js' +#### end trigger #### + +parameters: +- name: RunOnnxRuntimeTests + displayName: Run Tests? 
+ type: boolean + default: true + +stages: +- stage: dml + dependsOn: [] + jobs: + - template: templates/jobs/win-ci-vs-2022-job.yml + parameters: + BuildConfig: 'RelWithDebInfo' + EnvSetupScript: setup_env.bat + buildArch: x64 + additionalBuildFlags: --enable_pybind --use_dml --enable_wcos --use_winml + msbuildPlatform: x64 + isX86: false + job_name_suffix: x64_RelWithDebInfo + RunOnnxRuntimeTests: ${{ parameters.RunOnnxRuntimeTests }} + ORT_EP_NAME: DML + WITH_CACHE: false + MachinePool: onnxruntime-Win2022-GPU-dml-A10 \ No newline at end of file diff --git a/tools/ci_build/github/azure-pipelines/win-gpu-doc-gen-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/win-gpu-doc-gen-ci-pipeline.yml new file mode 100644 index 0000000000000..4106889331350 --- /dev/null +++ b/tools/ci_build/github/azure-pipelines/win-gpu-doc-gen-ci-pipeline.yml @@ -0,0 +1,61 @@ +##### start trigger Don't edit it manually, Please do edit set-trigger-rules.py #### +trigger: + branches: + include: + - main + - rel-* + paths: + exclude: + - docs/** + - README.md + - CONTRIBUTING.md + - BUILD.md + - 'js/web' + - 'onnxruntime/core/providers/js' +pr: + branches: + include: + - main + - rel-* + paths: + exclude: + - docs/** + - README.md + - CONTRIBUTING.md + - BUILD.md + - 'js/web' + - 'onnxruntime/core/providers/js' +#### end trigger #### + +parameters: +- name: CudaVersion + displayName: CUDA version + type: string + default: '12.2' + values: + - 11.8 + - 12.2 + +stages: +- stage: kernelDocumentation + dependsOn: [] + jobs: + - template: templates/jobs/win-ci-vs-2022-job.yml + parameters: + BuildConfig: 'RelWithDebInfo' + EnvSetupScript: setup_env_cuda.bat + buildArch: x64 + # note: need to specify `--gen_doc` when creating the build config so it has to be in additionalBuildFlags + additionalBuildFlags: >- + --gen_doc validate --skip_tests --enable_pybind --use_dml --use_cuda + --cuda_home="$(Agent.TempDirectory)\v${{ parameters.CudaVersion }}" + --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=86 + --cmake_extra_defines onnxruntime_BUILD_UNIT_TESTS=OFF + msbuildPlatform: x64 + isX86: false + job_name_suffix: x64_RelWithDebInfo + RunOnnxRuntimeTests: false + GenerateDocumentation: true + ORT_EP_NAME: CUDA # It doesn't really matter which EP is selected here since this stage is for documentation. + WITH_CACHE: true + MachinePool: onnxruntime-Win2022-GPU-A10 diff --git a/tools/ci_build/github/azure-pipelines/win-gpu-training-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/win-gpu-training-ci-pipeline.yml new file mode 100644 index 0000000000000..3bb6c267f0018 --- /dev/null +++ b/tools/ci_build/github/azure-pipelines/win-gpu-training-ci-pipeline.yml @@ -0,0 +1,63 @@ +##### start trigger Don't edit it manually, Please do edit set-trigger-rules.py #### +trigger: + branches: + include: + - main + - rel-* + paths: + exclude: + - docs/** + - README.md + - CONTRIBUTING.md + - BUILD.md + - 'js/web' + - 'onnxruntime/core/providers/js' +pr: + branches: + include: + - main + - rel-* + paths: + exclude: + - docs/** + - README.md + - CONTRIBUTING.md + - BUILD.md + - 'js/web' + - 'onnxruntime/core/providers/js' +#### end trigger #### + +parameters: +- name: CudaVersion + displayName: CUDA version + type: string + default: '12.2' + values: + - 11.8 + - 12.2 +- name: RunOnnxRuntimeTests + displayName: Run Tests? 
+ type: boolean + default: true + +stages: +- stage: training + dependsOn: [] + jobs: + - template: templates/jobs/win-ci-vs-2022-job.yml + parameters: + BuildConfig: 'RelWithDebInfo' + EnvSetupScript: setup_env_cuda.bat + buildArch: x64 + additionalBuildFlags: >- + --enable_pybind --enable_training --use_cuda --cuda_home="$(Agent.TempDirectory)\v${{ parameters.CudaVersion }}" + --skip_onnx_tests + --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=86 + msbuildPlatform: x64 + isX86: false + job_name_suffix: x64_RelWithDebInfo + RunOnnxRuntimeTests: ${{ parameters.RunOnnxRuntimeTests }} + ORT_EP_NAME: CUDA + WITH_CACHE: true + MachinePool: onnxruntime-Win2022-GPU-A10 + isTraining: true diff --git a/tools/ci_build/set-trigger-rules.py b/tools/ci_build/set-trigger-rules.py index d26fec41033ca..0d90061e9c687 100644 --- a/tools/ci_build/set-trigger-rules.py +++ b/tools/ci_build/set-trigger-rules.py @@ -34,7 +34,10 @@ "orttraining-linux-gpu-ortmodule-distributed-test-ci-pipeline.yml", "orttraining-mac-ci-pipeline.yml", "win-ci-pipeline.yml", - "win-gpu-ci-pipeline.yml", + "win-gpu-ci-dml-pipeline.yml", + "win-gpu-ci-cuda-pipeline.yml", + "win-gpu-ci-training-pipeline.yml", + "win-gpu-ci-doc-gen-pipeline.yml", "win-gpu-tensorrt-ci-pipeline.yml", "win-qnn-arm64-ci-pipeline.yml", "win-qnn-ci-pipeline.yml", From fb61e14153b6a1263c15ea3b62d6bbbc5bde9848 Mon Sep 17 00:00:00 2001 From: Hector Li Date: Fri, 26 Jul 2024 16:56:44 -0700 Subject: [PATCH 30/57] Add QNN EP option context_node_name_prefix to set EPContext node name prefix (#21236) ### Description Add QNN EP option context_node_name_prefix to set EPContext node name prefix ### Motivation and Context For the case to workaround QNN context PD memory limit, user need split the model into pieces and generate the QNN context model separately. It could happen that the generated EPContext node in separate graph has same node name. This will cause issue if glue those EPContext nodes together into a single model. To avoid this user can set this context_node_name_prefix for each split pieces to make the node name unique. --- .../onnxruntime_session_options_config_keys.h | 4 ++ .../providers/qnn/qnn_execution_provider.cc | 9 ++++- .../providers/qnn/qnn_execution_provider.h | 1 + .../test/providers/qnn/qnn_ep_context_test.cc | 39 +++++++++++++++++++ 4 files changed, 52 insertions(+), 1 deletion(-) diff --git a/include/onnxruntime/core/session/onnxruntime_session_options_config_keys.h b/include/onnxruntime/core/session/onnxruntime_session_options_config_keys.h index 17ae649e6f174..209fd4279cc99 100644 --- a/include/onnxruntime/core/session/onnxruntime_session_options_config_keys.h +++ b/include/onnxruntime/core/session/onnxruntime_session_options_config_keys.h @@ -265,6 +265,10 @@ static const char* const kOrtSessionOptionEpContextFilePath = "ep.context_file_p // "1": dump the EP context into the Onnx model. (default). static const char* const kOrtSessionOptionEpContextEmbedMode = "ep.context_embed_mode"; +// Specify the EPContext node name prefix to make it unique +// in case user need to merge/connect multiple EPContext nodes in one model +static const char* const kOrtSessionOptionEpContextNodeNamePrefix = "ep.context_node_name_prefix"; + // Gemm fastmath mode provides fp32 gemm acceleration with bfloat16 based matmul. // Option values: // - "0": Gemm FastMath mode is not enabled. 
[DEFAULT] diff --git a/onnxruntime/core/providers/qnn/qnn_execution_provider.cc b/onnxruntime/core/providers/qnn/qnn_execution_provider.cc index 539b456cb657f..c56a47e67497e 100644 --- a/onnxruntime/core/providers/qnn/qnn_execution_provider.cc +++ b/onnxruntime/core/providers/qnn/qnn_execution_provider.cc @@ -199,6 +199,13 @@ QNNExecutionProvider::QNNExecutionProvider(const ProviderOptions& provider_optio context_cache_path_cfg_ = session_options->config_options.GetConfigOrDefault(kOrtSessionOptionEpContextFilePath, ""); LOGS_DEFAULT(VERBOSE) << "User specified context cache path: " << context_cache_path_cfg_; + + // For the case that workaround QNN context PD memory limit, user need split the model into pieces and + // generate the QNN context model separately. + // It could happen that the generated EPContext node in separate graph has same node name. + // User can set this context_node_name_prefix for each split pieces to avoid that happens. + context_node_name_prefix_ = session_options->config_options.GetConfigOrDefault(kOrtSessionOptionEpContextNodeNamePrefix, ""); + LOGS_DEFAULT(VERBOSE) << "User specified QNN context node name prefix: " << context_node_name_prefix_; } static const std::string BACKEND_PATH = "backend_path"; @@ -613,7 +620,7 @@ QNNExecutionProvider::GetCapability(const onnxruntime::GraphViewer& graph_viewer const auto gen_metadef_name = [&]() { uint64_t model_hash; int metadef_id = metadef_id_generator_.GenerateId(graph_viewer, model_hash); - return MakeString(QNN, "_", model_hash, "_", metadef_id); + return MakeString(QNN, context_node_name_prefix_, "_", model_hash, "_", metadef_id); }; // For model with EPContext, make sure each partition only has one single EPContext node diff --git a/onnxruntime/core/providers/qnn/qnn_execution_provider.h b/onnxruntime/core/providers/qnn/qnn_execution_provider.h index e7419dabb14d1..f00ffb6cfdb96 100644 --- a/onnxruntime/core/providers/qnn/qnn_execution_provider.h +++ b/onnxruntime/core/providers/qnn/qnn_execution_provider.h @@ -80,6 +80,7 @@ class QNNExecutionProvider : public IExecutionProvider { std::unordered_map> qnn_models_; bool context_cache_enabled_ = false; std::string context_cache_path_cfg_ = ""; + std::string context_node_name_prefix_ = ""; bool disable_cpu_ep_fallback_ = false; // True if CPU EP fallback has been disabled for this session. 
bool qnn_context_embed_mode_ = true; int32_t vtcm_size_in_mb_ = 0; diff --git a/onnxruntime/test/providers/qnn/qnn_ep_context_test.cc b/onnxruntime/test/providers/qnn/qnn_ep_context_test.cc index a3768cb98f584..be3bd2cc5dcd7 100644 --- a/onnxruntime/test/providers/qnn/qnn_ep_context_test.cc +++ b/onnxruntime/test/providers/qnn/qnn_ep_context_test.cc @@ -279,6 +279,45 @@ TEST_F(QnnHTPBackendTests, QnnContextGeneration2InputsOrderIssue) { ASSERT_EQ(std::remove(context_binary_file.c_str()), 0); } +TEST_F(QnnHTPBackendTests, QnnContextGenerationNodeNamePrefix) { + ProviderOptions provider_options; +#if defined(_WIN32) + provider_options["backend_path"] = "QnnHtp.dll"; +#else + provider_options["backend_path"] = "libQnnHtp.so"; +#endif + std::string node_name_prefix = "node_name_prefix_test"; + + // Add kMSDomain to cover contrib op like Gelu + const std::unordered_map domain_to_version = {{"", 13}, {kMSDomain, 1}}; + + auto& logging_manager = DefaultLoggingManager(); + logging_manager.SetDefaultLoggerSeverity(logging::Severity::kERROR); + + const std::string context_binary_file = "./qnn_ctx_2_inputs_order_test_gen.onnx"; + Ort::SessionOptions so; + so.AddConfigEntry(kOrtSessionOptionEpContextEnable, "1"); + so.AddConfigEntry(kOrtSessionOptionEpContextFilePath, context_binary_file.c_str()); + so.AddConfigEntry(kOrtSessionOptionEpContextNodeNamePrefix, node_name_prefix.c_str()); + so.AppendExecutionProvider("QNN", provider_options); + + Ort::Session session(*ort_env, ORT_TSTR("testdata/qnn_ctx_2_inputs_order_test.onnx"), so); + + // Make sure the Qnn context cache binary file is generated + EXPECT_TRUE(std::filesystem::exists(context_binary_file.c_str())); + + std::shared_ptr model; + ASSERT_STATUS_OK(Model::Load(ToPathString(context_binary_file), model, nullptr, DefaultLoggingManager().DefaultLogger())); + for (auto& node : model->MainGraph().Nodes()) { + if (node.OpType() == "EPContext") { + EXPECT_TRUE(node.Name().find(node_name_prefix) != std::string::npos); + } + } + + // clean up + ASSERT_EQ(std::remove(context_binary_file.c_str()), 0); +} + // Run QDQ model on HTP 3 times // 1st run will generate the Qnn context cache onnx file // 2nd run directly loads and run from Qnn context cache model From 64819f6f8cad8387b23d7cc8af1a4b4207e2dfbb Mon Sep 17 00:00:00 2001 From: Tianlei Wu Date: Fri, 26 Jul 2024 18:45:14 -0700 Subject: [PATCH 31/57] Update benchmark_mha.py to compare with PyTorch SDPA (#21449) ### Description * Update benchmark_mha.py to compare with PyTorch SDPA api. * Write results to csv file. * Use sdpa_kernel cuda provider option instead of environment variables for better control. * Add arguments (`--use_gpu`, `--causal` etc) to allow testing different senarios. * Update benchmark_mha.sh to add cpu benchmarks For Q,K,V format, torch uses BNSH format, while ort uses BSNH format, so the result is not apple-to-apple. However, if the latency difference is large, that could be a warning. #### Example GPU results Example results on A100-SXM4-80GB with settings (use_gpu=TRUE, enable_cuda_graph=FALSE, causal=FALSE, past_sequence_length=0, intra_op_num_threads=0) in Azure Linux. ORT: build from source with CUDA 12.5; PyTorch 2.3.1 for cuda 12.1. 
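For reference, the table below was produced with the benchmark script updated in this PR. A rough sketch of the GPU commands (see benchmark_mha.sh in this change for the full set; the exact settings behind this table may differ slightly):

```
export CUDA_VISIBLE_DEVICES=0

# ORT MultiHeadAttention kernels (default/flash/efficient); results are written to csv
python benchmark_mha.py --use_gpu

# PyTorch SDPA backends for comparison
python benchmark_mha.py --use_gpu --torch

# merge the per-run csv files into one results file
cat benchmark_mha_gpu_*.csv > mha_gpu_benchmark_results.csv
```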
format | batch_size | sequence_length | num_heads | head_size | latency (s) | tflops | kernel -- | -- | -- | -- | -- | -- | -- | -- Q,KV | 4 | 2048 | 32 | 128 | 0.0015 | 179.5 | ort:flash Q,KV | 4 | 2048 | 32 | 128 | 0.0015 | 179.0 | ort:default Q,K,V | 4 | 2048 | 32 | 128 | 0.0016 | 170.0 | ort:default Q,K,V | 4 | 2048 | 32 | 128 | 0.0016 | 169.5 | ort:flash QKV | 4 | 2048 | 32 | 128 | 0.0016 | 168.5 | ort:default QKV | 4 | 2048 | 32 | 128 | 0.0016 | 167.4 | ort:flash Q,K,V | 4 | 2048 | 32 | 128 | 0.0017 | 159.4 | torch:default Q,K,V | 4 | 2048 | 32 | 128 | 0.0018 | 155.0 | torch:flash Q,KV | 4 | 2048 | 32 | 128 | 0.0030 | 92.7 | ort:efficient Q,K,V | 4 | 2048 | 32 | 128 | 0.0030 | 90.9 | ort:efficient QKV | 4 | 2048 | 32 | 128 | 0.0031 | 89.9 | ort:efficient Q,K,V | 4 | 2048 | 32 | 128 | 0.0031 | 89.0 | torch:efficient Q,K,V | 4 | 2048 | 32 | 128 | 0.0054 | 51.3 | torch:math Q,KV | 4 | 4096 | 32 | 128 | 0.0058 | 191.0 | ort:default Q,KV | 4 | 4096 | 32 | 128 | 0.0058 | 190.6 | ort:flash Q,K,V | 4 | 4096 | 32 | 128 | 0.0059 | 187.8 | ort:default Q,K,V | 4 | 4096 | 32 | 128 | 0.0059 | 186.7 | ort:flash QKV | 4 | 4096 | 32 | 128 | 0.0059 | 185.9 | ort:flash QKV | 4 | 4096 | 32 | 128 | 0.0059 | 185.8 | ort:default Q,K,V | 4 | 4096 | 32 | 128 | 0.0067 | 163.4 | torch:default Q,K,V | 4 | 4096 | 32 | 128 | 0.0070 | 157.2 | torch:flash Q,KV | 4 | 4096 | 32 | 128 | 0.0113 | 97.6 | ort:efficient Q,K,V | 4 | 4096 | 32 | 128 | 0.0114 | 96.4 | ort:efficient QKV | 4 | 4096 | 32 | 128 | 0.0114 | 96.2 | ort:efficient Q,K,V | 4 | 4096 | 32 | 128 | 0.0127 | 86.3 | torch:efficient Q,KV | 8 | 2048 | 32 | 128 | 0.0031 | 177.8 | ort:flash Q,KV | 8 | 2048 | 32 | 128 | 0.0031 | 177.7 | ort:default Q,K,V | 8 | 2048 | 32 | 128 | 0.0032 | 170.8 | ort:default Q,K,V | 8 | 2048 | 32 | 128 | 0.0032 | 170.3 | ort:flash QKV | 8 | 2048 | 32 | 128 | 0.0032 | 169.2 | ort:default QKV | 8 | 2048 | 32 | 128 | 0.0033 | 169.0 | ort:flash Q,K,V | 8 | 2048 | 32 | 128 | 0.0034 | 161.9 | torch:default Q,K,V | 8 | 2048 | 32 | 128 | 0.0036 | 152.9 | torch:flash Q,KV | 8 | 2048 | 32 | 128 | 0.0059 | 93.5 | ort:efficient Q,K,V | 8 | 2048 | 32 | 128 | 0.0060 | 91.3 | ort:efficient QKV | 8 | 2048 | 32 | 128 | 0.0060 | 91.0 | ort:efficient Q,K,V | 8 | 2048 | 32 | 128 | 0.0064 | 86.0 | torch:efficient Q,KV | 8 | 4096 | 32 | 128 | 0.0115 | 190.8 | ort:flash Q,KV | 8 | 4096 | 32 | 128 | 0.0115 | 190.7 | ort:default Q,K,V | 8 | 4096 | 32 | 128 | 0.0118 | 187.1 | ort:default Q,K,V | 8 | 4096 | 32 | 128 | 0.0118 | 187.0 | ort:flash QKV | 8 | 4096 | 32 | 128 | 0.0118 | 185.6 | ort:default QKV | 8 | 4096 | 32 | 128 | 0.0118 | 185.6 | ort:flash Q,K,V | 8 | 4096 | 32 | 128 | 0.0139 | 158.7 | torch:default Q,K,V | 8 | 4096 | 32 | 128 | 0.0139 | 158.3 | torch:flash Q,KV | 8 | 4096 | 32 | 128 | 0.0225 | 97.7 | ort:efficient Q,K,V | 8 | 4096 | 32 | 128 | 0.0227 | 96.8 | ort:efficient QKV | 8 | 4096 | 32 | 128 | 0.0228 | 96.3 | ort:efficient Q,K,V | 8 | 4096 | 32 | 128 | 0.0260 | 84.5 | torch:efficient #### Example CPU results Dell XPS 8960 with i9-13900 CPU (use_gpu=FALSE, causal=FALSE, past_sequence_length=0) in Windows. ORT: build from source with CUDA 12.5; PyTorch 2.3.1 for cuda 12.1. 
format | causal | batch_size | seq_len | num_heads | head_size | threads | latency (s) | kernel -- | -- | -- | -- | -- | -- | -- | -- | -- Q,K,V | FALSE | 1 | 128 | 32 | 128 | 8 | 0.0005 | ort:flash Q,K,V | FALSE | 1 | 128 | 32 | 128 | 0 | 0.0009 | ort:flash Q,K,V | FALSE | 1 | 128 | 32 | 128 | 0 | 0.0009 | ort:math Q,K,V | FALSE | 1 | 128 | 32 | 128 | 4 | 0.0009 | ort:flash Q,K,V | FALSE | 1 | 128 | 32 | 128 | 2 | 0.0014 | ort:flash Q,K,V | FALSE | 1 | 128 | 32 | 128 | 1 | 0.0025 | ort:flash Q,K,V | FALSE | 1 | 128 | 32 | 128 | 2 | 0.0045 | torch:default Q,K,V | FALSE | 1 | 128 | 32 | 128 | 24 | 0.0046 | torch:default Q,K,V | FALSE | 1 | 128 | 32 | 128 | 8 | 0.0046 | torch:default Q,K,V | FALSE | 1 | 128 | 32 | 128 | 4 | 0.0046 | torch:default Q,K,V | FALSE | 1 | 128 | 32 | 128 | 1 | 0.0047 | torch:default Q,K,V | FALSE | 1 | 256 | 32 | 128 | 0 | 0.0019 | ort:flash Q,K,V | FALSE | 1 | 256 | 32 | 128 | 8 | 0.0019 | ort:flash Q,K,V | FALSE | 1 | 256 | 32 | 128 | 0 | 0.0022 | ort:math Q,K,V | FALSE | 1 | 256 | 32 | 128 | 4 | 0.0030 | ort:flash Q,K,V | FALSE | 1 | 256 | 32 | 128 | 2 | 0.0047 | ort:flash Q,K,V | FALSE | 1 | 256 | 32 | 128 | 1 | 0.0086 | ort:flash Q,K,V | FALSE | 1 | 256 | 32 | 128 | 2 | 0.0161 | torch:default Q,K,V | FALSE | 1 | 256 | 32 | 128 | 4 | 0.0162 | torch:default Q,K,V | FALSE | 1 | 256 | 32 | 128 | 8 | 0.0162 | torch:default Q,K,V | FALSE | 1 | 256 | 32 | 128 | 24 | 0.0165 | torch:default Q,K,V | FALSE | 1 | 256 | 32 | 128 | 1 | 0.0166 | torch:default Q,K,V | FALSE | 1 | 512 | 32 | 128 | 8 | 0.0077 | ort:flash Q,K,V | FALSE | 1 | 512 | 32 | 128 | 0 | 0.0091 | ort:flash Q,K,V | FALSE | 1 | 512 | 32 | 128 | 0 | 0.0099 | ort:math Q,K,V | FALSE | 1 | 512 | 32 | 128 | 4 | 0.0103 | ort:flash Q,K,V | FALSE | 1 | 512 | 32 | 128 | 2 | 0.0177 | ort:flash Q,K,V | FALSE | 1 | 512 | 32 | 128 | 1 | 0.0328 | ort:flash Q,K,V | FALSE | 1 | 512 | 32 | 128 | 2 | 0.0624 | torch:default Q,K,V | FALSE | 1 | 512 | 32 | 128 | 4 | 0.0624 | torch:default Q,K,V | FALSE | 1 | 512 | 32 | 128 | 8 | 0.0625 | torch:default Q,K,V | FALSE | 1 | 512 | 32 | 128 | 24 | 0.0626 | torch:default Q,K,V | FALSE | 1 | 512 | 32 | 128 | 1 | 0.0640 | torch:default Q,K,V | FALSE | 1 | 1024 | 32 | 128 | 8 | 0.0286 | ort:flash Q,K,V | FALSE | 1 | 1024 | 32 | 128 | 0 | 0.0317 | ort:flash Q,K,V | FALSE | 1 | 1024 | 32 | 128 | 4 | 0.0367 | ort:flash Q,K,V | FALSE | 1 | 1024 | 32 | 128 | 0 | 0.0391 | ort:math Q,K,V | FALSE | 1 | 1024 | 32 | 128 | 2 | 0.0656 | ort:flash Q,K,V | FALSE | 1 | 1024 | 32 | 128 | 1 | 0.1235 | ort:flash Q,K,V | FALSE | 1 | 1024 | 32 | 128 | 24 | 0.2482 | torch:default Q,K,V | FALSE | 1 | 1024 | 32 | 128 | 2 | 0.2483 | torch:default Q,K,V | FALSE | 1 | 1024 | 32 | 128 | 4 | 0.2483 | torch:default Q,K,V | FALSE | 1 | 1024 | 32 | 128 | 8 | 0.2486 | torch:default Q,K,V | FALSE | 1 | 1024 | 32 | 128 | 1 | 0.2538 | torch:default Q,K,V | FALSE | 1 | 2048 | 32 | 128 | 0 | 0.1038 | ort:flash Q,K,V | FALSE | 1 | 2048 | 32 | 128 | 8 | 0.1050 | ort:flash Q,K,V | FALSE | 1 | 2048 | 32 | 128 | 0 | 0.1368 | ort:math Q,K,V | FALSE | 1 | 2048 | 32 | 128 | 4 | 0.1535 | ort:flash Q,K,V | FALSE | 1 | 2048 | 32 | 128 | 2 | 0.2461 | ort:flash Q,K,V | FALSE | 1 | 2048 | 32 | 128 | 1 | 0.4724 | ort:flash Q,K,V | FALSE | 1 | 2048 | 32 | 128 | 8 | 0.9835 | torch:default Q,K,V | FALSE | 1 | 2048 | 32 | 128 | 4 | 0.9841 | torch:default Q,K,V | FALSE | 1 | 2048 | 32 | 128 | 24 | 0.9841 | torch:default Q,K,V | FALSE | 1 | 2048 | 32 | 128 | 2 | 0.9873 | torch:default Q,K,V | FALSE | 1 | 2048 | 32 | 128 | 1 | 0.9985 | 
torch:default ### Motivation and Context To compare with PyTorch SDPA on CPU and CUDA latency. --- .../python/transformers/benchmark_mha.cmd | 47 ++ .../test/python/transformers/benchmark_mha.py | 690 +++++++++++++----- .../test/python/transformers/benchmark_mha.sh | 48 +- .../test/python/transformers/test_mha.py | 46 +- 4 files changed, 609 insertions(+), 222 deletions(-) create mode 100644 onnxruntime/test/python/transformers/benchmark_mha.cmd diff --git a/onnxruntime/test/python/transformers/benchmark_mha.cmd b/onnxruntime/test/python/transformers/benchmark_mha.cmd new file mode 100644 index 0000000000000..0a6d0c37b4a35 --- /dev/null +++ b/onnxruntime/test/python/transformers/benchmark_mha.cmd @@ -0,0 +1,47 @@ +echo "Benchmark Scaled Dot Product Attention (SDPA) performance on GPU:" + +set CUDA_VISIBLE_DEVICES=0 +python benchmark_mha.py --use_gpu +python benchmark_mha.py --use_gpu --use_cuda_graph +python benchmark_mha.py --use_gpu --torch + +type benchmark_mha_gpu_*.csv > mha_gpu_benchmark_results.csv + +echo "Benchmark performance on CPU with number of threads:" +set MKL_DYNAMIC=FALSE +set OMP_NUM_THREADS=1 +python benchmark_mha.py --torch + +set OMP_NUM_THREADS=2 +python benchmark_mha.py --torch + +set OMP_NUM_THREADS=4 +python benchmark_mha.py --torch + +set OMP_NUM_THREADS=8 +python benchmark_mha.py --torch + +set MKL_DYNAMIC= +set OMP_NUM_THREADS= + +set ORT_DISABLE_FLASH_ATTENTION=0 +python benchmark_mha.py --intra_op_num_threads 1 +python benchmark_mha.py --intra_op_num_threads 2 +python benchmark_mha.py --intra_op_num_threads 4 +python benchmark_mha.py --intra_op_num_threads 8 + +echo "Benchmark performance on CPU with default threads settings:" +python benchmark_mha.py + +python benchmark_mha.py --torch + +python benchmark_mha.py --causal +python benchmark_mha.py --torch --causal + +python benchmark_mha.py --causal --has_past + +set ORT_DISABLE_FLASH_ATTENTION=1 +python benchmark_mha.py +set ORT_DISABLE_FLASH_ATTENTION= + +type benchmark_mha_cpu_*.csv > mha_cpu_benchmark_results.csv diff --git a/onnxruntime/test/python/transformers/benchmark_mha.py b/onnxruntime/test/python/transformers/benchmark_mha.py index 111c417479d20..715a92431e6bf 100644 --- a/onnxruntime/test/python/transformers/benchmark_mha.py +++ b/onnxruntime/test/python/transformers/benchmark_mha.py @@ -4,21 +4,35 @@ # -------------------------------------------------------------------------- """ -Benchmark performance of MultiHeadAttention with Nvidia GPU of Compute Capability 8.0, 8.6 or 8.9 in Linux: -sh benchmark_mha.sh +Benchmark performance of MultiHeadAttention with ORT or PyTorch. 
+ +In Linux, run the the following: + sh benchmark_mha.sh + +In Windows, run the the following: + benchmark_mha.cmd """ +import argparse +import csv import math import os import platform import statistics import time -from typing import List, Optional +from contextlib import nullcontext +from datetime import datetime +from enum import IntEnum +from typing import Callable, Dict, List, Optional, Tuple import torch +import torch.utils.benchmark as benchmark from onnx import TensorProto, helper +from packaging.version import Version +from torch.nn.attention import SDPBackend, sdpa_kernel +from torch.nn.functional import scaled_dot_product_attention -from onnxruntime import InferenceSession, get_available_providers +from onnxruntime import InferenceSession, SessionOptions, get_available_providers from onnxruntime.transformers.io_binding_helper import CudaSession @@ -43,6 +57,20 @@ def get_name_list() -> List[str]: return ["Q,K,V", "QKV", "Q,KV", "Q,K',V'"] +class SdpaKernel(IntEnum): + """Bit flags for sdpa_kernel CUDA provider option""" + + DEFAULT = 0 + FLASH_ATTENTION = 1 + EFFICIENT_ATTENTION = 2 + TRT_FUSED_ATTENTION = 4 + CUDNN_FLASH_ATTENTION = 8 + MATH = 16 + TRT_FLASH_ATTENTION = 32 + TRT_CROSS_ATTENTION = 64 + TRT_CAUSAL_ATTENTION = 128 + + class MultiHeadAttentionConfig: def __init__( self, @@ -62,6 +90,7 @@ def __init__( use_kv_cache: bool = False, share_past_present_buffer: bool = False, input_format: int = InputFormats.Q_K_V_BSNH_BSNH_BSNH, + verbose: bool = False, ): self.operator = "MultiHeadAttention" self.batch_size = batch_size @@ -100,6 +129,7 @@ def __init__( self.input_format = input_format self.is_packed_qkv = input_format == InputFormats.QKV_BSN3H self.is_packed_kv = input_format == InputFormats.Q_KV_BSNH_BSN2H + self.verbose = verbose def __repr__(self): return ( @@ -114,89 +144,93 @@ def __repr__(self): ) def shape_dict(self, input_format=None): + shapes: Dict[str, Tuple] = { + "output": (self.batch_size, self.sequence_length, self.num_heads * self.head_size), + } + input_format = input_format or self.input_format - if input_format == InputFormats.Q_K_V_BSNH_BNSH_BNSH: - # cross attention does not have past state - return { + if input_format == InputFormats.QKV_BSN3H: + shapes = { + **shapes, + "query": (self.batch_size, self.sequence_length, self.num_heads, 3, self.head_size), + } + elif input_format == InputFormats.Q_KV_BSNH_BSN2H: + shapes = { + **shapes, + "query": (self.batch_size, self.sequence_length, self.num_heads * self.head_size), + "key": (self.batch_size, self.sequence_length, self.num_heads, 2, self.head_size), + } + elif input_format == InputFormats.Q_K_V_BSNH_BSNH_BSNH: + shapes = { + **shapes, + "query": (self.batch_size, self.sequence_length, self.num_heads * self.head_size), + "key": (self.batch_size, self.sequence_length, self.num_heads * self.head_size), + "value": (self.batch_size, self.sequence_length, self.num_heads * self.head_size), + } + else: + assert input_format == InputFormats.Q_K_V_BSNH_BNSH_BNSH + shapes = { + **shapes, "query": (self.batch_size, self.sequence_length, self.num_heads * self.head_size), "key": (self.batch_size, self.num_heads, self.sequence_length, self.head_size), "value": (self.batch_size, self.num_heads, self.sequence_length, self.head_size), - "output": (self.batch_size, self.sequence_length, self.num_heads * self.head_size), } if self.use_kv_cache: + assert input_format != InputFormats.Q_K_V_BSNH_BNSH_BNSH, "cross attention shall not have past state" shapes = { + **shapes, "past_key": (self.batch_size, 
self.num_heads, self.past_buffer_length, self.head_size), "past_value": (self.batch_size, self.num_heads, self.past_buffer_length, self.head_size), - "output": (self.batch_size, self.sequence_length, self.num_heads * self.head_size), "present_key": (self.batch_size, self.num_heads, self.present_buffer_length, self.head_size), "present_value": (self.batch_size, self.num_heads, self.present_buffer_length, self.head_size), } - else: - shapes = { - "output": (self.batch_size, self.sequence_length, self.num_heads * self.head_size), - } - if input_format == InputFormats.QKV_BSN3H: - shapes.update({"query": (self.batch_size, self.sequence_length, self.num_heads, 3, self.head_size)}) - elif input_format == InputFormats.Q_KV_BSNH_BSN2H: - shapes.update( - { - "query": (self.batch_size, self.sequence_length, self.num_heads * self.head_size), - "key": (self.batch_size, self.sequence_length, self.num_heads, 2, self.head_size), - } - ) - else: # input_format == InputFormats.Q_K_V_BSNH_BSNH_BSNH - shapes.update( - { - "query": (self.batch_size, self.sequence_length, self.num_heads * self.head_size), - "key": (self.batch_size, self.sequence_length, self.num_heads * self.head_size), - "value": (self.batch_size, self.sequence_length, self.num_heads * self.head_size), - } - ) return shapes def symbolic_shape_dict(self, input_format=None): + shapes: Dict[str, Tuple] = { + "output": ("batch_size", "sequence_length", self.num_heads * self.head_size), + } + input_format = input_format or self.input_format - if input_format == InputFormats.Q_K_V_BSNH_BNSH_BNSH: - # cross attention does not have past state - return { + if input_format == InputFormats.QKV_BSN3H: + shapes = { + **shapes, + "query": ("batch_size", "sequence_length", self.num_heads, 3, self.head_size), + } + elif input_format == InputFormats.Q_KV_BSNH_BSN2H: + shapes = { + **shapes, + "query": ("batch_size", "sequence_length", self.num_heads * self.head_size), + "key": ("batch_size", "sequence_length", self.num_heads, 2, self.head_size), + } + elif input_format == InputFormats.Q_K_V_BSNH_BSNH_BSNH: + shapes = { + **shapes, + "query": ("batch_size", "sequence_length", self.num_heads * self.head_size), + "key": ("batch_size", "sequence_length", self.num_heads * self.head_size), + "value": ("batch_size", "sequence_length", self.num_heads * self.head_size), + } + else: + assert input_format == InputFormats.Q_K_V_BSNH_BNSH_BNSH + shapes = { + **shapes, "query": ("batch_size", "sequence_length", self.num_heads * self.head_size), "key": ("batch_size", self.num_heads, "sequence_length", self.head_size), "value": ("batch_size", self.num_heads, "sequence_length", self.head_size), - "output": ("batch_size", "sequence_length", self.num_heads * self.head_size), } if self.use_kv_cache: + assert input_format != InputFormats.Q_K_V_BSNH_BNSH_BNSH, "cross attention shall not have past state" shapes = { + **shapes, "past_key": ("batch_size", self.num_heads, "past_buffer_length", self.head_size), "past_value": ("batch_size", self.num_heads, "past_buffer_length", self.head_size), - "output": ("batch_size", "sequence_length", self.num_heads * self.head_size), "present_key": ("batch_size", self.num_heads, "present_buffer_length", self.head_size), "present_value": ("batch_size", self.num_heads, "present_buffer_length", self.head_size), } - else: - shapes = { - "output": ("batch_size", "sequence_length", self.num_heads * self.head_size), - } - if input_format == InputFormats.QKV_BSN3H: - shapes.update({"query": ("batch_size", "sequence_length", self.num_heads, 3, 
self.head_size)}) - elif input_format == InputFormats.Q_KV_BSNH_BSN2H: - shapes.update( - { - "query": ("batch_size", "sequence_length", self.num_heads * self.head_size), - "key": ("batch_size", "sequence_length", self.num_heads, 2, self.head_size), - } - ) - else: # input_format == InputFormats.Q_K_V_BSNH_BSNH_BSNH - shapes.update( - { - "query": ("batch_size", "sequence_length", self.num_heads * self.head_size), - "key": ("batch_size", "sequence_length", self.num_heads * self.head_size), - "value": ("batch_size", "sequence_length", self.num_heads * self.head_size), - } - ) return shapes def random_inputs(self, seed: int = 123): @@ -215,44 +249,42 @@ def random_inputs(self, seed: int = 123): k_bnsh = k.transpose(1, 2) v_bnsh = v.transpose(1, 2) - if self.input_format == InputFormats.Q_K_V_BSNH_BNSH_BNSH: - return { + if self.input_format == InputFormats.Q_K_V_BSNH_BSNH_BSNH: + feeds = { "query": q.reshape(shape_dict["query"]), - "key": k_bnsh.contiguous(), - "value": v_bnsh.contiguous(), + "key": k.reshape(shape_dict["key"]), + "value": v.reshape(shape_dict["value"]), } - - feeds = {} - if self.use_kv_cache: - feeds.update( - { - "past_key": torch.empty(shape_dict["past_key"], device=device, dtype=dtype).normal_( - mean=0, std=0.1 - ), - "past_value": torch.empty(shape_dict["past_value"], device=device, dtype=dtype).normal_( - mean=0, std=0.1 - ), - } - ) - - if self.input_format == InputFormats.Q_K_V_BSNH_BSNH_BSNH: - feeds.update( - { - "query": q.reshape(shape_dict["query"]), - "key": k.reshape(shape_dict["key"]), - "value": v.reshape(shape_dict["value"]), - } - ) elif self.input_format == InputFormats.QKV_BSN3H: query = q.view(self.batch_size * self.sequence_length, self.num_heads, self.head_size) key = k.view(self.batch_size * self.sequence_length, self.num_heads, self.head_size) value = v.view(self.batch_size * self.sequence_length, self.num_heads, self.head_size) - feeds["query"] = torch.dstack((query, key, value)).reshape(shape_dict["query"]).contiguous() + feeds = { + "query": torch.dstack((query, key, value)).reshape(shape_dict["query"]).contiguous(), + } elif self.input_format == InputFormats.Q_KV_BSNH_BSN2H: key = k.view(self.batch_size * self.sequence_length, self.num_heads, self.head_size) value = v.view(self.batch_size * self.sequence_length, self.num_heads, self.head_size) - feeds["query"] = q.reshape(shape_dict["query"]) - feeds["key"] = torch.dstack((key, value)).reshape(shape_dict["key"]).contiguous() + feeds = { + "query": q.reshape(shape_dict["query"]), + "key": torch.dstack((key, value)).reshape(shape_dict["key"]).contiguous(), + } + else: + assert self.input_format == InputFormats.Q_K_V_BSNH_BNSH_BNSH + feeds = { + "query": q.reshape(shape_dict["query"]), + "key": k_bnsh.contiguous(), + "value": v_bnsh.contiguous(), + } + + if self.use_kv_cache: + feeds = { + **feeds, + "past_key": torch.empty(shape_dict["past_key"], device=device, dtype=dtype).normal_(mean=0, std=0.1), + "past_value": torch.empty(shape_dict["past_value"], device=device, dtype=dtype).normal_( + mean=0, std=0.1 + ), + } return feeds @@ -318,19 +350,32 @@ def create_multi_head_attention_onnx_model(config: MultiHeadAttentionConfig, use return model.SerializeToString() -def create_session( +def create_ort_session( config: MultiHeadAttentionConfig, + session_options=None, + attention_kernel=SdpaKernel.DEFAULT, + use_symbolic_shape: bool = True, ) -> CudaSession: - onnx_model_str = create_multi_head_attention_onnx_model(config) + if config.verbose: + print(f"create session for {vars(config)}") + 
onnx_model_str = create_multi_head_attention_onnx_model(config, use_symbolic_shape=use_symbolic_shape) if config.provider == "CUDAExecutionProvider": device_id = torch.cuda.current_device() if isinstance(config.device, str) else config.device.index provider_options = CudaSession.get_cuda_provider_options(device_id, config.enable_cuda_graph) + provider_options["sdpa_kernel"] = int(attention_kernel) providers = [(config.provider, provider_options), "CPUExecutionProvider"] else: providers = ["CPUExecutionProvider"] - ort_session = InferenceSession(onnx_model_str, providers=providers) + ort_session = InferenceSession(onnx_model_str, session_options, providers=providers) + return ort_session + + +def create_session( + config: MultiHeadAttentionConfig, session_options=None, attention_kernel=SdpaKernel.DEFAULT +) -> CudaSession: + ort_session = create_ort_session(config, session_options, attention_kernel, use_symbolic_shape=False) cuda_session = CudaSession(ort_session, config.device, config.enable_cuda_graph) shape_dict = config.shape_dict() cuda_session.allocate_buffers(shape_dict) @@ -340,11 +385,8 @@ def create_session( class OrtMultiHeadAttention: """A wrapper of ORT MultiHeadAttention to test relevance and performance.""" - def __init__( - self, - config: MultiHeadAttentionConfig, - ): - self.ort_session = create_session(config) + def __init__(self, config: MultiHeadAttentionConfig, session_options=None): + self.ort_session = create_session(config, session_options) self.feed_dict = config.random_inputs() def infer(self): @@ -363,53 +405,90 @@ def flops(batch, sequence_length, head_size, num_heads, causal): def tflops_per_second(flop, time): - return (flop / time / 10**12) if not math.isnan(time) else 0.0 - - -def get_gpu_kernel_name(config: MultiHeadAttentionConfig) -> str: - # This classification is for Nvidia GPU of Compute Capability 8.* like A100. - # Note that some kernel might not exist in older or newer GPUs. - if os.getenv("ORT_DISABLE_FLASH_ATTENTION") != "1": - if config.input_format == InputFormats.QKV_BSN3H: - min_seq_len = os.getenv("ORT_MIN_SEQ_LEN_FLASH_ATTENTION_PACKED_QKV") - min_length = int(min_seq_len) if min_seq_len is not None else 513 - if config.sequence_length >= min_length: - return "Flash" - else: - return "Flash" + try: + return (flop / time / 10**12) if not math.isnan(time) else 0.0 + except ZeroDivisionError: + return None + + +def get_gpu_kernel_name(attention_kernel: SdpaKernel) -> str: + kernel_names = { + SdpaKernel.DEFAULT: "ort:default", + SdpaKernel.FLASH_ATTENTION: "ort:flash", + SdpaKernel.EFFICIENT_ATTENTION: "ort:efficient", + SdpaKernel.CUDNN_FLASH_ATTENTION: "ort:cudnn", + SdpaKernel.MATH: "ort:math", + } + assert attention_kernel in kernel_names + return kernel_names[attention_kernel] - if (os.getenv("ORT_DISABLE_FUSED_CROSS_ATTENTION") != "1" and config.kv_sequence_length <= 128) or ( - os.getenv("ORT_DISABLE_FUSED_ATTENTION") != "1" - and (config.sequence_length <= 384 or os.getenv("ORT_DISABLE_TRT_FLASH_ATTENTION") != "1") - ): - return "TRT" - if os.getenv("ORT_DISABLE_MEMORY_EFFICIENT_ATTENTION") != "1": - return "MemEff" +def get_cpu_kernel_name(config: MultiHeadAttentionConfig) -> str: + # CPU Flash Attention does not support causal and kv cache etc. 
+ if not (config.causal or config.use_kv_cache or config.past_sequence_length > 0): + if os.getenv("ORT_DISABLE_FLASH_ATTENTION") != "1": + return "ort:flash" - return "Unfused" + return "ort:math" -def get_cpu_kernel_name() -> str: - if os.getenv("ORT_DISABLE_FLASH_ATTENTION") != "1": - return "CPU:Flash" - return "CPU:Unfused" +# ------------------------------------------------------------------ +# Functions for benchmarking PyTorch SDPA +# ------------------------------------------------------------------ +def benchmark_torch_function(func: Callable, *args, **kwargs) -> float: + warmup = 5 + repeats = 100 + for _ in range(warmup): + func(*args, **kwargs) + timer = benchmark.Timer( + stmt="func(*args, **kwargs)", + globals={"args": args, "kwargs": kwargs, "func": func}, + ) + + return timer.timeit(number=repeats).median -def run_tflops_test(use_gpu: bool = True, enable_cuda_graph: bool = False, repeats: int = 100): - if use_gpu: - device_id = torch.cuda.current_device() - device = torch.device("cuda", device_id) - formats = [InputFormats.Q_K_V_BSNH_BSNH_BSNH, InputFormats.Q_KV_BSNH_BSN2H, InputFormats.QKV_BSN3H] - provider = "CUDAExecutionProvider" - print(f"enable_cuda_graph={enable_cuda_graph}") - else: - device_id = 0 - device = torch.device("cpu") - formats = [InputFormats.Q_K_V_BSNH_BSNH_BSNH] - enable_cuda_graph = False - provider = "CPUExecutionProvider" +def run_torch_sdpa( + batch_size: int, + q_seq_len: int, + kv_seq_len: int, + num_heads: int, + head_size: int, + causal: bool, + device, + dtype, + has_mask: bool = False, + mask_dim: int = 2, + mask_dtype=torch.bool, + backend: Optional[int] = None, +): + q_shape = (batch_size, num_heads, q_seq_len, head_size) + kv_shape = (batch_size, num_heads, kv_seq_len, head_size) + q = torch.randn(q_shape, device=device, dtype=dtype) + k = torch.randn(kv_shape, device=device, dtype=dtype) + v = torch.randn(kv_shape, device=device, dtype=dtype) + + attn_mask = None + if has_mask: + mask_shape = (batch_size, num_heads, q_seq_len, kv_seq_len) if mask_dim == 4 else (q_seq_len, kv_seq_len) + attn_mask = torch.ones(mask_shape, dtype=mask_dtype, device=device) + + context = sdpa_kernel(backend) if backend is not None else nullcontext() + + with context: + average_latency = benchmark_torch_function( + scaled_dot_product_attention, + q, + k, + v, + is_causal=causal, + attn_mask=attn_mask, + ) + return average_latency + + +def get_test_configs(use_gpu: bool = True): if use_gpu: # (batch_size, sequence_length, past_sequence_length, num_heads, head_size, run_unfused) configs = [ @@ -450,31 +529,70 @@ def run_tflops_test(use_gpu: bool = True, enable_cuda_graph: bool = False, repea ] else: configs = [ + # TNLGv4 (1, 128, 0, 32, 128, True), (1, 256, 0, 32, 128, True), (1, 512, 0, 32, 128, True), (1, 1024, 0, 32, 128, True), - (1, 2048, 0, 32, 128, True), + # (1, 2048, 0, 32, 128, True), + # bert-base + (1, 128, 0, 12, 64, True), + (1, 384, 0, 12, 64, True), + (1, 512, 0, 12, 64, True), + (4, 128, 0, 12, 64, True), + (4, 384, 0, 12, 64, True), + (4, 512, 0, 12, 64, True), + # bert-large + (1, 128, 0, 16, 64, True), + (1, 384, 0, 16, 64, True), + (1, 512, 0, 16, 64, True), + (4, 128, 0, 16, 64, True), + (4, 384, 0, 16, 64, True), + (4, 512, 0, 16, 64, True), ] + return configs + + +def get_compute_capability(): + assert torch.cuda.is_available() + major, minor = torch.cuda.get_device_capability() + sm = major * 10 + minor + return sm - # List of environment variables to enable/disable attention kernels - print("Environment Variables:") - env_names = [ - 
"ORT_DISABLE_FLASH_ATTENTION", - "ORT_MIN_SEQ_LEN_FLASH_ATTENTION_PACKED_QKV", - "ORT_DISABLE_FUSED_ATTENTION", - "ORT_DISABLE_TRT_FLASH_ATTENTION", - "ORT_ENABLE_FUSED_CAUSAL_ATTENTION", - "ORT_DISABLE_FUSED_CROSS_ATTENTION", - "ORT_DISABLE_MEMORY_EFFICIENT_ATTENTION", - ] - for name in env_names: - value = os.getenv(name) - if value is not None: - print(f"{name}={value}") - print("\nformat\tcausal\tbatch\tseqlen\theads\th_dim\tms\tTFLOPS\tkernel") - causal = False +def run_tflops_test( + csv_writer: csv.DictWriter, + use_gpu: bool = True, + enable_cuda_graph: bool = False, + causal: bool = False, + has_past: bool = False, + intra_op_num_threads: int = 0, + repeats: int = 100, +): + print(f"run_tflops_test: causal={causal}") + + if use_gpu: + device_id = torch.cuda.current_device() + device = torch.device("cuda", device_id) + formats = [InputFormats.Q_K_V_BSNH_BSNH_BSNH, InputFormats.Q_KV_BSNH_BSN2H, InputFormats.QKV_BSN3H] + provider = "CUDAExecutionProvider" + # flash attention is available for sm >= 80 + sm = get_compute_capability() + if sm >= 80: + backends = [SdpaKernel.DEFAULT, SdpaKernel.FLASH_ATTENTION, SdpaKernel.EFFICIENT_ATTENTION] + else: + backends = [SdpaKernel.DEFAULT, SdpaKernel.EFFICIENT_ATTENTION] + else: + device_id = 0 + device = torch.device("cpu") + formats = [InputFormats.Q_K_V_BSNH_BSNH_BSNH] + enable_cuda_graph = False + provider = "CPUExecutionProvider" + backends = [SdpaKernel.DEFAULT] + + configs = get_test_configs(use_gpu) + + print("\nformat\tcausal\tprompt\tbatch\tseqlen\theads\th_dim\tthreads\tms\tTFLOPS\tkernel") for input_format in formats: for batch_size, sequence_length, past_sequence_length, num_heads, head_size, enable_unfused in configs: @@ -496,21 +614,27 @@ def run_tflops_test(use_gpu: bool = True, enable_cuda_graph: bool = False, repea share_past_present_buffer=False, input_format=input_format, ) - - session = create_session(config) + for attention_kernel in backends: + sess_options = SessionOptions() + sess_options.intra_op_num_threads = intra_op_num_threads + session = create_session(config, sess_options, attention_kernel=attention_kernel) if use_gpu: - kernel = get_gpu_kernel_name(config) + kernel = get_gpu_kernel_name(attention_kernel) else: - kernel = get_cpu_kernel_name() + kernel = get_cpu_kernel_name(config) - if kernel == "Unfused": + if "math" in kernel: # Skip large sequence length for Unfused kernel to avoid OOM. if not enable_unfused: + if config.verbose: + print(f"skip unfused kernel for {vars(config)}") continue # Unfused kernel does not support packed QKV or packed KV formats. 
if input_format not in [InputFormats.Q_K_V_BSNH_BSNH_BSNH]: + if config.verbose: + print(f"skip input_format for {vars(config)}") continue input_dict = config.random_inputs() @@ -526,19 +650,168 @@ def run_tflops_test(use_gpu: bool = True, enable_cuda_graph: bool = False, repea del session + format_str = InputFormats.input_format_str(input_format) + # compute TFLOPS per second - speed = tflops_per_second( - flops(batch_size, sequence_length, head_size, num_heads, causal), average_latency - ) + speed = None + if past_sequence_length == 0: + speed = tflops_per_second( + flops(batch_size, sequence_length, head_size, num_heads, causal), average_latency + ) + + row = { + "use_gpu": use_gpu, + "enable_cuda_graph": enable_cuda_graph, + "format": format_str, + "causal": causal, + "batch_size": batch_size, + "sequence_length": sequence_length, + "past_sequence_length": past_sequence_length, + "num_heads": num_heads, + "head_size": head_size, + "intra_op_num_threads": intra_op_num_threads, + "average_latency": average_latency, + "tflops": speed, + "kernel": kernel, + } + csv_writer.writerow(row) - format = InputFormats.input_format_str(input_format) + speed = f"{speed:.2f}" if speed is not None else "NA" print( - f"{format}\t{causal}\t{batch_size}\t{sequence_length}\t{num_heads}\t{head_size}\t{average_latency * 1000:.2f}\t{speed:.2f}\t{kernel}" + f"{format_str}\t{causal}\t{not has_past}\t{batch_size}\t{sequence_length}\t{num_heads}\t{head_size}\t" + f"{intra_op_num_threads}\t{average_latency * 1000:.2f}\t{speed}\t{kernel}" ) +def run_torch_test( + csv_writer: csv.DictWriter, + use_gpu: bool = True, + causal: bool = False, +): + configs = get_test_configs(use_gpu) + + if use_gpu: + if not torch.cuda.is_available(): + return + device_id = torch.cuda.current_device() + device = torch.device("cuda", device_id) + dtype = torch.float16 + backends = [ + None, + SDPBackend.FLASH_ATTENTION, + SDPBackend.EFFICIENT_ATTENTION, + SDPBackend.CUDNN_ATTENTION, + SDPBackend.MATH, + ] + else: + device = torch.device("cpu") + dtype = torch.float32 + backends = [None] + + backend_names = { + SDPBackend.FLASH_ATTENTION: "torch:flash", + SDPBackend.EFFICIENT_ATTENTION: "torch:efficient", + SDPBackend.CUDNN_ATTENTION: "torch:cudnn", + SDPBackend.MATH: "torch:math", + None: "torch:default", + } + + # Test PyTorch latency + for batch_size, sequence_length, past_sequence_length, num_heads, head_size, enable_unfused in configs: + for backend in backends: + if backend == SDPBackend.MATH and not enable_unfused: + continue + if backend == SDPBackend.FLASH_ATTENTION and platform.system() != "Linux": + continue + + backend_name = backend_names[backend] + try: + with torch.no_grad(): + torch_latency = run_torch_sdpa( + batch_size, + sequence_length, + sequence_length, + num_heads, + head_size, + causal, + has_mask=False, + mask_dim=2, + mask_dtype=torch.bool, + device=device, + dtype=dtype, + backend=backend, + ) + except RuntimeError: + continue + + speed = tflops_per_second(flops(batch_size, sequence_length, head_size, num_heads, causal), torch_latency) + input_format = "Q,K,V" + print( + f"{input_format}\t{causal}\t{batch_size}\t{sequence_length}\t{num_heads}\t{head_size}\t" + f"{0}\t{torch_latency * 1000:.2f}\t{speed:.2f}\t{backend_name}" + ) + row = { + "use_gpu": use_gpu, + "enable_cuda_graph": False, + "format": input_format, + "causal": causal, + "batch_size": batch_size, + "sequence_length": sequence_length, + "past_sequence_length": past_sequence_length, + "num_heads": num_heads, + "head_size": head_size, + 
"intra_op_num_threads": torch.get_num_threads(), + "average_latency": torch_latency, + "tflops": speed, + "kernel": backend_name, + } + csv_writer.writerow(row) + + +def run_tflops_tests(args): + features = "gpu" if args.use_gpu else "cpu" + if args.causal: + features += "_causal" + if args.has_past: + features += "_past" + csv_filename = "benchmark_mha_{}_{}_{}.csv".format( + features, + "torch" if args.torch else "ort", + datetime.now().strftime("%Y%m%d-%H%M%S"), + ) + with open(csv_filename, mode="a", newline="") as csv_file: + column_names = [ + "use_gpu", + "enable_cuda_graph", + "format", + "causal", + "batch_size", + "sequence_length", + "past_sequence_length", + "num_heads", + "head_size", + "intra_op_num_threads", + "average_latency", + "tflops", + "kernel", + ] + csv_writer = csv.DictWriter(csv_file, fieldnames=column_names) + csv_writer.writeheader() + + if args.torch: + run_torch_test(csv_writer, args.use_gpu, args.causal) + else: + run_tflops_test( + csv_writer, + use_gpu=args.use_gpu, + enable_cuda_graph=args.use_cuda_graph, + causal=args.causal, + has_past=args.has_past, + intra_op_num_threads=args.intra_op_num_threads, + ) + + def plot_prompt_performance( - sm: int, model_name: str, batch_size: int, num_heads: int, @@ -558,6 +831,7 @@ def plot_prompt_performance( "styles": [("red", "solid"), ("yellow", "dashdot"), ("blue", "dashed"), ("green", "dotted")][0 : len(formats)], } + sm = get_compute_capability() configs = [ triton.testing.Benchmark( x_names=["sequence_length"], @@ -591,13 +865,14 @@ def benchmark( sequence_length=sequence_length, num_heads=num_heads, head_size=head_size, - causal=True, + causal=False, past_sequence_length=0, kv_sequence_length=sequence_length if input_format == InputFormats.get_name_list()[-1] else None, max_cache_sequence_length=max_seq_len, provider="CUDAExecutionProvider", enable_cuda_graph=False, device=device, + dtype=torch.float16, use_kv_cache=False, input_format=InputFormats.convert(input_format), ) @@ -609,14 +884,14 @@ def benchmark( benchmark.run(save_path=".", print_data=True) -def run_performance_test(sm: int): +def run_bert_performance_test(): """ Run performance tests for prompt and token generation. """ configures = [ - (1, 32, 128, 8192, "TNLGv4"), - (4, 32, 128, 8192, "TNLGv4"), + # (1, 32, 128, 8192, "TNLGv4"), + # (4, 32, 128, 8192, "TNLGv4"), (1, 12, 64, 1024, "BertBase"), (16, 12, 64, 1024, "BertBase"), (1, 16, 64, 1024, "BertLarge"), @@ -625,7 +900,6 @@ def run_performance_test(sm: int): for batch_size, num_heads, head_size, max_seq_len, model_name in configures: plot_prompt_performance( - sm=sm, batch_size=batch_size, num_heads=num_heads, head_size=head_size, @@ -634,18 +908,84 @@ def run_performance_test(sm: int): ) +def _parse_arguments(): + parser = argparse.ArgumentParser(description="Benchmark MultiHeadAttention for ONNX Runtime and PyTorch.") + + parser.add_argument( + "--use_gpu", + required=False, + action="store_true", + help="Use GPU for inference.", + ) + parser.set_defaults(use_gpu=False) + + parser.add_argument( + "--use_cuda_graph", + required=False, + action="store_true", + help="Use cuda graph in onnxruntime.", + ) + parser.set_defaults(use_cuda_graph=False) + + parser.add_argument( + "--intra_op_num_threads", + required=False, + type=int, + choices=[0, 1, 2, 4, 8, 16], + default=0, + help="intra_op_num_threads for onnxruntime. 
", + ) + + parser.add_argument( + "--has_past", + required=False, + action="store_true", + help="whether past_sequence_length > 0", + ) + parser.set_defaults(has_past=False) + + parser.add_argument( + "--causal", + required=False, + action="store_true", + help="test unidirectional", + ) + parser.set_defaults(causal=False) + + parser.add_argument( + "--torch", + required=False, + action="store_true", + help="test pytorch instead of onnxruntime", + ) + parser.set_defaults(torch=False) + + args = parser.parse_args() + + return args + + if __name__ == "__main__": - if torch.cuda.is_available() and "CUDAExecutionProvider" in get_available_providers(): - # Test CUDA provider - major, minor = torch.cuda.get_device_capability() - sm = major * 10 + minor + args = _parse_arguments() + print(f"arguments:{args}") + + if args.has_past: + assert args.causal, "--has_past need --causal specified" + + if args.use_gpu: + assert args.torch or not args.causal, "no causal cuda kernel in MHA op" + assert torch.cuda.is_available() + if not args.torch: + assert "CUDAExecutionProvider" in get_available_providers() + if args.torch: + assert Version(torch.__version__) >= Version("2.3.0") + assert args.has_past is False + + if args.use_gpu and not args.torch: if platform.system() == "Linux": s = torch.cuda.Stream() with torch.cuda.stream(s), torch.no_grad(): - run_performance_test(sm) - - run_tflops_test(use_gpu=True, enable_cuda_graph=True) + run_bert_performance_test() - # Test CPU provider - run_tflops_test(use_gpu=False, enable_cuda_graph=False) + run_tflops_tests(args) diff --git a/onnxruntime/test/python/transformers/benchmark_mha.sh b/onnxruntime/test/python/transformers/benchmark_mha.sh index 7b21cf1cc1e08..613543d0172dd 100644 --- a/onnxruntime/test/python/transformers/benchmark_mha.sh +++ b/onnxruntime/test/python/transformers/benchmark_mha.sh @@ -1,14 +1,40 @@ -echo "flash attention v2" -ORT_DISABLE_FLASH_ATTENTION=0 ORT_MIN_SEQ_LEN_FLASH_ATTENTION_PACKED_QKV=0 python benchmark_mha.py | tee result.txt +#!/bin/sh -echo "===" -echo "TensorRT attention kernels - cross attention (when kv_seq_len <= 128) or fused attention (when seq_len <= 384) or flash attention (seq_len > 384)" -ORT_DISABLE_FLASH_ATTENTION=1 python benchmark_mha.py | tee -a result.txt +# ------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. 
+# -------------------------------------------------------------------------- -echo "===" -echo "Memory Efficient attention" -ORT_DISABLE_FLASH_ATTENTION=1 ORT_DISABLE_TRT_FLASH_ATTENTION=1 ORT_DISABLE_FUSED_ATTENTION=1 ORT_DISABLE_FUSED_CROSS_ATTENTION=1 python benchmark_mha.py | tee -a result.txt +echo "Benchmark Scaled Dot Product Attention (SDPA) performance on GPU:" -echo "===" -echo "Unfused Attention (some configurations might fail)" -ORT_DISABLE_FLASH_ATTENTION=1 ORT_DISABLE_TRT_FLASH_ATTENTION=1 ORT_DISABLE_FUSED_ATTENTION=1 ORT_DISABLE_FUSED_CROSS_ATTENTION=1 ORT_DISABLE_MEMORY_EFFICIENT_ATTENTION=1 python benchmark_mha.py | tee -a result.txt +export CUDA_VISIBLE_DEVICES=0 +python benchmark_mha.py --use_gpu +python benchmark_mha.py --use_gpu --use_cuda_graph +python benchmark_mha.py --use_gpu --torch + +cat benchmark_mha_gpu_*.csv > mha_gpu_benchmark_results.csv + +echo "Benchmark performance on CPU with number of threads:" +MKL_DYNAMIC=FALSE OMP_NUM_THREADS=1 python benchmark_mha.py --torch +MKL_DYNAMIC=FALSE OMP_NUM_THREADS=2 python benchmark_mha.py --torch +MKL_DYNAMIC=FALSE OMP_NUM_THREADS=4 python benchmark_mha.py --torch +MKL_DYNAMIC=FALSE OMP_NUM_THREADS=8 python benchmark_mha.py --torch + +python benchmark_mha.py --intra_op_num_threads 1 +python benchmark_mha.py --intra_op_num_threads 2 +python benchmark_mha.py --intra_op_num_threads 4 +python benchmark_mha.py --intra_op_num_threads 8 + + +echo "Benchmark performance on CPU with default threads settings:" +python benchmark_mha.py +ORT_DISABLE_FLASH_ATTENTION=1 python benchmark_mha.py +python benchmark_mha.py --torch + +python benchmark_mha.py --causal +python benchmark_mha.py --torch --causal + +# Pytorch SDPA does not support causal attention with past state, we only test ORT here. +python benchmark_mha.py --causal --has_past + +cat benchmark_mha_cpu_*.csv > mha_cpu_benchmark_results.csv diff --git a/onnxruntime/test/python/transformers/test_mha.py b/onnxruntime/test/python/transformers/test_mha.py index ff473cc2ced92..0fcbd889847e9 100644 --- a/onnxruntime/test/python/transformers/test_mha.py +++ b/onnxruntime/test/python/transformers/test_mha.py @@ -10,36 +10,15 @@ import concurrent.futures import itertools import unittest -from enum import IntEnum from typing import Dict, List, Optional import numpy import torch -from benchmark_mha import ( - InputFormats, - MultiHeadAttentionConfig, - OrtMultiHeadAttention, - create_multi_head_attention_onnx_model, -) +from benchmark_mha import InputFormats, MultiHeadAttentionConfig, OrtMultiHeadAttention, SdpaKernel, create_ort_session from einops import rearrange from parameterized import parameterized import onnxruntime -from onnxruntime import InferenceSession - - -class SdpaKernel(IntEnum): - """Bit flags for sdpa_kernel CUDA provider option""" - - DEFAULT = 0 - FLASH_ATTENTION = 1 - EFFICIENT_ATTENTION = 2 - TRT_FUSED_ATTENTION = 4 - CUDNN_FLASH_ATTENTION = 8 - MATH = 16 - TRT_FLASH_ATTENTION = 32 - TRT_CROSS_ATTENTION = 64 - TRT_CAUSAL_ATTENTION = 128 def attention_reference( @@ -466,7 +445,7 @@ def parity_check_mha_multi_threading( test_inputs: List[Dict], rtol: float = 1e-3, atol: float = 1e-3, - sdpa_kernel: int = SdpaKernel.DEFAULT, + attention_kernel: int = SdpaKernel.DEFAULT, max_threads: int = 5, verbose: bool = False, ): @@ -476,21 +455,14 @@ def parity_check_mha_multi_threading( if config.causal and config.provider == "CUDAExecutionProvider": return None # Some kernel does not support certain input format. 
- if sdpa_kernel not in [ + if attention_kernel not in [ SdpaKernel.DEFAULT, SdpaKernel.FLASH_ATTENTION, SdpaKernel.EFFICIENT_ATTENTION, ] and config.input_format in [InputFormats.Q_KV_BSNH_BSN2H]: return None - if verbose: - print(f"create a shared session with {vars(config)}") - onnx_model_str = create_multi_head_attention_onnx_model(config, use_symbolic_shape=True) - if config.provider == "CUDAExecutionProvider": - provider_options = {"arena_extend_strategy": "kSameAsRequested", "sdpa_kernel": int(sdpa_kernel)} - providers = [(config.provider, provider_options), "CPUExecutionProvider"] - else: - providers = ["CPUExecutionProvider"] - ort_session = InferenceSession(onnx_model_str, providers=providers) + + ort_session = create_ort_session(config, attention_kernel=attention_kernel, use_symbolic_shape=True) def convert_to_ort_inputs(feed_dict): ort_inputs = {} @@ -613,7 +585,7 @@ def test_mha_cuda(self, config): def test_mha_cpu(self, config): parity_check_mha(config) - def run_mha_cuda_multi_threading(self, spda_kernel): + def run_mha_cuda_multi_threading(self, attention_kernel): for configs in multi_thread_test_cases("CUDAExecutionProvider", comprehensive_mode): test_inputs = [] for config in configs: @@ -626,8 +598,10 @@ def run_mha_cuda_multi_threading(self, spda_kernel): config.input_format = old_format test_inputs.append({"config": config, "ort_inputs": ort_inputs, "ref_inputs": ref_inputs}) - exception = parity_check_mha_multi_threading(test_inputs, sdpa_kernel=spda_kernel, max_threads=len(configs)) - assert exception is None, f"{spda_kernel=}, {vars(configs[0])}, {exception}" + exception = parity_check_mha_multi_threading( + test_inputs, attention_kernel=attention_kernel, max_threads=len(configs) + ) + assert exception is None, f"{attention_kernel=}, {vars(configs[0])}, {exception}" def test_mha_cuda_multi_threading(self): self.run_mha_cuda_multi_threading(SdpaKernel.DEFAULT) From 5af423c7c0561d3861a6b8ed5598abef02715e28 Mon Sep 17 00:00:00 2001 From: Scott McKay Date: Sat, 27 Jul 2024 13:22:57 +1000 Subject: [PATCH 32/57] Set version and other info in the C# dll (#21517) ### Description Set version and other info in the Microsoft.ML.OnnxRuntime C# dll by setting GenerateAssemblyInfo to true and passing in ORT version in the CI. Minor re-org of the order of properties so related things are grouped a little better. ### Motivation and Context #21475 --- .../Microsoft.ML.OnnxRuntime.csproj | 67 +++++++++++-------- .../azure-pipelines/templates/c-api-cpu.yml | 4 +- 2 files changed, 42 insertions(+), 29 deletions(-) diff --git a/csharp/src/Microsoft.ML.OnnxRuntime/Microsoft.ML.OnnxRuntime.csproj b/csharp/src/Microsoft.ML.OnnxRuntime/Microsoft.ML.OnnxRuntime.csproj index 3c8a49bf93578..deb6b4f884bcf 100644 --- a/csharp/src/Microsoft.ML.OnnxRuntime/Microsoft.ML.OnnxRuntime.csproj +++ b/csharp/src/Microsoft.ML.OnnxRuntime/Microsoft.ML.OnnxRuntime.csproj @@ -23,7 +23,7 @@ + '$(IncludeMobileTargets)' == 'true'"> net8.0-android @@ -31,6 +31,43 @@ $(BaseTargets);$(MobileTargets) + + Microsoft.ML.OnnxRuntime + Microsoft.ML.OnnxRuntime + + + + 1.0.0 + 0.0.0 + + + + true + Microsoft.ML.OnnxRuntime C# Bindings + Microsoft + © Microsoft Corporation. All rights reserved. 
+ This package contains ONNX Runtime for .Net platforms + + + $(PackageVersion) + + + + + Microsoft + Microsoft.ML.OnnxRuntime.Managed + ONNX;ONNX Runtime;Machine Learning + https://github.com/Microsoft/onnxruntime + LICENSE.txt + ORT_icon_for_light_bg.png + + Release Def: + Branch: $(BUILD_SOURCEBRANCH) + Commit: $(BUILD_SOURCEVERSION) + Build: https://aiinfra.visualstudio.com/Lotus/_build/results?buildId=$(BUILD_BUILDID) + + + AnyCPU;x86 default @@ -43,8 +80,6 @@ $(OnnxRuntimeRoot)\csharp x64 - Microsoft.ML.OnnxRuntime - Microsoft.ML.OnnxRuntime false false portable @@ -54,27 +89,8 @@ on their device is not built for training, an exception will be thrown with the following message - "Training is disabled in the current build. Please build onnxruntime from source with the build flags enable_training_apis. "--> - true + true - - - Microsoft.ML.OnnxRuntime.Managed - Microsoft - 1.0.0 - 0.0.0 - $(PackageVersion) - This package contains ONNX Runtime for .Net platforms - ONNX;ONNX Runtime;Machine Learning - https://github.com/Microsoft/onnxruntime - © Microsoft Corporation. All rights reserved. - LICENSE.txt - ORT_icon_for_light_bg.png - - Release Def: - Branch: $(BUILD_SOURCEBRANCH) - Commit: $(BUILD_SOURCEVERSION) - Build: https://aiinfra.visualstudio.com/Lotus/_build/results?buildId=$(BUILD_BUILDID) - true @@ -82,7 +98,6 @@ false - false $(AllowedOutputExtensionsInPackageBuildOutputFolder);.pdb Debug;Release;RelWithDebInfo @@ -158,10 +173,6 @@ $(OrtConstants);__ENABLE_COREML__ - - $(OrtConstants);__XAMARIN__ - - $(DefineConstants);$(OrtConstants) diff --git a/tools/ci_build/github/azure-pipelines/templates/c-api-cpu.yml b/tools/ci_build/github/azure-pipelines/templates/c-api-cpu.yml index 7ba1179e7ad4d..ec97da3786fd9 100644 --- a/tools/ci_build/github/azure-pipelines/templates/c-api-cpu.yml +++ b/tools/ci_build/github/azure-pipelines/templates/c-api-cpu.yml @@ -364,6 +364,8 @@ stages: workingDirectory: '$(Build.BinariesDirectory)/nuget-artifact' displayName: 'List artifacts' + - template: set-version-number-variables-step.yml + # Reconstruct the build dir - task: PowerShell@2 displayName: 'Extract native libraries for addition to nuget native package' @@ -403,7 +405,7 @@ stages: solution: '$(Build.SourcesDirectory)\csharp\OnnxRuntime.CSharp.sln' platform: 'Any CPU' configuration: RelWithDebInfo - msbuildArguments: '-p:OnnxRuntimeBuildDirectory="$(Build.BinariesDirectory)" -p:OrtPackageId=$(OrtPackageId) -p:IsReleaseBuild=${{ parameters.IsReleaseBuild }} -p:ReleaseVersionSuffix=$(ReleaseVersionSuffix)' + msbuildArguments: '-p:OnnxRuntimeBuildDirectory="$(Build.BinariesDirectory)" -p:OrtPackageId=$(OrtPackageId) -p:IsReleaseBuild=${{ parameters.IsReleaseBuild }} -p:ReleaseVersionSuffix=$(ReleaseVersionSuffix) -p:PackageVersion=$(OnnxRuntimeVersion)' workingDirectory: '$(Build.SourcesDirectory)\csharp' - ${{ if eq(parameters.DoEsrp, true) }}: From 690d745cbff6f540f95e668be21da76873689a32 Mon Sep 17 00:00:00 2001 From: zz002 Date: Sat, 27 Jul 2024 11:28:55 +0800 Subject: [PATCH 33/57] [VitisAI] 1. KernelDef supports StartVersion and EndVersion (#21519) ### Description [VitisAI] 1. KernelDef supports StartVersion and EndVersion 2. 
CapabilityOps checks domain ### Motivation and Context Co-authored-by: Zhenze Wang --- onnxruntime/core/providers/vitisai/imp/capability.cc | 6 +++++- onnxruntime/core/providers/vitisai/imp/global_api.cc | 4 ++-- .../core/providers/vitisai/vitisai_execution_provider.cc | 2 +- 3 files changed, 8 insertions(+), 4 deletions(-) diff --git a/onnxruntime/core/providers/vitisai/imp/capability.cc b/onnxruntime/core/providers/vitisai/imp/capability.cc index 58522a45a151e..6d188076fe613 100644 --- a/onnxruntime/core/providers/vitisai/imp/capability.cc +++ b/onnxruntime/core/providers/vitisai/imp/capability.cc @@ -51,7 +51,11 @@ GetComputeCapabilityOps(const onnxruntime::GraphViewer& graph, std::vector node_indexs = graph.GetNodesInTopologicalOrder(); node_indexs.erase(std::remove_if(node_indexs.begin(), node_indexs.end(), [&](NodeIndex index) { return all_nodes_included_eps.count(index) > 0; }), node_indexs.end()); - node_indexs.erase(std::remove_if(node_indexs.begin(), node_indexs.end(), [&](NodeIndex index) { return all_support_optypes_by_eps.count(graph.GetNode(index)->OpType()) == 0; }), node_indexs.end()); + node_indexs.erase(std::remove_if(node_indexs.begin(), node_indexs.end(), + [&](NodeIndex index) { + auto node = graph.GetNode(index); + return all_support_optypes_by_eps.count(node->Domain() + ":" + node->OpType()) == 0; }), + node_indexs.end()); std::vector> result; for (auto& n : node_indexs) { diff --git a/onnxruntime/core/providers/vitisai/imp/global_api.cc b/onnxruntime/core/providers/vitisai/imp/global_api.cc index 8c1dce0d3dc1a..a86a4fb61d54d 100644 --- a/onnxruntime/core/providers/vitisai/imp/global_api.cc +++ b/onnxruntime/core/providers/vitisai/imp/global_api.cc @@ -173,7 +173,7 @@ void create_kernel_registry(std::vector domains) { auto def_builder = KernelDefBuilder::Create(); def_builder->SetName(op->GetName(op)); def_builder->SetDomain(domain->domain_.c_str()); - def_builder->SinceVersion(1); + def_builder->SinceVersion(op->GetStartVersion(op), op->GetEndVersion(op)); if (op->version > 12) { auto input_count = op->GetInputTypeCount(op); for (auto i = 0u; i < input_count; i++) { @@ -183,7 +183,7 @@ void create_kernel_registry(std::vector domains) { def_builder->Provider(onnxruntime::kVitisAIExecutionProvider); KernelCreateFn kernel_create_fn = [op](FuncManager&, const OpKernelInfo& info, std::unique_ptr& out) -> Status { - // out = std::make_unique(info, *op); + out = std::make_unique(info, *op); return Status::OK(); }; std::ignore = s_kernel_registry_vitisaiep->Register(KernelCreateInfo(def_builder->Build(), kernel_create_fn)); diff --git a/onnxruntime/core/providers/vitisai/vitisai_execution_provider.cc b/onnxruntime/core/providers/vitisai/vitisai_execution_provider.cc index 0f0972d96bcee..58fef537535d2 100644 --- a/onnxruntime/core/providers/vitisai/vitisai_execution_provider.cc +++ b/onnxruntime/core/providers/vitisai/vitisai_execution_provider.cc @@ -44,7 +44,7 @@ VitisAIExecutionProvider::VitisAIExecutionProvider( void VitisAIExecutionProvider::CreateKernelRegistry() { for (const auto& domain : get_domains_vitisaiep()) { for (const auto* op : domain->custom_ops_) { - vitisai_optypes_.insert(op->GetName(op)); + vitisai_optypes_.insert(domain->domain_ + ":" + op->GetName(op)); } } } From d01fc75ef161a624c4275f89cb068cc1c79d9392 Mon Sep 17 00:00:00 2001 From: Yueqing Zhang Date: Fri, 26 Jul 2024 22:15:57 -0700 Subject: [PATCH 34/57] [VitisAI] support vaip create ep context nodes & bug fix (#21506) ### Description 1. 
We decided to move the context node creation back to our own repo because it is more flexible to modify. 2. We found a bug related the context node. It would change the inference order. So, we fixed in this PR as well. ### Motivation and Context This is crucial for Microsoft Release next month. --------- Co-authored-by: Yueqing Zhang --- .../shared_library/provider_interfaces.h | 1 + .../shared_library/provider_wrappedtypes.h | 1 + .../core/providers/vitisai/imp/global_api.cc | 50 +++++++++++++++++++ .../vitisai/include/vaip/custom_op.h | 11 ++++ .../vitisai/include/vaip/global_api.h | 6 ++- .../vitisai/include/vaip/vaip_ort_api.h | 11 ++-- .../vitisai/vitisai_execution_provider.cc | 14 ++++-- .../core/session/provider_bridge_ort.cc | 1 + 8 files changed, 88 insertions(+), 7 deletions(-) diff --git a/onnxruntime/core/providers/shared_library/provider_interfaces.h b/onnxruntime/core/providers/shared_library/provider_interfaces.h index 382b3ac932520..a9394838aa784 100644 --- a/onnxruntime/core/providers/shared_library/provider_interfaces.h +++ b/onnxruntime/core/providers/shared_library/provider_interfaces.h @@ -388,6 +388,7 @@ struct ProviderHost { virtual ONNX_NAMESPACE::TensorProto* AttributeProto__add_tensors(ONNX_NAMESPACE::AttributeProto* p) = 0; // GraphProto + virtual std::unique_ptr GraphProto__construct() = 0; virtual void GraphProto__operator_delete(ONNX_NAMESPACE::GraphProto* p) = 0; virtual void GraphProto__operator_assign(ONNX_NAMESPACE::GraphProto* p, const ONNX_NAMESPACE::GraphProto& v) = 0; diff --git a/onnxruntime/core/providers/shared_library/provider_wrappedtypes.h b/onnxruntime/core/providers/shared_library/provider_wrappedtypes.h index de6c1da1d6430..242c7126f3274 100644 --- a/onnxruntime/core/providers/shared_library/provider_wrappedtypes.h +++ b/onnxruntime/core/providers/shared_library/provider_wrappedtypes.h @@ -146,6 +146,7 @@ struct AttributeProto final { }; struct GraphProto final { + static std::unique_ptr Create() { return g_host->GraphProto__construct(); } static void operator delete(void* p) { g_host->GraphProto__operator_delete(reinterpret_cast(p)); } void operator=(const GraphProto& v) { return g_host->GraphProto__operator_assign(this, v); } diff --git a/onnxruntime/core/providers/vitisai/imp/global_api.cc b/onnxruntime/core/providers/vitisai/imp/global_api.cc index a86a4fb61d54d..df47fa5cee4ab 100644 --- a/onnxruntime/core/providers/vitisai/imp/global_api.cc +++ b/onnxruntime/core/providers/vitisai/imp/global_api.cc @@ -55,10 +55,15 @@ struct OrtVitisAIEpAPI { uint32_t (*vaip_get_version)(); void (*get_backend_compilation_cache)(const std::string& model_path, const onnxruntime::Graph& graph, const char* json_config, uint8_t compiler_codes, std::string& cache_dir, std::string& cache_key, std::string& cache_data); void (*restore_backend_compilation_cache)(const std::string& cache_dir, const std::string& cache_key, const std::string& cache_data, const std::string& model_path); + void (*create_ep_context_nodes)( + onnxruntime::Graph& ep_context_graph, + const std::vector>& eps, + vaip_core::DllSafe>* ret_value) = nullptr; void Ensure() { if (handle_) return; auto& env = Provider_GetHost()->Env__Default(); + auto& logger = *Provider_GetHost()->LoggingManager_GetDefaultLogger(); #ifdef _WIN32 // this dll is already linked to the executable, normally a test program handle_ = reinterpret_cast(GetModuleHandle(TEXT("onnxruntime_vitisai_ep.dll"))); @@ -81,6 +86,10 @@ struct OrtVitisAIEpAPI { (void**)&vaip_get_version); 
ORT_THROW_IF_ERROR(env.GetSymbolFromLibrary(handle_, "get_compilation_cache", (void**)&get_backend_compilation_cache)); ORT_THROW_IF_ERROR(env.GetSymbolFromLibrary(handle_, "restore_compilation_cache", (void**)&restore_backend_compilation_cache)); + status1 = (env.GetSymbolFromLibrary(handle_, "create_ep_context_nodes", (void**)&create_ep_context_nodes)); + if (!status1.IsOK()) { + LOGS(logger, WARNING) << "create_ep_context_nodes is not defined, please upgrade onnxruntime_vitisai_ep.dll. However, it still works."; + } } private: @@ -146,6 +155,24 @@ void restore_backend_compilation_cache(const std::string& cache_dir, const std:: s_library_vitisaiep.restore_backend_compilation_cache(cache_dir, cache_key, cache_data, model_path); } +bool has_create_ep_context_nodes() { + return s_library_vitisaiep.create_ep_context_nodes != nullptr; +} + +std::optional> create_ep_context_nodes( + onnxruntime::Graph& ep_context_graph, + const std::vector>& eps) { + if (s_library_vitisaiep.create_ep_context_nodes) { + vaip_core::DllSafe> nodes; + s_library_vitisaiep.create_ep_context_nodes(ep_context_graph, eps, &nodes); + if (nodes.get()) { + auto ret = std::vector(*nodes); + return ret; + } + } + return std::nullopt; +} + struct MyCustomOpKernel : OpKernel { MyCustomOpKernel(const OpKernelInfo& info, const OrtCustomOp& op) : OpKernel(info), op_(op) { op_kernel_ = @@ -405,6 +432,29 @@ vaip_core::OrtApiForVaip* create_org_api_hook() { graph.AddInitializedTensor(tensor); }; + the_global_api.get_model_path = [](const Graph& graph) -> const std::filesystem::path& { + return graph.ModelPath(); + }; + + the_global_api.create_empty_model = [](const std::filesystem::path& path, const std::vector>& opset) -> Model* { + auto model_proto = ONNX_NAMESPACE::ModelProto::Create(); + auto graph_proto = ONNX_NAMESPACE::GraphProto::Create(); + model_proto->set_ir_version(ONNX_NAMESPACE::Version::IR_VERSION); + for (const auto& op : opset) { + auto* opset_import = model_proto->add_opset_import(); + *(opset_import->mutable_domain()) = op.first; + opset_import->set_version(op.second); + } + std::ignore = model_proto->mutable_graph(); // create a graph + auto& logger = logging::LoggingManager::DefaultLogger(); + auto model = Model::Create(std::move(*model_proto), path, nullptr, logger); + return model.release(); + }; + + the_global_api.graph_set_inputs = [](Graph& graph, gsl::span inputs) { + graph.SetInputs(inputs); + }; + if (!s_library_vitisaiep.vaip_get_version) { return reinterpret_cast(&(the_global_api.host_)); } else { diff --git a/onnxruntime/core/providers/vitisai/include/vaip/custom_op.h b/onnxruntime/core/providers/vitisai/include/vaip/custom_op.h index d34f7095b704d..5d020e00ff5b7 100644 --- a/onnxruntime/core/providers/vitisai/include/vaip/custom_op.h +++ b/onnxruntime/core/providers/vitisai/include/vaip/custom_op.h @@ -26,6 +26,17 @@ class ExecutionProvider { virtual DllSafe> get_meta_def_constant_initializer() const = 0; virtual std::unique_ptr compile() const = 0; + + public: + inline void set_fused_node(const onnxruntime::Node* fused_node) { + fused_node_ = fused_node; + } + inline const onnxruntime::Node* get_fused_node() const { + return fused_node_; + } + + private: + const onnxruntime::Node* fused_node_ = nullptr; }; class CustomOp { diff --git a/onnxruntime/core/providers/vitisai/include/vaip/global_api.h b/onnxruntime/core/providers/vitisai/include/vaip/global_api.h index 3fdbc60bb0ee6..ae2a513a98e32 100644 --- a/onnxruntime/core/providers/vitisai/include/vaip/global_api.h +++ 
b/onnxruntime/core/providers/vitisai/include/vaip/global_api.h @@ -9,10 +9,14 @@ #include "vaip/my_ort.h" #include "vaip/dll_safe.h" #include "vaip/custom_op.h" - +#include void initialize_vitisai_ep(); vaip_core::DllSafe>> compile_onnx_model(const onnxruntime::GraphViewer& graph_viewer, const onnxruntime::logging::Logger& logger, const onnxruntime::ProviderOptions& options); std::shared_ptr get_kernel_registry_vitisaiep(); const std::vector& get_domains_vitisaiep(); void get_backend_compilation_cache(const onnxruntime::PathString& model_path_str, const onnxruntime::GraphViewer& graph_viewer, const onnxruntime::ProviderOptions& options, uint8_t compiler_codes, std::string& cache_dir, std::string& cache_key, std::string& cache_data); void restore_backend_compilation_cache(const std::string& cache_dir, const std::string& cache_key, const std::string& cache_data, const std::string& model_path); +std::optional> create_ep_context_nodes( + onnxruntime::Graph& ep_context_graph, + const std::vector>& eps); +bool has_create_ep_context_nodes(); diff --git a/onnxruntime/core/providers/vitisai/include/vaip/vaip_ort_api.h b/onnxruntime/core/providers/vitisai/include/vaip/vaip_ort_api.h index 3346739890484..e6aacfe1f0272 100644 --- a/onnxruntime/core/providers/vitisai/include/vaip/vaip_ort_api.h +++ b/onnxruntime/core/providers/vitisai/include/vaip/vaip_ort_api.h @@ -8,12 +8,13 @@ #include #include #include +#include struct OrtApi; namespace vaip_core { -#define VAIP_ORT_API_MAJOR (3u) -#define VAIP_ORT_API_MINOR (1u) +#define VAIP_ORT_API_MAJOR (4u) +#define VAIP_ORT_API_MINOR (0u) #define VAIP_ORT_API_PATCH (0u) struct OrtApiForVaip { uint32_t magic; // 'VAIP' or something else to make sure the following field @@ -222,7 +223,11 @@ struct OrtApiForVaip { const std::vector& data); // [88] TensorProto* (*tensor_proto_new_bf16)( const std::string& name, const std::vector& shape, - const std::vector& data); // [89] + const std::vector& data); // [89] + const std::filesystem::path& (*get_model_path)(const Graph& graph); // [90] + Model* (*create_empty_model)(const std::filesystem::path& path, const std::vector>& opset); //[91] + void (*graph_set_inputs)(Graph& graph, + gsl::span inputs); // [92] }; #ifndef USE_VITISAI diff --git a/onnxruntime/core/providers/vitisai/vitisai_execution_provider.cc b/onnxruntime/core/providers/vitisai/vitisai_execution_provider.cc index 58fef537535d2..756bda2199e89 100644 --- a/onnxruntime/core/providers/vitisai/vitisai_execution_provider.cc +++ b/onnxruntime/core/providers/vitisai/vitisai_execution_provider.cc @@ -58,8 +58,15 @@ const InlinedVector VitisAIExecutionProvider::GetEpContextNodes() c // All preconditions are supposed to have happened. 
if (p_ep_ctx_model_) { auto& graph = p_ep_ctx_model_->MainGraph(); - for (const auto* p_node : graph.Nodes()) { - ep_context_node_ptrs.push_back(p_node); + if (has_create_ep_context_nodes()) { + auto nodes = create_ep_context_nodes(graph, **execution_providers_); + if (nodes.has_value()) { + ep_context_node_ptrs.assign(nodes->begin(), nodes->end()); + } + } else { + for (const auto* p_node : graph.Nodes()) { + ep_context_node_ptrs.push_back(p_node); + } } } return ep_context_node_ptrs; @@ -187,6 +194,7 @@ common::Status VitisAIExecutionProvider::Compile(const std::vectorexecution_providers_)[index]->set_fused_node(&fused_node_graph.fused_node.get()); compute_info.create_state_func = [this, index](ComputeContext* context, FunctionState* state) { auto* p = (**this->execution_providers_)[index]->compile().release(); *state = p; @@ -204,7 +212,7 @@ common::Status VitisAIExecutionProvider::Compile(const std::vectoradd_tensors(); } // GraphProto (wrapped) + std::unique_ptr GraphProto__construct() override { return std::make_unique(); } void GraphProto__operator_delete(ONNX_NAMESPACE::GraphProto* p) override { delete p; } const ONNX_NAMESPACE::ValueInfoProto& GraphProto__input(const ONNX_NAMESPACE::GraphProto* p, int index) override { return p->input(index); } From 10b4a3b90bd61fcda8aefecf2a1dce1a45c086e1 Mon Sep 17 00:00:00 2001 From: maggie1059 <34173352+maggie1059@users.noreply.github.com> Date: Fri, 26 Jul 2024 22:26:38 -0700 Subject: [PATCH 35/57] Fix conda failure for onnxruntime-directml (#21526) The change in #21005 works for directly building wheels with `build.py`, but ort-nightly-directml wheels, as well as the 1.18.1 release of the onnxruntime-directml python wheel, still do not work with conda since they're built from the `py-win-gpu.yml` pipeline, which uses `install_third_party_deps.ps1` to set compile flags. --- tools/ci_build/github/windows/install_third_party_deps.ps1 | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/ci_build/github/windows/install_third_party_deps.ps1 b/tools/ci_build/github/windows/install_third_party_deps.ps1 index 07679006fb343..168df90188791 100644 --- a/tools/ci_build/github/windows/install_third_party_deps.ps1 +++ b/tools/ci_build/github/windows/install_third_party_deps.ps1 @@ -27,7 +27,7 @@ $Env:CMAKE_PREFIX_PATH = "$install_prefix" New-Item -Path "$install_prefix" -ItemType Directory -Force # Setup compile flags -$compile_flags = @('/MP', '/guard:cf', '/DWIN32', '/D_WINDOWS', '/DWINVER=0x0A00', '/D_WIN32_WINNT=0x0A00', '/DNTDDI_VERSION=0x0A000000', '/W3') +$compile_flags = @('/MP', '/guard:cf', '/DWIN32', '/D_WINDOWS', '/D_DISABLE_CONSTEXPR_MUTEX_CONSTRUCTOR', '/DWINVER=0x0A00', '/D_WIN32_WINNT=0x0A00', '/DNTDDI_VERSION=0x0A000000', '/W3') $linker_flags=@('/guard:cf') if ($use_cache) { From 1ce160883f964509a547458c484d2449bda047ae Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Fri, 26 Jul 2024 22:31:16 -0700 Subject: [PATCH 36/57] Bump Sixlabors.ImageSharp from 2.1.8 to 2.1.9 in /csharp/sample/Microsoft.ML.OnnxRuntime.ResNet50v2Sample (#21444) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Bumps [Sixlabors.ImageSharp](https://github.com/SixLabors/ImageSharp) from 2.1.8 to 2.1.9.
Release notes

Sourced from Sixlabors.ImageSharp's releases.

v2.1.9

What's Changed

Full Changelog: https://github.com/SixLabors/ImageSharp/compare/v2.1.8...v2.1.9

Commits
  • 9816ca4 Merge pull request #2770 from SixLabors/af/backport-2759-2.1.x
  • b33d666 handle DecodingMode
  • 6b2030b Merge branch 'release/2.1.x' into af/backport-2759-2.1.x
  • 8ffad3f Issue2012BadMinCode should decode now
  • 1f5bf23 skip Issue2758_DecodeWorks
  • 3bf8c57 manual port of 3.1 gif decoder
  • 28c20de Clamp JPEG quality estimation results.
  • 4b910e7 Decode LZW row by row
  • a1f2879 Merge pull request #2756 from SixLabors/af/git-av-2.1
  • 898df7f backport #2749 to 2.1
  • Additional commits viewable in compare view

[![Dependabot compatibility score](https://dependabot-badges.githubapp.com/badges/compatibility_score?dependency-name=Sixlabors.ImageSharp&package-manager=nuget&previous-version=2.1.8&new-version=2.1.9)](https://docs.github.com/en/github/managing-security-vulnerabilities/about-dependabot-security-updates#about-compatibility-scores) Dependabot will resolve any conflicts with this PR as long as you don't alter it yourself. You can also trigger a rebase manually by commenting `@dependabot rebase`.
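The underlying change is a one-line version bump of the ImageSharp PackageReference in the sample project's csproj; a minimal sketch of what that edit looks like (attribute layout assumed, not copied from the diff below):

```xml
<!-- Sketch of the change in Microsoft.ML.OnnxRuntime.ResNet50v2Sample.csproj (attribute layout assumed) -->
<ItemGroup>
  <PackageReference Include="Sixlabors.ImageSharp" Version="2.1.9" />  <!-- was Version="2.1.8" -->
</ItemGroup>
```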
Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- .../Microsoft.ML.OnnxRuntime.ResNet50v2Sample.csproj | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/csharp/sample/Microsoft.ML.OnnxRuntime.ResNet50v2Sample/Microsoft.ML.OnnxRuntime.ResNet50v2Sample.csproj b/csharp/sample/Microsoft.ML.OnnxRuntime.ResNet50v2Sample/Microsoft.ML.OnnxRuntime.ResNet50v2Sample.csproj index 647c0bbe6a242..29fc9f3bc382f 100644 --- a/csharp/sample/Microsoft.ML.OnnxRuntime.ResNet50v2Sample/Microsoft.ML.OnnxRuntime.ResNet50v2Sample.csproj +++ b/csharp/sample/Microsoft.ML.OnnxRuntime.ResNet50v2Sample/Microsoft.ML.OnnxRuntime.ResNet50v2Sample.csproj @@ -8,7 +8,7 @@ - + From 48fb8a7e56a7263a8405dc644756eb5c55560352 Mon Sep 17 00:00:00 2001 From: jingyanwangms <47403504+jingyanwangms@users.noreply.github.com> Date: Sat, 27 Jul 2024 11:10:52 -0700 Subject: [PATCH 37/57] Security fuzz address sanitizer fix Bug #2 and #3 (#21528) ### Description Security fuzz test with address sanitizer found several bugs --- onnxruntime/contrib_ops/cpu/transformers/subgraph_gpt.cc | 2 ++ onnxruntime/core/optimizer/attention_fusion.cc | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/onnxruntime/contrib_ops/cpu/transformers/subgraph_gpt.cc b/onnxruntime/contrib_ops/cpu/transformers/subgraph_gpt.cc index 34a1da99316a2..030cdb1e1b17f 100644 --- a/onnxruntime/contrib_ops/cpu/transformers/subgraph_gpt.cc +++ b/onnxruntime/contrib_ops/cpu/transformers/subgraph_gpt.cc @@ -143,6 +143,8 @@ Status GptSubgraph::Validate(const std::vector& subgraph_inputs, // Past state shape is like (2, batch_size, num_heads, past_seq_len, hidden_size/num_heads). const ONNX_NAMESPACE::TensorShapeProto* past_shape = subgraph_inputs[3]->Shape(); + ORT_RETURN_IF(past_shape == nullptr, + "subgraph past state cannot be nullptr"); ORT_RETURN_IF(past_shape->dim_size() != 5, "subgraph past state is expected to have 5 dimension, got ", past_shape->dim_size()); diff --git a/onnxruntime/core/optimizer/attention_fusion.cc b/onnxruntime/core/optimizer/attention_fusion.cc index 08066f030a381..64a38214caff0 100644 --- a/onnxruntime/core/optimizer/attention_fusion.cc +++ b/onnxruntime/core/optimizer/attention_fusion.cc @@ -210,7 +210,7 @@ Status AttentionFusion::ApplyImpl(Graph& graph, bool& modified, int graph_level, if ((node.GetOutputEdgesCount() >= 2 && node.GetOutputEdgesCount() <= 6) && // Add node.GetOutputEdgesCount() == 5/6 for distilbert graph_utils::IsSupportedOptypeVersionAndDomain(node, "LayerNormalization", {1, 17}, kOnnxDomain) && - graph_utils::IsSupportedProvider(node, GetCompatibleExecutionProviders())) { + graph_utils::IsSupportedProvider(node, GetCompatibleExecutionProviders()) && node.InputDefs().size() > 2) { // Get hidden size from layer norm bias tensor shape. const NodeArg& layer_norm_bias = *(node.InputDefs()[2]); if (!optimizer_utils::IsShapeKnownOnAllDims(layer_norm_bias, 1)) { From 82b2955268e14f26eb71ad2d660452ab8db454d7 Mon Sep 17 00:00:00 2001 From: Ranjit Ranjan <165394499+ranjitshs@users.noreply.github.com> Date: Sat, 27 Jul 2024 23:47:22 +0530 Subject: [PATCH 38/57] [AIX]test failure fix using gtest-1.15.0 for AIX (#21497) ### Description Local CI setup for AIX reported tests failure after the gtest 1.15.0 upgrade. ### Motivation and Context Below tests failure is observed after gtest upgrade. 
The following tests FAILED: 1 - onnxruntime_test_all (ILLEGAL) 7 - onnxruntime_logging_apis_test (Subprocess aborted) To fix this, I am enabling pthread support under gtest. This was disabled with previous version of gtest for some reason. Now by enabling this, above tests are getting passed with gtest 1.15.0. --- cmake/external/onnxruntime_external_deps.cmake | 3 --- 1 file changed, 3 deletions(-) diff --git a/cmake/external/onnxruntime_external_deps.cmake b/cmake/external/onnxruntime_external_deps.cmake index 14e6ed515fd6e..775576a771529 100644 --- a/cmake/external/onnxruntime_external_deps.cmake +++ b/cmake/external/onnxruntime_external_deps.cmake @@ -46,9 +46,6 @@ if (onnxruntime_BUILD_UNIT_TESTS) if (CMAKE_SYSTEM_NAME STREQUAL "Emscripten") set(gtest_disable_pthreads ON) endif() - if (${CMAKE_SYSTEM_NAME} MATCHES "AIX") - set(gtest_disable_pthreads ON CACHE BOOL "gtest_disable_pthreads" FORCE) - endif() set(INSTALL_GTEST OFF CACHE BOOL "" FORCE) if (IOS OR ANDROID) # on mobile platforms the absl flags class dumps the flag names (assumably for binary size), which breaks passing From 7e23212de9746ed2452061958f8aae3ffc171cee Mon Sep 17 00:00:00 2001 From: Jian Chen Date: Sat, 27 Jul 2024 15:58:12 -0700 Subject: [PATCH 39/57] Delete tools/ci_build/github/azure-pipelines/win-gpu-ci-pipeline.yml (#21529) ### Description Delete tools/ci_build/github/azure-pipelines/win-gpu-ci-pipeline.yml ### Motivation and Context This CI pipeline has been divided into 4 different pipeline. --- .../azure-pipelines/win-gpu-ci-pipeline.yml | 125 ------------------ 1 file changed, 125 deletions(-) delete mode 100644 tools/ci_build/github/azure-pipelines/win-gpu-ci-pipeline.yml diff --git a/tools/ci_build/github/azure-pipelines/win-gpu-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/win-gpu-ci-pipeline.yml deleted file mode 100644 index c5262880c4c55..0000000000000 --- a/tools/ci_build/github/azure-pipelines/win-gpu-ci-pipeline.yml +++ /dev/null @@ -1,125 +0,0 @@ -##### start trigger Don't edit it manually, Please do edit set-trigger-rules.py #### -trigger: - branches: - include: - - main - - rel-* - paths: - exclude: - - docs/** - - README.md - - CONTRIBUTING.md - - BUILD.md - - 'js/web' - - 'onnxruntime/core/providers/js' -pr: - branches: - include: - - main - - rel-* - paths: - exclude: - - docs/** - - README.md - - CONTRIBUTING.md - - BUILD.md - - 'js/web' - - 'onnxruntime/core/providers/js' -#### end trigger #### - -parameters: -- name: CudaVersion - displayName: CUDA version - type: string - default: '12.2' - values: - - 11.8 - - 12.2 -- name: RunOnnxRuntimeTests - displayName: Run Tests? 
- type: boolean - default: true - -stages: -- stage: cuda - dependsOn: [] - jobs: - - template: templates/jobs/win-ci-vs-2022-job.yml - parameters: - BuildConfig: 'RelWithDebInfo' - EnvSetupScript: setup_env_cuda.bat - buildArch: x64 - additionalBuildFlags: >- - --enable_pybind --build_java --build_nodejs --use_cuda --cuda_home="$(Agent.TempDirectory)\v${{ parameters.CudaVersion }}" - --enable_cuda_profiling --enable_transformers_tool_test - --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=86 - --cmake_extra_defines onnxruntime_BUILD_UNIT_TESTS=ON - --cmake_extra_defines onnxruntime_ENABLE_CUDA_EP_INTERNAL_TESTS=ON - msbuildPlatform: x64 - isX86: false - job_name_suffix: x64_RelWithDebInfo - RunOnnxRuntimeTests: ${{ parameters.RunOnnxRuntimeTests }} - ORT_EP_NAME: CUDA - WITH_CACHE: true - MachinePool: onnxruntime-Win2022-GPU-A10 - -- stage: training - dependsOn: [] - jobs: - - template: templates/jobs/win-ci-vs-2022-job.yml - parameters: - BuildConfig: 'RelWithDebInfo' - EnvSetupScript: setup_env_cuda.bat - buildArch: x64 - additionalBuildFlags: >- - --enable_pybind --enable_training --use_cuda --cuda_home="$(Agent.TempDirectory)\v${{ parameters.CudaVersion }}" - --skip_onnx_tests - --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=86 - msbuildPlatform: x64 - isX86: false - job_name_suffix: x64_RelWithDebInfo - RunOnnxRuntimeTests: ${{ parameters.RunOnnxRuntimeTests }} - ORT_EP_NAME: CUDA - WITH_CACHE: true - MachinePool: onnxruntime-Win2022-GPU-A10 - isTraining: true - -- stage: dml - dependsOn: [] - jobs: - - template: templates/jobs/win-ci-vs-2022-job.yml - parameters: - BuildConfig: 'RelWithDebInfo' - EnvSetupScript: setup_env.bat - buildArch: x64 - additionalBuildFlags: --enable_pybind --use_dml --enable_wcos --use_winml - msbuildPlatform: x64 - isX86: false - job_name_suffix: x64_RelWithDebInfo - RunOnnxRuntimeTests: ${{ parameters.RunOnnxRuntimeTests }} - ORT_EP_NAME: DML - WITH_CACHE: false - MachinePool: onnxruntime-Win2022-GPU-dml-A10 - -- stage: kernelDocumentation - dependsOn: [] - jobs: - - template: templates/jobs/win-ci-vs-2022-job.yml - parameters: - BuildConfig: 'RelWithDebInfo' - EnvSetupScript: setup_env_cuda.bat - buildArch: x64 - # note: need to specify `--gen_doc` when creating the build config so it has to be in additionalBuildFlags - additionalBuildFlags: >- - --gen_doc validate --skip_tests --enable_pybind --use_dml --use_cuda - --cuda_home="$(Agent.TempDirectory)\v${{ parameters.CudaVersion }}" - --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=86 - --cmake_extra_defines onnxruntime_BUILD_UNIT_TESTS=OFF - msbuildPlatform: x64 - isX86: false - job_name_suffix: x64_RelWithDebInfo - RunOnnxRuntimeTests: false - GenerateDocumentation: true - ORT_EP_NAME: CUDA # It doesn't really matter which EP is selected here since this stage is for documentation. - WITH_CACHE: true - MachinePool: onnxruntime-Win2022-GPU-A10 From a4d3a1ce0c18e1d1b31a9cc0b45beba290ee114c Mon Sep 17 00:00:00 2001 From: liqun Fu Date: Sat, 27 Jul 2024 15:58:36 -0700 Subject: [PATCH 40/57] pick changes from https://github.com/onnx/onnx/pull/6195 to fix heap-buffer-overflow in onnx::convPoolShapeInference (#21507) ### Description onnx 1.16.2 is not available before ort 1.19.0 code freeze. 
Thus pick the needed change as patch --- cmake/patches/onnx/onnx.patch | 383 ++++++++++++++++++ .../providers/cpu/generator/random_test.cc | 8 +- .../core/graph/training_op_defs.cc | 104 +++-- 3 files changed, 447 insertions(+), 48 deletions(-) diff --git a/cmake/patches/onnx/onnx.patch b/cmake/patches/onnx/onnx.patch index 162d33581a5ca..6ac3555eeecf1 100644 --- a/cmake/patches/onnx/onnx.patch +++ b/cmake/patches/onnx/onnx.patch @@ -86,3 +86,386 @@ index 0aab3e26..398ac2d6 100644 +#endif + #endif // ! ONNX_ONNX_PB_H +diff --git a/onnx/defs/math/defs.cc b/onnx/defs/math/defs.cc +index c315a2a7..58963154 100644 +--- a/onnx/defs/math/defs.cc ++++ b/onnx/defs/math/defs.cc +@@ -3472,6 +3472,9 @@ ONNX_OPERATOR_SET_SCHEMA( + } + + auto& input_shape = getInputShape(ctx, 0); ++ if (input_shape.dim_size() < 2) { ++ fail_shape_inference("First input should have at least 2 dimensions in ", ctx.getDisplayName(), "."); ++ } + auto signal_dim = input_shape.dim(1); + if (!signal_dim.has_dim_value()) { + return; +diff --git a/onnx/defs/nn/defs.cc b/onnx/defs/nn/defs.cc +index be6a851d..fad595d0 100644 +--- a/onnx/defs/nn/defs.cc ++++ b/onnx/defs/nn/defs.cc +@@ -126,6 +126,9 @@ void convPoolShapeInference( + residual -= stride; + } + } ++ if (i >= static_cast(effective_kernel_shape.size())) { ++ fail_shape_inference("kernel shape should have ", input_dims_size, " values in ", ctx.getDisplayName(), "."); ++ } + int64_t total_pad = residual == 0 ? effective_kernel_shape[i] - stride : effective_kernel_shape[i] - residual; + if (total_pad < 0) + total_pad = 0; +@@ -959,19 +962,21 @@ ONNX_OPERATOR_SET_SCHEMA( + auto w_type = ctx.getInputType(3); + if (nullptr == x_type || nullptr == w_type || x_type->value_case() != TypeProto::kTensorType || + w_type->value_case() != TypeProto::kTensorType) { +- fail_type_inference("inputs are expected to have tensor type."); ++ fail_type_inference("inputs are expected to have tensor type in ", ctx.getDisplayName(), "."); + } + + auto x_zero_point_type = ctx.getInputType(2); + if (nullptr == x_zero_point_type || + x_zero_point_type->tensor_type().elem_type() != x_type->tensor_type().elem_type()) { +- fail_type_inference("input and zero_point pair is expected to have be same type."); ++ fail_type_inference( ++ "input and zero_point pair is expected to have be same type in ", ctx.getDisplayName(), "."); + } + + auto w_zero_point_type = ctx.getInputType(5); + if (nullptr == w_zero_point_type || + w_zero_point_type->tensor_type().elem_type() != w_type->tensor_type().elem_type()) { +- fail_type_inference("weight and zero_point pair is expected to have same type."); ++ fail_type_inference( ++ "weight and zero_point pair is expected to have same type in ", ctx.getDisplayName(), "."); + } + + propagateElemTypeFromInputToOutput(ctx, 7, 0); +@@ -2647,7 +2652,8 @@ ONNX_OPERATOR_SET_SCHEMA( + if (!hasNInputShapes(ctx, 1)) { + return; + } +- auto& input_shape = ctx.getInputType(0)->tensor_type().shape(); ++ ++ auto& input_shape = getInputShape(ctx, 0); + int64_t input_ndim = input_shape.dim_size(); + int64_t axis = -1; + auto axis_proto = ctx.getAttribute("axis"); +@@ -2659,7 +2665,16 @@ ONNX_OPERATOR_SET_SCHEMA( + // positive value. 
+ axis += input_ndim; + } +- ++ if (axis < 0) { ++ fail_shape_inference( ++ "Unexpected axis value (", ++ axis, ++ ") rank of first input is ", ++ input_ndim, ++ " in ", ++ ctx.getDisplayName(), ++ "."); ++ } + if (ctx.getNumOutputs() > 1) { + auto mean_shape = ctx.getOutputType(1)->mutable_tensor_type()->mutable_shape(); + mean_shape->CopyFrom(input_shape); +diff --git a/onnx/defs/nn/old.cc b/onnx/defs/nn/old.cc +index 57f8e2a4..8b2dc07f 100644 +--- a/onnx/defs/nn/old.cc ++++ b/onnx/defs/nn/old.cc +@@ -201,6 +201,9 @@ void convPoolShapeInference_opset19( + residual -= stride; + } + } ++ if (i >= static_cast(effective_kernel_shape.size())) { ++ fail_shape_inference("kernel shape should have ", input_dims_size, " values in ", ctx.getDisplayName(), "."); ++ } + int64_t total_pad = residual == 0 ? effective_kernel_shape[i] - stride : effective_kernel_shape[i] - residual; + if (total_pad < 0) + total_pad = 0; +diff --git a/onnx/defs/shape_inference.h b/onnx/defs/shape_inference.h +index a80473b3..d1bcd401 100644 +--- a/onnx/defs/shape_inference.h ++++ b/onnx/defs/shape_inference.h +@@ -105,6 +105,10 @@ struct InferenceContext { + virtual const SparseTensorProto* getInputSparseData(size_t index) const = 0; + // Gets the shape inputs computed by partial data propagation. + virtual const TensorShapeProto* getSymbolicInput(size_t index) const = 0; ++ // To display a name the user can use to narrow its search. ++ virtual std::string getDisplayName() const { ++ return ""; ++ } + }; + + // We use data propagation to perform partial evaluation of the model, to compute statically +@@ -263,7 +267,15 @@ inline void propagateElemTypeFromDtypeToOutput( + } else { + // This is not expected to happen + fail_type_inference( +- "Output ", outputIndex, " expected to have: ", expected_value_case, " or UNDEFINED. Got: ", output_value_case); ++ "Output ", ++ outputIndex, ++ " expected to have: ", ++ expected_value_case, ++ " or UNDEFINED. 
Got: ", ++ output_value_case, ++ " in ", ++ ctx.getDisplayName(), ++ "."); + } + } + +@@ -277,18 +289,18 @@ inline void propagateElemTypeFromDtypeToOutput(InferenceContext& ctx, const Attr + const auto attr_type = attr->type(); + if (attr_type == AttributeProto::TENSOR) { + if (attr->t().dims().size() != 1) { +- fail_type_inference("Attribute expected to have a one-dim tensor"); ++ fail_type_inference("Attribute expected to have a one-dim tensor in ", ctx.getDisplayName(), "."); + } + data_type = attr->t().data_type(); + expected_value_case = TypeProto::kTensorType; + } else if (attr_type == AttributeProto::SPARSE_TENSOR) { + if (attr->sparse_tensor().dims().size() != 1) { +- fail_type_inference("Attribute expected to have a one-dim sparse tensor"); ++ fail_type_inference("Attribute expected to have a one-dim sparse tensor in ", ctx.getDisplayName(), "."); + } + data_type = attr->sparse_tensor().values().data_type(); + expected_value_case = TypeProto::kSparseTensorType; + } else { +- fail_type_inference("Attribute expected to have tensor or sparse tensor type"); ++ fail_type_inference("Attribute expected to have tensor or sparse tensor type in ", ctx.getDisplayName(), "."); + } + + propagateElemTypeFromDtypeToOutput(ctx, data_type, outputIndex, expected_value_case); +@@ -326,7 +338,10 @@ inline const TensorShapeProto& getInputShape(const InferenceContext& ctx, size_t + const auto* input_type = ctx.getInputType(n); + const auto value_case = input_type->value_case(); + if (value_case != TypeProto::kTensorType && value_case != TypeProto::kSparseTensorType) { +- fail_type_inference("Attribute expected to have tensor or sparse tensor type"); ++ fail_type_inference("Input ", n, "expected to be a tensor or a sparse tensor type in ", ctx.getDisplayName(), "."); ++ } ++ if (!hasShape(*input_type)) { ++ fail_shape_inference("Input ", n, " must have a non null shape in ", ctx.getDisplayName(), "."); + } + if (value_case == TypeProto::kTensorType) { + return input_type->tensor_type().shape(); +@@ -344,7 +359,7 @@ inline const TensorShapeProto* getOptionalInputShape(InferenceContext& ctx, size + + const auto value_case = input_type->value_case(); + if (value_case != TypeProto::kTensorType && value_case != TypeProto::kSparseTensorType) { +- fail_type_inference("Attribute expected to have tensor or sparse tensor type"); ++ fail_type_inference("Input ", n, "expected to be a tensor or a sparse tensor type in ", ctx.getDisplayName(), "."); + } + if (value_case == TypeProto::kTensorType) { + return &input_type->tensor_type().shape(); +@@ -372,7 +387,10 @@ inline void appendSingleDimCopiedFromInputTypeToOutputType( + " does not match type of output: ", + outputIndex, + "type: ", +- output_value_case); ++ output_value_case, ++ " in ", ++ ctx.getDisplayName(), ++ "."); + } + if (TypeProto::kTensorType == input_value_case) { + auto* dim = output_type->mutable_tensor_type()->mutable_shape()->add_dim(); +@@ -382,7 +400,13 @@ inline void appendSingleDimCopiedFromInputTypeToOutputType( + *dim = input_type->sparse_tensor_type().shape().dim(static_cast(fromDimIndex)); + } else { + fail_type_inference( +- "Input ", inputIndex, " and Output ", outputIndex, " expected to have tensor or sparse tensor type"); ++ "Input ", ++ inputIndex, ++ " and Output ", ++ outputIndex, ++ " expected to have tensor or sparse tensor type in ", ++ ctx.getDisplayName(), ++ "."); + } + } + +@@ -440,7 +464,14 @@ updateOutputElemType(InferenceContext& ctx, size_t outputIndex, int32_t elemType + setTensorElementType(elemType, expected_type, 
*output_type); + } else { + // This is not expected to happen +- fail_type_inference("Output ", outputIndex, " expected to have tensor or sparse tensor type: ", expected_type); ++ fail_type_inference( ++ "Output ", ++ outputIndex, ++ " expected to have tensor or sparse tensor type: ", ++ expected_type, ++ " in ", ++ ctx.getDisplayName(), ++ "."); + } + } + +@@ -462,16 +493,17 @@ inline void propagateElemTypeFromAttributeToOutput( + updateOutputElemType(ctx, outputIndex, default_value, expected_type); + return; + } else { +- fail_type_inference("Value of attribute ", attributeName, " not specified"); ++ fail_type_inference("Value of attribute ", attributeName, " not specified in ", ctx.getDisplayName(), "."); + } + } + if (!attr_proto->has_i()) { +- fail_type_inference("Attribute ", attributeName, " should be of integer type and specify a type."); ++ fail_type_inference( ++ "Attribute ", attributeName, " should be of integer type and specify a type in ", ctx.getDisplayName(), "."); + } + auto attr_value = attr_proto->i(); + auto elem_type = static_cast(attr_value); + if (!TensorProto_DataType_IsValid(elem_type)) { +- fail_type_inference("Attribute ", attributeName, " does not specify a valid type."); ++ fail_type_inference("Attribute ", attributeName, " does not specify a valid type in ", ctx.getDisplayName(), "."); + } + updateOutputElemType(ctx, outputIndex, elem_type, expected_type); + } +@@ -497,7 +529,7 @@ inline TensorShapeProto* + getOutputShape(InferenceContext& ctx, size_t n, TypeProto::ValueCase default_type = TypeProto::kTensorType) { + auto output_type = ctx.getOutputType(n); + if (output_type == nullptr) { +- fail_type_inference("Output ", n, " expected to have tensor or sparse type"); ++ fail_type_inference("Output ", n, " expected to have tensor or sparse type in ", ctx.getDisplayName(), "."); + } + const auto output_value_case = output_type->value_case(); + if (output_value_case == TypeProto::kTensorType || output_value_case == TypeProto::kSparseTensorType) { +@@ -505,7 +537,7 @@ getOutputShape(InferenceContext& ctx, size_t n, TypeProto::ValueCase default_typ + } else if (output_value_case == TypeProto::VALUE_NOT_SET) { + return getTensorMutableShape(default_type, *output_type); + } else { +- fail_type_inference("Output ", n, " expected to have tensor type"); ++ fail_type_inference("Output ", n, " expected to have tensor type in ", ctx.getDisplayName(), "."); + } + } + +@@ -562,13 +594,13 @@ inline void propagateShapeFromAttributeToOutput( + auto attr_proto = ctx.getAttribute(attributeName); + if ((nullptr == attr_proto) || (!attr_proto->has_type()) || + (attr_proto->type() != AttributeProto_AttributeType_INTS)) { +- fail_shape_inference("Attribute ", attributeName, " should specify a shape"); ++ fail_shape_inference("Attribute ", attributeName, " should specify a shape in ", ctx.getDisplayName(), "."); + } + auto& int_list = attr_proto->ints(); + TensorShapeProto shape; + for (auto dim_size : int_list) { + if (dim_size < 0) { +- fail_shape_inference("Negative values are not allowed in a shape specification"); ++ fail_shape_inference("Negative values are not allowed in a shape specification in ", ctx.getDisplayName(), "."); + } + shape.add_dim()->set_dim_value(dim_size); + } +@@ -745,7 +777,16 @@ inline void checkInputRank(InferenceContext& ctx, size_t input_index, int expect + if (hasInputShape(ctx, input_index)) { + auto rank = getInputShape(ctx, input_index).dim_size(); + if (rank != expected_rank) { +- fail_shape_inference("Input ", input_index, " expected to have rank 
", expected_rank, " but has rank ", rank); ++ fail_shape_inference( ++ "Input ", ++ input_index, ++ " expected to have rank ", ++ expected_rank, ++ " but has rank ", ++ rank, ++ " in ", ++ ctx.getDisplayName(), ++ "."); + } + } + } +@@ -798,7 +839,15 @@ inline void unifyInputDim(InferenceContext& ctx, size_t input_index, int dim_ind + // This shape is expected to have rank > dim_index: + if (input_shape.dim_size() <= dim_index) { + fail_shape_inference( +- "Input ", input_index, " expected to have rank >", dim_index, " but has rank ", input_shape.dim_size()); ++ "Input ", ++ input_index, ++ " expected to have rank >", ++ dim_index, ++ " but has rank ", ++ input_shape.dim_size(), ++ " in ", ++ ctx.getDisplayName(), ++ "."); + } + const Dim& input_dim = input_shape.dim(dim_index); + // Now, unify dim and input_dim: +diff --git a/onnx/shape_inference/implementation.cc b/onnx/shape_inference/implementation.cc +index 8723dcd4..8249fc59 100644 +--- a/onnx/shape_inference/implementation.cc ++++ b/onnx/shape_inference/implementation.cc +@@ -906,7 +906,7 @@ struct FunctionInferenceContext : public InferenceContext { + const std::vector& input_types, + const std::vector& attributes, + const ShapeInferenceOptions& options) +- : input_types_(input_types), options_(options) { ++ : input_types_(input_types), options_(options), func_proto_(&func_proto) { + for (const auto& attr : attributes) { + attributesByName_[attr.name()] = &attr; + } +@@ -971,11 +971,25 @@ struct FunctionInferenceContext : public InferenceContext { + return std::move(output_types_); + } + ++ std::string getDisplayName() const override { ++ if (func_proto_ == nullptr) ++ return ""; ++ if (func_proto_->domain().empty()) { ++ if (func_proto_->name().empty()) ++ return ""; ++ return MakeString("function ", func_proto_->name()); ++ } ++ if (func_proto_->name().empty()) ++ return MakeString("function [", func_proto_->domain(), "]"); ++ return MakeString("function ", func_proto_->name(), "[", func_proto_->domain(), "]"); ++ } ++ + private: + const std::vector& input_types_; + std::vector output_types_; + std::unordered_map attributesByName_; + ShapeInferenceOptions options_; ++ const FunctionProto* func_proto_; + }; + + std::vector InferFunctionOutputTypes( +diff --git a/onnx/shape_inference/implementation.h b/onnx/shape_inference/implementation.h +index 2c63c910..b0e4c32d 100644 +--- a/onnx/shape_inference/implementation.h ++++ b/onnx/shape_inference/implementation.h +@@ -146,7 +146,7 @@ struct InferenceContextImpl : public InferenceContext { + const ShapeInferenceOptions& options, + DataValueMap* generatedShapeData = nullptr, + GraphInferenceContext* graphInferenceContext = nullptr) +- : graphInferenceContext_{graphInferenceContext}, options_(options) { ++ : graphInferenceContext_{graphInferenceContext}, options_(options), node_(&n) { + for (auto& attr : *n.mutable_attribute()) { + attributesByName_[attr.name()] = &attr; + if (attr.has_g()) { +@@ -277,6 +277,19 @@ struct InferenceContextImpl : public InferenceContext { + return inferencer; + } + ++ std::string getDisplayName() const override { ++ if (node_ == nullptr) ++ return ""; ++ if (node_->domain().empty()) { ++ if (node_->name().empty()) ++ return MakeString("node ", node_->op_type()); ++ return MakeString("node ", node_->op_type(), " (", node_->name(), ")"); ++ } ++ if (node_->name().empty()) ++ return MakeString("node ", node_->op_type(), "[", node_->domain(), "]"); ++ return MakeString("node ", node_->op_type(), "[", node_->domain(), "]", " (", node_->name(), ")"); ++ } ++ + 
std::vector allInputData_; + std::vector allInputSparseData_; + std::vector allShapeInputData_; +@@ -289,6 +302,7 @@ struct InferenceContextImpl : public InferenceContext { + // mutable as internal cache of GraphInferencer instances + mutable std::unordered_map> graphAttributeInferencers_; + ShapeInferenceOptions options_; ++ NodeProto* node_; + }; + + struct DataPropagationContextImpl : public DataPropagationContext { diff --git a/onnxruntime/test/providers/cpu/generator/random_test.cc b/onnxruntime/test/providers/cpu/generator/random_test.cc index ec9b1614488a7..f42f32d63d1fa 100644 --- a/onnxruntime/test/providers/cpu/generator/random_test.cc +++ b/onnxruntime/test/providers/cpu/generator/random_test.cc @@ -178,7 +178,7 @@ TEST(Random, InvalidDType) { test.AddAttribute("shape", dims); test.AddOutput("Y", dims, expected_output); - test.Run(OpTester::ExpectResult::kExpectFailure, "Attribute dtype does not specify a valid type."); + test.Run(OpTester::ExpectResult::kExpectFailure, "Node (node1) Op (RandomNormal) [TypeInferenceError] Attribute dtype does not specify a valid type in ."); } { @@ -194,7 +194,7 @@ TEST(Random, InvalidDType) { test.AddAttribute("shape", dims); test.AddOutput("Y", dims, expected_output); - test.Run(OpTester::ExpectResult::kExpectFailure, "Attribute dtype does not specify a valid type."); + test.Run(OpTester::ExpectResult::kExpectFailure, "Node (node1) Op (RandomUniform) [TypeInferenceError] Attribute dtype does not specify a valid type in ."); } { @@ -210,7 +210,7 @@ TEST(Random, InvalidDType) { test.AddInput("X", dims, input); test.AddOutput("Y", dims, expected_output); - test.Run(OpTester::ExpectResult::kExpectFailure, "Attribute dtype does not specify a valid type."); + test.Run(OpTester::ExpectResult::kExpectFailure, "Node (node1) Op (RandomNormalLike) [TypeInferenceError] Attribute dtype does not specify a valid type in ."); } { @@ -226,7 +226,7 @@ TEST(Random, InvalidDType) { test.AddInput("X", dims, input); test.AddOutput("Y", dims, expected_output); - test.Run(OpTester::ExpectResult::kExpectFailure, "Attribute dtype does not specify a valid type."); + test.Run(OpTester::ExpectResult::kExpectFailure, "Node (node1) Op (RandomUniformLike) [TypeInferenceError] Attribute dtype does not specify a valid type in ."); } } diff --git a/orttraining/orttraining/core/graph/training_op_defs.cc b/orttraining/orttraining/core/graph/training_op_defs.cc index 2a8d2de982e79..92f803030ada4 100644 --- a/orttraining/orttraining/core/graph/training_op_defs.cc +++ b/orttraining/orttraining/core/graph/training_op_defs.cc @@ -181,6 +181,64 @@ static void propagateRecvOutputTensorElemTypes( } } +void SendShapeInfer(ONNX_NAMESPACE::InferenceContext& ctx) { + if (ctx.getNumInputs() < 3) { + fail_shape_inference("Send must have at least three inputs."); + } else { + if (hasInputShape(ctx, 0)) { + auto& signal_input_shape = getInputShape(ctx, 0); + if (static_cast(signal_input_shape.dim_size()) != 0) { + fail_shape_inference("InputSignal of Send must be a scalar."); + } + } + if (hasInputShape(ctx, 1)) { + auto& remote_input_shape = getInputShape(ctx, 1); + if (static_cast(remote_input_shape.dim_size()) != 0) { + fail_shape_inference("Remote of Send must be a scalar."); + } + } + + checkSendInputTensorElemTypes(ctx, "element_types", ctx.getNumInputs() - 2); + } + + if (ctx.getNumOutputs() != 1) { + fail_shape_inference("Send must have one output."); + } + + auto output_element_type = ctx.getOutputType(0)->mutable_tensor_type(); + output_element_type->set_elem_type(TensorProto::BOOL); + 
ONNX_NAMESPACE::TensorShapeProto output_shape; + updateOutputShape(ctx, 0, {}); + updateOutputElemType(ctx, 0, ONNX_NAMESPACE::TensorProto::BOOL); +} + +void RecvShapeInfer(ONNX_NAMESPACE::InferenceContext& ctx) { + if (ctx.getNumInputs() != 2) { + fail_shape_inference("Recv must have two inputs."); + } else { + if (hasInputShape(ctx, 0)) { + auto& signal_input_shape = getInputShape(ctx, 0); + if (static_cast(signal_input_shape.dim_size()) != 0) { + fail_shape_inference("InputSignal of Recv must be a scalar."); + } + } + if (hasInputShape(ctx, 1)) { + auto& remote_input_shape = getInputShape(ctx, 1); + if (static_cast(remote_input_shape.dim_size()) != 0) { + fail_shape_inference("Remote of Recv must be a scalar."); + } + } + } + + if (ctx.getNumOutputs() < 2) { + fail_shape_inference("Recv must have at least two outputs."); + } + + updateOutputShape(ctx, 0, {}); + updateOutputElemType(ctx, 0, ONNX_NAMESPACE::TensorProto::BOOL); + propagateRecvOutputTensorElemTypes(ctx, "element_types", ctx.getNumOutputs() - 1); +} + TensorProto ToDimensionOneFloatTensor(float value) { auto t = ToTensor(std::vector({value})); t.add_dims(1); @@ -3388,30 +3446,7 @@ Return true if all elements are true and false otherwise. "Constrain types to boolean tensors.") .TypeConstraint("V", OpSchema::all_tensor_types(), "All Tensor types") .TypeAndShapeInferenceFunction([](ONNX_NAMESPACE::InferenceContext& ctx) { - if (ctx.getNumInputs() < 3) { - fail_shape_inference("Send must have at least three inputs."); - } else { - auto& signal_input_shape = getInputShape(ctx, 0); - if (static_cast(signal_input_shape.dim_size()) != 0) { - fail_shape_inference("InputSignal of Send must be a scalar."); - } - auto& remote_input_shape = getInputShape(ctx, 1); - if (static_cast(remote_input_shape.dim_size()) != 0) { - fail_shape_inference("Remote of Send must be a scalar."); - } - - checkSendInputTensorElemTypes(ctx, "element_types", ctx.getNumInputs() - 2); - } - - if (ctx.getNumOutputs() != 1) { - fail_shape_inference("Send must have one output."); - } - - auto output_element_type = ctx.getOutputType(0)->mutable_tensor_type(); - output_element_type->set_elem_type(TensorProto::BOOL); - ONNX_NAMESPACE::TensorShapeProto output_shape; - updateOutputShape(ctx, 0, {}); - updateOutputElemType(ctx, 0, ONNX_NAMESPACE::TensorProto::BOOL); + SendShapeInfer(ctx); }); ONNX_CONTRIB_OPERATOR_SCHEMA(Recv) @@ -3437,26 +3472,7 @@ Return true if all elements are true and false otherwise. 
"Constrain types to boolean tensors.") .TypeConstraint("V", OpSchema::all_tensor_types(), "All Tensor types") .TypeAndShapeInferenceFunction([](ONNX_NAMESPACE::InferenceContext& ctx) { - if (ctx.getNumInputs() != 2) { - fail_shape_inference("Recv must have two inputs."); - } else { - auto& signal_input_shape = getInputShape(ctx, 0); - if (static_cast(signal_input_shape.dim_size()) != 0) { - fail_shape_inference("InputSignal of Recv must be a scalar."); - } - auto& remote_input_shape = getInputShape(ctx, 1); - if (static_cast(remote_input_shape.dim_size()) != 0) { - fail_shape_inference("Remote of Recv must be a scalar."); - } - } - - if (ctx.getNumOutputs() < 2) { - fail_shape_inference("Recv must have at least two outputs."); - } - - updateOutputShape(ctx, 0, {}); - updateOutputElemType(ctx, 0, ONNX_NAMESPACE::TensorProto::BOOL); - propagateRecvOutputTensorElemTypes(ctx, "element_types", ctx.getNumOutputs() - 1); + RecvShapeInfer(ctx); }); ONNX_CONTRIB_OPERATOR_SCHEMA(MegatronF) From dbff0cd09860b60bd0a251c1dbe76785b0b2818c Mon Sep 17 00:00:00 2001 From: Yulong Wang <7679871+fs-eire@users.noreply.github.com> Date: Sun, 28 Jul 2024 13:03:17 -0700 Subject: [PATCH 41/57] [js/node] enable float16 support for Node.js binding (#20581) ### Description enable float16 support for Node.js binding. data of float16 tensor uses `Uint16Array`. --- js/node/src/tensor_helper.cc | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/js/node/src/tensor_helper.cc b/js/node/src/tensor_helper.cc index 1c0b141e6a44f..1062d89f76c5f 100644 --- a/js/node/src/tensor_helper.cc +++ b/js/node/src/tensor_helper.cc @@ -38,13 +38,13 @@ constexpr size_t DATA_TYPE_ELEMENT_SIZE_MAP[] = { 2, // ONNX_TENSOR_ELEMENT_DATA_TYPE_UINT16 2, // ONNX_TENSOR_ELEMENT_DATA_TYPE_INT16 4, // ONNX_TENSOR_ELEMENT_DATA_TYPE_INT32 - 8, // ONNX_TENSOR_ELEMENT_DATA_TYPE_INT64 INT64 not working in Javascript + 8, // ONNX_TENSOR_ELEMENT_DATA_TYPE_INT64 0, // ONNX_TENSOR_ELEMENT_DATA_TYPE_STRING N/A 1, // ONNX_TENSOR_ELEMENT_DATA_TYPE_BOOL - 0, // ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT16 FLOAT16 not working in Javascript + 2, // ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT16 8, // ONNX_TENSOR_ELEMENT_DATA_TYPE_DOUBLE 4, // ONNX_TENSOR_ELEMENT_DATA_TYPE_UINT32 - 8, // ONNX_TENSOR_ELEMENT_DATA_TYPE_UINT64 UINT64 not working in Javascript + 8, // ONNX_TENSOR_ELEMENT_DATA_TYPE_UINT64 0, // ONNX_TENSOR_ELEMENT_DATA_TYPE_COMPLEX64 not supported 0, // ONNX_TENSOR_ELEMENT_DATA_TYPE_COMPLEX128 not supported 0 // ONNX_TENSOR_ELEMENT_DATA_TYPE_BFLOAT16 not supported @@ -60,13 +60,13 @@ constexpr napi_typedarray_type DATA_TYPE_TYPEDARRAY_MAP[] = { napi_uint16_array, // ONNX_TENSOR_ELEMENT_DATA_TYPE_UINT16 napi_int16_array, // ONNX_TENSOR_ELEMENT_DATA_TYPE_INT16 napi_int32_array, // ONNX_TENSOR_ELEMENT_DATA_TYPE_INT32 - napi_bigint64_array, // ONNX_TENSOR_ELEMENT_DATA_TYPE_INT64 INT64 not working i + napi_bigint64_array, // ONNX_TENSOR_ELEMENT_DATA_TYPE_INT64 (napi_typedarray_type)(-1), // ONNX_TENSOR_ELEMENT_DATA_TYPE_STRING not supported napi_uint8_array, // ONNX_TENSOR_ELEMENT_DATA_TYPE_BOOL - (napi_typedarray_type)(-1), // ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT16 FLOAT16 not working + napi_uint16_array, // ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT16 FLOAT16 uses Uint16Array napi_float64_array, // ONNX_TENSOR_ELEMENT_DATA_TYPE_DOUBLE napi_uint32_array, // ONNX_TENSOR_ELEMENT_DATA_TYPE_UINT32 - napi_biguint64_array, // ONNX_TENSOR_ELEMENT_DATA_TYPE_UINT64 UINT64 not working + napi_biguint64_array, // ONNX_TENSOR_ELEMENT_DATA_TYPE_UINT64 
(napi_typedarray_type)(-1), // ONNX_TENSOR_ELEMENT_DATA_TYPE_COMPLEX64 not supported (napi_typedarray_type)(-1), // ONNX_TENSOR_ELEMENT_DATA_TYPE_COMPLEX128 not supported (napi_typedarray_type)(-1) // ONNX_TENSOR_ELEMENT_DATA_TYPE_BFLOAT16 not supported @@ -182,9 +182,7 @@ Ort::Value NapiValueToOrtValue(Napi::Env env, Napi::Value value, OrtMemoryInfo * char *buffer = reinterpret_cast(tensorDataTypedArray.ArrayBuffer().Data()); size_t bufferByteOffset = tensorDataTypedArray.ByteOffset(); - // there is a bug in TypedArray::ElementSize(): https://github.com/nodejs/node-addon-api/pull/705 - // TODO: change to TypedArray::ByteLength() in next node-addon-api release. - size_t bufferByteLength = tensorDataTypedArray.ElementLength() * DATA_TYPE_ELEMENT_SIZE_MAP[elemType]; + size_t bufferByteLength = tensorDataTypedArray.ByteLength(); return Ort::Value::CreateTensor(memory_info, buffer + bufferByteOffset, bufferByteLength, dims.empty() ? nullptr : &dims[0], dims.size(), elemType); } From 5bc12bf209304e7f5800845bd612bb3e7b7ab918 Mon Sep 17 00:00:00 2001 From: Xu Xing Date: Mon, 29 Jul 2024 23:47:41 +0800 Subject: [PATCH 42/57] [js/webgpu] Add activation for conv3d naive (#21466) ### Description ### Motivation and Context --- .../ops/3rd-party/conv3d_naive_webgpu.ts | 64 +++++----- js/web/test/data/ops/fused-conv3dncdhw.jsonc | 112 ++++++++++++++++++ js/web/test/suite-test-list.jsonc | 1 + 3 files changed, 149 insertions(+), 28 deletions(-) create mode 100644 js/web/test/data/ops/fused-conv3dncdhw.jsonc diff --git a/js/web/lib/wasm/jsep/webgpu/ops/3rd-party/conv3d_naive_webgpu.ts b/js/web/lib/wasm/jsep/webgpu/ops/3rd-party/conv3d_naive_webgpu.ts index f428293add599..a2e5428385101 100644 --- a/js/web/lib/wasm/jsep/webgpu/ops/3rd-party/conv3d_naive_webgpu.ts +++ b/js/web/lib/wasm/jsep/webgpu/ops/3rd-party/conv3d_naive_webgpu.ts @@ -26,6 +26,9 @@ import {ShapeUtil} from '../../../util'; import {ProgramInfo, ProgramInputTensorInfoDependency, ProgramUniform} from '../../types'; import {createTensorShapeVariables, getElementAt, inputVariable, outputVariable, ShaderHelper, tensorTypeToWsglStorageType, UniformsArrayType} from '../common'; import {ConvAttributes} from '../conv'; +import {appendActivationUniforms, appendActivationUniformsData, getActivationSnippet} from '../fuse-utils'; + +import {typeSnippet} from './activation_util'; const arrayProduct = (arr: number[]) => { let product = 1; @@ -218,8 +221,8 @@ export const computeConv3DInfo = export const createConv3DNaiveProgramInfo = (inputs: readonly TensorView[], attributes: ConvAttributes, outputShape: readonly number[], filterDims: readonly number[], pads: readonly number[], dataFormat: string): ProgramInfo => { - const isChannelsLast = dataFormat === 'channelsLast'; - const inChannels = isChannelsLast ? inputs[0].dims[3] : inputs[0].dims[1]; + const isChannelLast = dataFormat === 'channelsLast'; + const inChannels = isChannelLast ? inputs[0].dims[3] : inputs[0].dims[1]; // TODO: enable vec4. const isVec4 = false; const workGroupSize: [number, number, number] = [64, 1, 1]; @@ -228,13 +231,14 @@ export const createConv3DNaiveProgramInfo = LOG_DEBUG('verbose', () => `[conv3d_naive_webgpu] dispatch = ${dispatch}`); - const innerElementSize = isVec4 ? (isChannelsLast && inChannels % 4 !== 0 ? 3 : 4) : 1; + const innerElementSize = isVec4 ? (isChannelLast && inChannels % 4 !== 0 ? 
3 : 4) : 1; const outputSize = ShapeUtil.size(outputShape); const programUniforms: ProgramUniform[] = [ {type: DataType.uint32, data: outputSize}, {type: DataType.uint32, data: filterDims}, {type: DataType.uint32, data: pads}, {type: DataType.uint32, data: attributes.strides}, {type: DataType.uint32, data: attributes.dilations} ]; + appendActivationUniformsData(attributes, programUniforms); programUniforms.push(...createTensorShapeVariables(inputs[0].dims, inputs[1].dims)); const inputDependencies: ProgramInputTensorInfoDependency[] = ['rank', 'rank']; const hasBias = inputs.length === 3; @@ -251,6 +255,7 @@ export const createConv3DNaiveProgramInfo = {name: 'strides', type: 'u32', length: attributes.strides.length}, {name: 'dilations', type: 'u32', length: attributes.dilations.length} ]; + appendActivationUniforms(attributes, uniforms); // TODO: support component 2, 3. const components = isVec4 ? 4 : 1; const t = tensorTypeToWsglStorageType(inputs[0].dataType); @@ -266,10 +271,12 @@ export const createConv3DNaiveProgramInfo = inputVariables.push(bias); declareFunctions += ` fn getBiasByOutputCoords(coords : array) -> ${isVec4 ? `vec4<${t}>` : t} { - return bias[${isChannelsLast ? getElementAt('coords', 4, 5) : getElementAt('coords', 1, 5)}${ + return bias[${isChannelLast ? getElementAt('coords', 4, 5) : getElementAt('coords', 1, 5)}${ isVec4 ? '/ 4' : ''}]; }`; } + const resType = typeSnippet(innerElementSize, t); + const applyActivation = getActivationSnippet(attributes, resType, t); return ` ${declareFunctions} @@ -287,28 +294,28 @@ export const createConv3DNaiveProgramInfo = let coords = ${output.offsetToIndices('global_idx')}; let batch = ${getElementAt('coords', 0, x.rank)}; let d2 = ${ - isChannelsLast ? getElementAt('coords', x.rank - 1, x.rank) : getElementAt('coords', 1, x.rank)}; + isChannelLast ? getElementAt('coords', x.rank - 1, x.rank) : getElementAt('coords', 1, x.rank)}; let xFRCCorner = vec3(${ - isChannelsLast ? getElementAt('coords', 1, x.rank) : getElementAt('coords', 2, x.rank)}, - ${isChannelsLast ? getElementAt('coords', 2, x.rank) : getElementAt('coords', 3, x.rank)}, + isChannelLast ? getElementAt('coords', 1, x.rank) : getElementAt('coords', 2, x.rank)}, + ${isChannelLast ? getElementAt('coords', 2, x.rank) : getElementAt('coords', 3, x.rank)}, ${ - isChannelsLast ? getElementAt('coords', 3, x.rank) : - getElementAt('coords', 4, x.rank)}) * uniforms.strides - uniforms.pads; + isChannelLast ? getElementAt('coords', 3, x.rank) : + getElementAt('coords', 4, x.rank)}) * uniforms.strides - uniforms.pads; let xFCorner = xFRCCorner.x; let xRCorner = xFRCCorner.y; let xCCorner = xFRCCorner.z; let xShapeY = ${ - isChannelsLast ? getElementAt('uniforms.x_shape', 1, x.rank) : getElementAt('uniforms.x_shape', 2, x.rank)}; + isChannelLast ? getElementAt('uniforms.x_shape', 1, x.rank) : getElementAt('uniforms.x_shape', 2, x.rank)}; let xShapeZ = ${ - isChannelsLast ? getElementAt('uniforms.x_shape', 2, x.rank) : getElementAt('uniforms.x_shape', 3, x.rank)}; + isChannelLast ? getElementAt('uniforms.x_shape', 2, x.rank) : getElementAt('uniforms.x_shape', 3, x.rank)}; let xShapeW = ${ - isChannelsLast ? getElementAt('uniforms.x_shape', 3, x.rank) : getElementAt('uniforms.x_shape', 4, x.rank)}; + isChannelLast ? getElementAt('uniforms.x_shape', 3, x.rank) : getElementAt('uniforms.x_shape', 4, x.rank)}; let xShapeU = ${ - isChannelsLast ? getElementAt('uniforms.x_shape', 4, x.rank) : getElementAt('uniforms.x_shape', 1, x.rank)}; + isChannelLast ? 
getElementAt('uniforms.x_shape', 4, x.rank) : getElementAt('uniforms.x_shape', 1, x.rank)}; let inputDepthNearestVec4 = (xShapeU / 4) * 4; let inputDepthVec4Remainder = xShapeU % 4; - var dotProd = 0.0; + var value = 0.0; for (var wF = 0u; wF < uniforms.filter_dims[0]; wF++) { let xF = xFCorner + wF * uniforms.dilations[0]; if (xF < 0 || xF >= xShapeY) { @@ -329,13 +336,13 @@ export const createConv3DNaiveProgramInfo = for (var d1 = 0u; d1 < inputDepthNearestVec4; d1 += 4) { ${ - isChannelsLast ? `let xValues = vec4( + isChannelLast ? `let xValues = vec4( getX(batch, xF, xR, xC, d1), getX(batch, xF, xR, xC, d1 + 1), getX(batch, xF, xR, xC, d1 + 2), getX(batch, xF, xR, xC, d1 + 3)); ` : - `let xValues = vec4( + `let xValues = vec4( getX(batch, d1, xF, xR, xC), getX(batch, d1 + 1, xF, xR, xC), getX(batch, d1 + 2, xF, xR, xC), @@ -346,36 +353,36 @@ export const createConv3DNaiveProgramInfo = getW(d2, d1 + 1, wF, wR, wC), getW(d2, d1 + 2, wF, wR, wC), getW(d2, d1 + 3, wF, wR, wC)); - dotProd += dot(xValues, wValues); + value += dot(xValues, wValues); } if (inputDepthVec4Remainder == 1) { ${ - isChannelsLast ? `dotProd += getX(batch, xF, xR, xC, inputDepthNearestVec4) + isChannelLast ? `value += getX(batch, xF, xR, xC, inputDepthNearestVec4) * getW(d2, inputDepthNearestVec4, wF, wR, wC);` : - `dotProd += getX(batch, inputDepthNearestVec4, xF, xR, xC) + `value += getX(batch, inputDepthNearestVec4, xF, xR, xC) * getW(d2, inputDepthNearestVec4, wF, wR, wC);`} } else if (inputDepthVec4Remainder == 2) { ${ - isChannelsLast ? `let xValues = vec2( + isChannelLast ? `let xValues = vec2( getX(batch, xF, xR, xC, inputDepthNearestVec4), getX(batch, xF, xR, xC, inputDepthNearestVec4 + 1)); ` : - `let xValues = vec2( + `let xValues = vec2( getX(batch, inputDepthNearestVec4, xF, xR, xC), getX(batch, inputDepthNearestVec4 + 1, xF, xR, xC)); `} let wValues = vec2( getW(d2, inputDepthNearestVec4, wF, wR, wC), getW(d2, inputDepthNearestVec4 + 1, wF, wR, wC)); - dotProd += dot(xValues, wValues); + value += dot(xValues, wValues); } else if (inputDepthVec4Remainder == 3) { ${ - isChannelsLast ? `let xValues = vec3( + isChannelLast ? `let xValues = vec3( getX(batch, xF, xR, xC, inputDepthNearestVec4), getX(batch, xF, xR, xC, inputDepthNearestVec4 + 1), getX(batch, xF, xR, xC, inputDepthNearestVec4 + 2)); ` : - `let xValues = vec3( + `let xValues = vec3( getX(batch, inputDepthNearestVec4, xF, xR, xC), getX(batch, inputDepthNearestVec4 + 1, xF, xR, xC), getX(batch, inputDepthNearestVec4 + 2, xF, xR, xC)); @@ -384,19 +391,20 @@ export const createConv3DNaiveProgramInfo = getW(d2, inputDepthNearestVec4, wF, wR, wC), getW(d2, inputDepthNearestVec4 + 1, wF, wR, wC), getW(d2, inputDepthNearestVec4 + 2, wF, wR, wC)); - dotProd += dot(xValues, wValues); + value += dot(xValues, wValues); } } } } - ${hasBias ? 'dotProd = dotProd + getBiasByOutputCoords(coords)' : ''}; - result[global_idx] = f32(dotProd); + ${hasBias ? 
'value = value + getBiasByOutputCoords(coords)' : ''}; + ${applyActivation} + result[global_idx] = f32(value); }`; }; return { name: 'Conv3DNaive', shaderCache: - {hint: `${attributes.cacheKey};${isChannelsLast};${innerElementSize};${hasBias}`, inputDependencies}, + {hint: `${attributes.cacheKey};${isChannelLast};${innerElementSize};${hasBias}`, inputDependencies}, getRunData: () => ({ outputs: [{dims: outputShape, dataType: inputs[0].dataType}], dispatchGroup: {x: dispatch[0], y: dispatch[1], z: dispatch[2]}, diff --git a/js/web/test/data/ops/fused-conv3dncdhw.jsonc b/js/web/test/data/ops/fused-conv3dncdhw.jsonc new file mode 100644 index 0000000000000..1801ca380aa09 --- /dev/null +++ b/js/web/test/data/ops/fused-conv3dncdhw.jsonc @@ -0,0 +1,112 @@ +[ + { + "name": "fused conv3d with relu, x=[1, 1, 2, 1, 2], f=[2, 1, 2, 1, 2], s=1, d=1, p=valid, relu", + "operator": "FusedConv", + "opset": { "domain": "com.microsoft", "version": 1 }, + "attributes": [ + { "name": "activation", "data": "Relu", "type": "string" }, + { "name": "kernel_shape", "data": [2, 1, 2], "type": "ints" }, + { "name": "auto_pad", "data": "VALID", "type": "string" }, + { "name": "strides", "data": [1, 1, 1], "type": "ints" }, + { "name": "dilations", "data": [1, 1, 1], "type": "ints" } + ], + "cases": [ + { + "name": "T[0]", + "inputs": [ + { + "data": [0.25, 0.5, 0.75, 1], + "dims": [1, 1, 2, 1, 2], + "type": "float32" + }, + { + "data": [-0.125, -0.25, -0.375, 0.5, 0.625, -0.75, -0.875, -1], + "dims": [2, 1, 2, 1, 2], + "type": "float32" + } + ], + "outputs": [ + { + "data": [0.0625, 0], + "dims": [1, 2, 1, 1, 1], + "type": "float32" + } + ] + } + ] + }, + { + "name": "fused conv3d with clip", + "operator": "FusedConv", + "opset": { "domain": "com.microsoft", "version": 1 }, + "attributes": [ + { "name": "activation", "data": "Clip", "type": "string" }, + { "name": "activation_params", "data": [1.0, 3.0], "type": "floats" }, + { "name": "kernel_shape", "data": [2, 1, 2], "type": "ints" }, + { "name": "auto_pad", "data": "VALID", "type": "string" }, + { "name": "strides", "data": [1, 1, 1], "type": "ints" }, + { "name": "dilations", "data": [1, 1, 1], "type": "ints" } + ], + "cases": [ + { + "name": "T[0]", + "inputs": [ + { + "data": [0.25, 0.5, 0.75, 1], + "dims": [1, 1, 2, 1, 2], + "type": "float32" + }, + { + "data": [0.125, 0.25, 0.375, 0.5, 0.625, 0.75, 0.875, 1], + "dims": [2, 1, 2, 1, 2], + "type": "float32" + } + ], + "outputs": [ + { + "data": [1, 2.1875], + "dims": [1, 2, 1, 1, 1], + "type": "float32" + } + ] + } + ] + }, + { + "name": "fused conv3d with HardSigmoid, x=[1, 1, 2, 1, 2], f=[2, 1, 2, 1, 2], s=1, d=1, p=valid, relu", + "operator": "FusedConv", + "opset": { "domain": "com.microsoft", "version": 1 }, + "attributes": [ + { "name": "activation", "data": "HardSigmoid", "type": "string" }, + { "name": "activation_params", "data": [0.1, 0.3], "type": "floats" }, + { "name": "kernel_shape", "data": [2, 1, 2], "type": "ints" }, + { "name": "auto_pad", "data": "VALID", "type": "string" }, + { "name": "strides", "data": [1, 1, 1], "type": "ints" }, + { "name": "dilations", "data": [1, 1, 1], "type": "ints" } + ], + "cases": [ + { + "name": "T[0]", + "inputs": [ + { + "data": [0.25, 0.5, 0.75, 1], + "dims": [1, 1, 2, 1, 2], + "type": "float32" + }, + { + "data": [0.125, 0.25, 0.375, 0.5, 0.625, 0.75, 0.875, 1], + "dims": [2, 1, 2, 1, 2], + "type": "float32" + } + ], + "outputs": [ + { + "data": [0.39375001192092896, 0.518750011920929], + "dims": [1, 2, 1, 1, 1], + "type": "float32" + } + ] + } + ] + } +] 
diff --git a/js/web/test/suite-test-list.jsonc b/js/web/test/suite-test-list.jsonc index 4a3a23bfe91b4..4aaf9d16b2b0e 100644 --- a/js/web/test/suite-test-list.jsonc +++ b/js/web/test/suite-test-list.jsonc @@ -1358,6 +1358,7 @@ "fast-gelu.jsonc", "floor.jsonc", "fused-conv.jsonc", + "fused-conv3dncdhw.jsonc", "gather-elements.jsonc", "gemm.jsonc", "global-average-pool.jsonc", From 94eb70d98348d83343207e113f9abaa0e7c6ea37 Mon Sep 17 00:00:00 2001 From: Wanming Lin Date: Mon, 29 Jul 2024 23:50:14 +0800 Subject: [PATCH 43/57] [WebNN EP] Add labels for all WebNN operators (#21516) In order to provide more diagnosable error messages for developers. Spec change: https://github.com/webmachinelearning/webnn/pull/742 --- .../builders/impl/activation_op_builder.cc | 13 ++++--- .../builders/impl/argmax_min_op_builder.cc | 1 + .../webnn/builders/impl/binary_op_builder.cc | 15 ++++--- .../webnn/builders/impl/cast_op_builder.cc | 5 ++- .../webnn/builders/impl/clip_op_builder.cc | 1 + .../webnn/builders/impl/concat_op_builder.cc | 5 ++- .../webnn/builders/impl/conv_op_builder.cc | 16 +++++++- .../impl/dequantizeLinear_op_builder.cc | 17 ++++++-- .../impl/dynamicQuantizeLinear_op_builder.cc | 3 +- .../webnn/builders/impl/expand_op_builder.cc | 6 ++- .../webnn/builders/impl/flatten_op_builder.cc | 4 +- .../webnn/builders/impl/gather_op_builder.cc | 1 + .../webnn/builders/impl/gemm_op_builder.cc | 39 +++++++++++++++---- .../webnn/builders/impl/logical_op_builder.cc | 12 +++--- .../webnn/builders/impl/max_min_op_builder.cc | 10 +++-- .../builders/impl/normalization_op_builder.cc | 16 ++++++-- .../webnn/builders/impl/pad_op_builder.cc | 6 ++- .../webnn/builders/impl/pool_op_builder.cc | 1 + .../builders/impl/reduction_op_builder.cc | 1 + .../webnn/builders/impl/reshape_op_builder.cc | 7 +++- .../webnn/builders/impl/resize_op_builder.cc | 1 + .../webnn/builders/impl/shape_op_builder.cc | 9 ++++- .../webnn/builders/impl/slice_op_builder.cc | 5 ++- .../webnn/builders/impl/softmax_op_builder.cc | 4 +- .../webnn/builders/impl/split_op_builder.cc | 1 + .../impl/squeeze_unsqueeze_op_builder.cc | 8 +++- .../webnn/builders/impl/ternary_op_builder.cc | 4 +- .../builders/impl/transpose_op_builder.cc | 1 + .../builders/impl/triangular_op_builder.cc | 1 + .../webnn/builders/impl/unary_op_builder.cc | 30 +++++++------- 30 files changed, 180 insertions(+), 63 deletions(-) diff --git a/onnxruntime/core/providers/webnn/builders/impl/activation_op_builder.cc b/onnxruntime/core/providers/webnn/builders/impl/activation_op_builder.cc index af0f0133b497a..626aaf5c71b74 100644 --- a/onnxruntime/core/providers/webnn/builders/impl/activation_op_builder.cc +++ b/onnxruntime/core/providers/webnn/builders/impl/activation_op_builder.cc @@ -36,6 +36,7 @@ Status ActivationOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, NodeAttrHelper helper(node); emscripten::val options = emscripten::val::object(); + options.set("label", node.Name()); if (op_type == "Elu") { options.set("alpha", helper.Get("alpha", 1.0f)); output = model_builder.GetBuilder().call("elu", input, options); @@ -46,20 +47,20 @@ Status ActivationOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, options.set("beta", helper.Get("beta", 0.5f)); output = model_builder.GetBuilder().call("hardSigmoid", input, options); } else if (op_type == "HardSwish") { - output = model_builder.GetBuilder().call("hardSwish", input); + output = model_builder.GetBuilder().call("hardSwish", input, options); } else if (op_type == "LeakyRelu") { options.set("alpha", 
helper.Get("alpha", 0.0f)); output = model_builder.GetBuilder().call("leakyRelu", input, options); } else if (op_type == "Relu") { - output = model_builder.GetBuilder().call("relu", input); + output = model_builder.GetBuilder().call("relu", input, options); } else if (op_type == "Sigmoid") { - output = model_builder.GetBuilder().call("sigmoid", input); + output = model_builder.GetBuilder().call("sigmoid", input, options); } else if (op_type == "Softplus") { - output = model_builder.GetBuilder().call("softplus", input); + output = model_builder.GetBuilder().call("softplus", input, options); } else if (op_type == "Softsign") { - output = model_builder.GetBuilder().call("softsign", input); + output = model_builder.GetBuilder().call("softsign", input, options); } else if (op_type == "Tanh") { - output = model_builder.GetBuilder().call("tanh", input); + output = model_builder.GetBuilder().call("tanh", input, options); } else { return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "ActivationOpBuilder::AddToModelBuilderImpl, unknown op: ", op_type); diff --git a/onnxruntime/core/providers/webnn/builders/impl/argmax_min_op_builder.cc b/onnxruntime/core/providers/webnn/builders/impl/argmax_min_op_builder.cc index 1ae63a644a287..05f3a742a3775 100644 --- a/onnxruntime/core/providers/webnn/builders/impl/argmax_min_op_builder.cc +++ b/onnxruntime/core/providers/webnn/builders/impl/argmax_min_op_builder.cc @@ -47,6 +47,7 @@ Status ArgMaxMinOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, options.set("keepDimensions", keep_dims == 1); // TODO(Honry): check whether int64 output data type is supported by WebNN opSupportLimits() API. options.set("outputDataType", "int64"); + options.set("label", node.Name()); emscripten::val output = emscripten::val::object(); const auto& op_type = node.OpType(); diff --git a/onnxruntime/core/providers/webnn/builders/impl/binary_op_builder.cc b/onnxruntime/core/providers/webnn/builders/impl/binary_op_builder.cc index 23e19d5943144..555de68cd60fe 100644 --- a/onnxruntime/core/providers/webnn/builders/impl/binary_op_builder.cc +++ b/onnxruntime/core/providers/webnn/builders/impl/binary_op_builder.cc @@ -35,18 +35,21 @@ Status BinaryOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, const emscripten::val input0 = model_builder.GetOperand(node.InputDefs()[0]->Name()); emscripten::val input1 = model_builder.GetOperand(node.InputDefs()[1]->Name()); emscripten::val output = emscripten::val::object(); + emscripten::val options = emscripten::val::object(); + options.set("label", node.Name()); + if (op_type == "Add") { - output = model_builder.GetBuilder().call("add", input0, input1); + output = model_builder.GetBuilder().call("add", input0, input1, options); } else if (op_type == "Sub") { - output = model_builder.GetBuilder().call("sub", input0, input1); + output = model_builder.GetBuilder().call("sub", input0, input1, options); } else if (op_type == "Mul") { - output = model_builder.GetBuilder().call("mul", input0, input1); + output = model_builder.GetBuilder().call("mul", input0, input1, options); } else if (op_type == "Div") { - output = model_builder.GetBuilder().call("div", input0, input1); + output = model_builder.GetBuilder().call("div", input0, input1, options); } else if (op_type == "Pow") { - output = model_builder.GetBuilder().call("pow", input0, input1); + output = model_builder.GetBuilder().call("pow", input0, input1, options); } else if (op_type == "PRelu") { - output = model_builder.GetBuilder().call("prelu", input0, input1); + output = 
model_builder.GetBuilder().call("prelu", input0, input1, options); } else { return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "BinaryOpBuilder::AddToModelBuilderImpl, unknown op: ", op_type); diff --git a/onnxruntime/core/providers/webnn/builders/impl/cast_op_builder.cc b/onnxruntime/core/providers/webnn/builders/impl/cast_op_builder.cc index a97d71b90de55..a08e1681a8464 100644 --- a/onnxruntime/core/providers/webnn/builders/impl/cast_op_builder.cc +++ b/onnxruntime/core/providers/webnn/builders/impl/cast_op_builder.cc @@ -69,8 +69,11 @@ Status CastOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, node.Name(), " type: ", to_type); } + emscripten::val options = emscripten::val::object(); + options.set("label", node.Name()); + emscripten::val output = - model_builder.GetBuilder().call("cast", input, emscripten::val(operand_type)); + model_builder.GetBuilder().call("cast", input, emscripten::val(operand_type), options); model_builder.AddOperand(node.OutputDefs()[0]->Name(), std::move(output)); return Status::OK(); diff --git a/onnxruntime/core/providers/webnn/builders/impl/clip_op_builder.cc b/onnxruntime/core/providers/webnn/builders/impl/clip_op_builder.cc index e6403a4cd12dc..b5c3206072d50 100644 --- a/onnxruntime/core/providers/webnn/builders/impl/clip_op_builder.cc +++ b/onnxruntime/core/providers/webnn/builders/impl/clip_op_builder.cc @@ -53,6 +53,7 @@ Status ClipOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, "GetClipMinMax failed"); options.set("minValue", minValue); options.set("maxValue", maxValue); + options.set("label", node.Name()); emscripten::val input = model_builder.GetOperand(input_name); emscripten::val output = model_builder.GetBuilder().call("clamp", input, options); diff --git a/onnxruntime/core/providers/webnn/builders/impl/concat_op_builder.cc b/onnxruntime/core/providers/webnn/builders/impl/concat_op_builder.cc index e4f98b09e03c5..dedc76b80e978 100644 --- a/onnxruntime/core/providers/webnn/builders/impl/concat_op_builder.cc +++ b/onnxruntime/core/providers/webnn/builders/impl/concat_op_builder.cc @@ -42,8 +42,11 @@ Status ConcatOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, inputs.push_back(model_builder.GetOperand(input->Name())); } + emscripten::val options = emscripten::val::object(); + options.set("label", node.Name()); + emscripten::val output = - model_builder.GetBuilder().call("concat", emscripten::val::array(inputs), axis); + model_builder.GetBuilder().call("concat", emscripten::val::array(inputs), axis, options); model_builder.AddOperand(node.OutputDefs()[0]->Name(), std::move(output)); return Status::OK(); diff --git a/onnxruntime/core/providers/webnn/builders/impl/conv_op_builder.cc b/onnxruntime/core/providers/webnn/builders/impl/conv_op_builder.cc index 320aaa03930fd..4f3f7459a7b5b 100644 --- a/onnxruntime/core/providers/webnn/builders/impl/conv_op_builder.cc +++ b/onnxruntime/core/providers/webnn/builders/impl/conv_op_builder.cc @@ -242,6 +242,7 @@ Status ConvOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, const N } emscripten::val options = emscripten::val::object(); + options.set("label", node.Name()); ORT_RETURN_IF_ERROR(SetConvBaseOptions( model_builder, node, options, input_shape, weight_shape, strides, dilations, pads, is_nhwc, is_conv1d, logger)); bool depthwise = false; @@ -276,7 +277,12 @@ Status ConvOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, const N if (!is_nhwc || !is_constant_weight) { // The weight_shape has been appended 1's, reshape weight operand. 
std::vector new_shape = GetVecUint32FromVecInt64(weight_shape); - filter = model_builder.GetBuilder().call("reshape", filter, emscripten::val::array(new_shape)); + emscripten::val reshape_options = emscripten::val::object(); + reshape_options.set("label", node.Name() + "_reshape_filter"); + filter = model_builder.GetBuilder().call("reshape", + filter, + emscripten::val::array(new_shape), + reshape_options); } } @@ -293,6 +299,7 @@ Status ConvOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, const N perm = {0, 2, 3, 1}; // L_0231 } transpose_options.set("permutation", emscripten::val::array(perm)); + transpose_options.set("label", node.Name() + "_transpose_filter"); filter = model_builder.GetBuilder().call("transpose", filter, transpose_options); } @@ -323,7 +330,12 @@ Status ConvOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, const N std::vector output_shape; ORT_RETURN_IF_NOT(GetShape(*output_defs[0], output_shape, logger), "Cannot get output shape"); std::vector new_shape = GetVecUint32FromVecInt64(output_shape); - output = model_builder.GetBuilder().call("reshape", output, emscripten::val::array(new_shape)); + emscripten::val reshape_options = emscripten::val::object(); + reshape_options.set("label", node.Name() + "_reshape_output"); + output = model_builder.GetBuilder().call("reshape", + output, + emscripten::val::array(new_shape), + reshape_options); } model_builder.AddOperand(node.OutputDefs()[0]->Name(), std::move(output)); diff --git a/onnxruntime/core/providers/webnn/builders/impl/dequantizeLinear_op_builder.cc b/onnxruntime/core/providers/webnn/builders/impl/dequantizeLinear_op_builder.cc index 66d502a4e6727..93a12a696cce1 100644 --- a/onnxruntime/core/providers/webnn/builders/impl/dequantizeLinear_op_builder.cc +++ b/onnxruntime/core/providers/webnn/builders/impl/dequantizeLinear_op_builder.cc @@ -50,11 +50,22 @@ Status DequantizeLinearOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_buil std::vector target_shape{static_cast(input_shape[axis])}; target_shape.insert(target_shape.begin(), axis, 1); target_shape.insert(target_shape.end(), input_shape.size() - axis - 1, 1); - scale = model_builder.GetBuilder().call("reshape", scale, emscripten::val::array(target_shape)); + emscripten::val reshape_scale_options = emscripten::val::object(); + reshape_scale_options.set("label", node.Name() + "_reshape_scale"); + scale = model_builder.GetBuilder().call("reshape", + scale, + emscripten::val::array(target_shape), + reshape_scale_options); + emscripten::val reshape_zero_point_options = emscripten::val::object(); + reshape_zero_point_options.set("label", node.Name() + "_reshape_zero_point"); zero_point = model_builder.GetBuilder().call("reshape", - zero_point, emscripten::val::array(target_shape)); + zero_point, + emscripten::val::array(target_shape), + reshape_zero_point_options); } - output = model_builder.GetBuilder().call("dequantizeLinear", input, scale, zero_point); + emscripten::val options = emscripten::val::object(); + options.set("label", node.Name()); + output = model_builder.GetBuilder().call("dequantizeLinear", input, scale, zero_point, options); model_builder.AddOperand(node.OutputDefs()[0]->Name(), std::move(output)); diff --git a/onnxruntime/core/providers/webnn/builders/impl/dynamicQuantizeLinear_op_builder.cc b/onnxruntime/core/providers/webnn/builders/impl/dynamicQuantizeLinear_op_builder.cc index 3b5f64584b828..55746bb1f61f0 100644 --- a/onnxruntime/core/providers/webnn/builders/impl/dynamicQuantizeLinear_op_builder.cc +++ 
b/onnxruntime/core/providers/webnn/builders/impl/dynamicQuantizeLinear_op_builder.cc @@ -31,8 +31,9 @@ Status DynamicQuantizaLinearOpBuilder::AddToModelBuilderImpl(ModelBuilder& model std::vector input_shape; ORT_RETURN_IF_NOT(GetShape(*input_defs[0], input_shape, logger), "Cannot get shape"); emscripten::val options = emscripten::val::object(); + options.set("label", node.Name()); - output_array = model_builder.GetBuilder().call("dynamicQuantizeLinear", input); + output_array = model_builder.GetBuilder().call("dynamicQuantizeLinear", input, options); for (size_t i = 0, count = output_array["length"].as(); i < count; i++) { model_builder.AddOperand(node.OutputDefs()[i]->Name(), std::move(output_array[i])); diff --git a/onnxruntime/core/providers/webnn/builders/impl/expand_op_builder.cc b/onnxruntime/core/providers/webnn/builders/impl/expand_op_builder.cc index 9c75c00fa9273..c8cea833983b1 100644 --- a/onnxruntime/core/providers/webnn/builders/impl/expand_op_builder.cc +++ b/onnxruntime/core/providers/webnn/builders/impl/expand_op_builder.cc @@ -53,10 +53,14 @@ Status ExpandOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, std::vector output_shape; ORT_RETURN_IF_NOT(GetBidirectionalBroadcastShape(input_shape, new_shape, output_shape), "Cannot get output shape."); + emscripten::val options = emscripten::val::object(); + options.set("label", node.Name()); + emscripten::val output = model_builder.GetBuilder().call("expand", input, - emscripten::val::array(GetVecUint32FromVecInt64(output_shape))); + emscripten::val::array(GetVecUint32FromVecInt64(output_shape)), + options); model_builder.AddOperand(node.OutputDefs()[0]->Name(), std::move(output)); return Status::OK(); } diff --git a/onnxruntime/core/providers/webnn/builders/impl/flatten_op_builder.cc b/onnxruntime/core/providers/webnn/builders/impl/flatten_op_builder.cc index 31b1bd92a9503..d0ece026a7048 100644 --- a/onnxruntime/core/providers/webnn/builders/impl/flatten_op_builder.cc +++ b/onnxruntime/core/providers/webnn/builders/impl/flatten_op_builder.cc @@ -52,8 +52,10 @@ Status FlattenOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, SafeInt(num_post_axis_elements)}; emscripten::val inputs = model_builder.GetOperand(input_defs[0]->Name()); + emscripten::val options = emscripten::val::object(); + options.set("label", node.Name()); emscripten::val output = model_builder.GetBuilder().call( - "reshape", inputs, emscripten::val::array(new_shape)); + "reshape", inputs, emscripten::val::array(new_shape), options); model_builder.AddOperand(node.OutputDefs()[0]->Name(), std::move(output)); return Status::OK(); diff --git a/onnxruntime/core/providers/webnn/builders/impl/gather_op_builder.cc b/onnxruntime/core/providers/webnn/builders/impl/gather_op_builder.cc index 014a08616c44f..23233539d34c7 100644 --- a/onnxruntime/core/providers/webnn/builders/impl/gather_op_builder.cc +++ b/onnxruntime/core/providers/webnn/builders/impl/gather_op_builder.cc @@ -42,6 +42,7 @@ Status GatherOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, emscripten::val indices = model_builder.GetOperand(input_defs[1]->Name()); emscripten::val options = emscripten::val::object(); options.set("axis", axis); + options.set("label", node.Name()); emscripten::val output = model_builder.GetBuilder().call("gather", input, indices, options); model_builder.AddOperand(node.OutputDefs()[0]->Name(), std::move(output)); diff --git a/onnxruntime/core/providers/webnn/builders/impl/gemm_op_builder.cc 
b/onnxruntime/core/providers/webnn/builders/impl/gemm_op_builder.cc index 53f885019ab2f..bd452b118fe3e 100644 --- a/onnxruntime/core/providers/webnn/builders/impl/gemm_op_builder.cc +++ b/onnxruntime/core/providers/webnn/builders/impl/gemm_op_builder.cc @@ -39,6 +39,8 @@ Status GemmOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, const N emscripten::val a = model_builder.GetOperand(node.InputDefs()[a_idx]->Name()); emscripten::val b = model_builder.GetOperand(node.InputDefs()[b_idx]->Name()); emscripten::val output = emscripten::val::object(); + emscripten::val options = emscripten::val::object(); + options.set("label", node.Name()); if (op_type == "MatMul") { std::vector a_shape; if (!GetShape(*input_defs[a_idx], a_shape, logger)) { @@ -53,23 +55,34 @@ Status GemmOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, const N if (a_shape.size() == 1) { extended_a_shape = true; a_shape.insert(a_shape.begin(), 1); + emscripten::val reshape_a_options = emscripten::val::object(); + reshape_a_options.set("label", node.Name() + "_reshape_a"); a = model_builder.GetBuilder().call("reshape", a, - emscripten::val::array(GetVecUint32FromVecInt64(a_shape))); + emscripten::val::array(GetVecUint32FromVecInt64(a_shape)), + reshape_a_options); } // If the second argument is 1-D, it is promoted to a matrix by appending a 1 to its dimensions. bool extended_b_shape = false; if (b_shape.size() == 1) { extended_b_shape = true; b_shape.push_back(1); + emscripten::val reshape_b_options = emscripten::val::object(); + reshape_b_options.set("label", node.Name() + "_reshape_b"); b = model_builder.GetBuilder().call("reshape", b, - emscripten::val::array(GetVecUint32FromVecInt64(b_shape))); + emscripten::val::array(GetVecUint32FromVecInt64(b_shape)), + reshape_b_options); } - output = model_builder.GetBuilder().call("matmul", a, b); + output = model_builder.GetBuilder().call("matmul", a, b, options); + emscripten::val reshape_output_options = emscripten::val::object(); + reshape_output_options.set("label", node.Name() + "_reshape_output"); // If the inputs are both 1D, reduce the output to a scalar. if (extended_a_shape && extended_b_shape) { - output = model_builder.GetBuilder().call("reshape", output, emscripten::val::array()); + output = model_builder.GetBuilder().call("reshape", + output, + emscripten::val::array(), + reshape_output_options); } // After matrix multiplication the prepended 1 is removed. else if (extended_a_shape) { @@ -78,7 +91,10 @@ Status GemmOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, const N new_shape.push_back(narrow(b_shape[i])); } new_shape.push_back(narrow(b_shape.back())); - output = model_builder.GetBuilder().call("reshape", output, emscripten::val::array(new_shape)); + output = model_builder.GetBuilder().call("reshape", + output, + emscripten::val::array(new_shape), + reshape_output_options); } // After matrix multiplication the appended 1 is removed. 
else if (extended_b_shape) { @@ -86,7 +102,10 @@ Status GemmOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, const N for (size_t i = 0; i < a_shape.size() - 1; i++) { new_shape.push_back(narrow(a_shape[i])); } - output = model_builder.GetBuilder().call("reshape", output, emscripten::val::array(new_shape)); + output = model_builder.GetBuilder().call("reshape", + output, + emscripten::val::array(new_shape), + reshape_output_options); } } else if (op_type == "MatMulInteger") { emscripten::val a_zero_point = emscripten::val::null(); @@ -101,9 +120,13 @@ Status GemmOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, const N } else { b_zero_point = model_builder.GetZeroConstant("uint8"); } - output = model_builder.GetBuilder().call("matmulInteger", a, a_zero_point, b, b_zero_point); + output = model_builder.GetBuilder().call("matmulInteger", + a, + a_zero_point, + b, + b_zero_point, + options); } else { // Gemm - emscripten::val options = emscripten::val::object(); NodeAttrHelper helper(node); const auto transA = helper.Get("transA", 0); options.set("aTranspose", emscripten::val(transA == 1)); diff --git a/onnxruntime/core/providers/webnn/builders/impl/logical_op_builder.cc b/onnxruntime/core/providers/webnn/builders/impl/logical_op_builder.cc index e56e8f6a3eb6d..23f3a938fee5e 100644 --- a/onnxruntime/core/providers/webnn/builders/impl/logical_op_builder.cc +++ b/onnxruntime/core/providers/webnn/builders/impl/logical_op_builder.cc @@ -33,16 +33,18 @@ Status LogicalOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, cons emscripten::val input0 = model_builder.GetOperand(node.InputDefs()[0]->Name()); emscripten::val input1 = model_builder.GetOperand(node.InputDefs()[1]->Name()); emscripten::val output = emscripten::val::object(); + emscripten::val options = emscripten::val::object(); + options.set("label", node.Name()); if (op_type == "Equal") { - output = model_builder.GetBuilder().call("equal", input0, input1); + output = model_builder.GetBuilder().call("equal", input0, input1, options); } else if (op_type == "Greater") { - output = model_builder.GetBuilder().call("greater", input0, input1); + output = model_builder.GetBuilder().call("greater", input0, input1, options); } else if (op_type == "GreaterOrEqual") { - output = model_builder.GetBuilder().call("greaterOrEqual", input0, input1); + output = model_builder.GetBuilder().call("greaterOrEqual", input0, input1, options); } else if (op_type == "Less") { - output = model_builder.GetBuilder().call("lesser", input0, input1); + output = model_builder.GetBuilder().call("lesser", input0, input1, options); } else if (op_type == "LessOrEqual") { - output = model_builder.GetBuilder().call("lesserOrEqual", input0, input1); + output = model_builder.GetBuilder().call("lesserOrEqual", input0, input1, options); } else { return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "LogicalOpBuilder::AddToModelBuilderImpl, unknown op: ", op_type); diff --git a/onnxruntime/core/providers/webnn/builders/impl/max_min_op_builder.cc b/onnxruntime/core/providers/webnn/builders/impl/max_min_op_builder.cc index 0168f59273545..1080fd0a3f943 100644 --- a/onnxruntime/core/providers/webnn/builders/impl/max_min_op_builder.cc +++ b/onnxruntime/core/providers/webnn/builders/impl/max_min_op_builder.cc @@ -43,22 +43,26 @@ Status MaxMinOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, ORT_RETURN_IF_NOT(op_type == "Max" || op_type == "Min", "MaxMinOpBuilder, unknown op: ", op_type); emscripten::val output = emscripten::val::object(); + 
emscripten::val options = emscripten::val::object(); + options.set("label", node.Name()); if (input_count == 1) { // For 1 input, just concat the single input as workaround. // TODO: use identity instead once it's available in WebNN. emscripten::val inputs = emscripten::val::array(); inputs.call("push", input0); - output = model_builder.GetBuilder().call("concat", inputs, 0); + output = model_builder.GetBuilder().call("concat", inputs, 0, options); } else { std::string webnn_op_name = op_type == "Max" ? "max" : "min"; emscripten::val input1 = model_builder.GetOperand(input_defs[1]->Name()); - output = model_builder.GetBuilder().call(webnn_op_name.c_str(), input0, input1); + output = model_builder.GetBuilder().call(webnn_op_name.c_str(), input0, input1, options); for (size_t input_index = 2; input_index < input_count; ++input_index) { emscripten::val next_input = model_builder.GetOperand(input_defs[input_index]->Name()); - output = model_builder.GetBuilder().call(webnn_op_name.c_str(), output, next_input); + emscripten::val next_options = emscripten::val::object(); + next_options.set("label", node.Name() + "_" + input_defs[input_index]->Name()); + output = model_builder.GetBuilder().call(webnn_op_name.c_str(), output, next_input, next_options); } } diff --git a/onnxruntime/core/providers/webnn/builders/impl/normalization_op_builder.cc b/onnxruntime/core/providers/webnn/builders/impl/normalization_op_builder.cc index a2aa0df5586e3..4d068baf35e72 100644 --- a/onnxruntime/core/providers/webnn/builders/impl/normalization_op_builder.cc +++ b/onnxruntime/core/providers/webnn/builders/impl/normalization_op_builder.cc @@ -42,6 +42,7 @@ Status NormalizationOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder const auto rank = input_shape.size(); emscripten::val options = emscripten::val::object(); + options.set("label", node.Name()); std::vector scale_shape; ORT_RETURN_IF_NOT(GetShape(*input_defs[1], scale_shape, logger), "Cannot get scale shape"); @@ -116,7 +117,12 @@ Status NormalizationOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder new_shape.erase(insertion_point, insertion_point + excess_rank); *insertion_point = sum; } - input = model_builder.GetBuilder().call("reshape", input, emscripten::val::array(new_shape)); + emscripten::val reshape_input_options = emscripten::val::object(); + reshape_input_options.set("label", node.Name() + "_reshape_input"); + input = model_builder.GetBuilder().call("reshape", + input, + emscripten::val::array(new_shape), + reshape_input_options); } if (model_builder.GetPreferredLayout() == DataLayout::NHWC) { @@ -126,8 +132,12 @@ Status NormalizationOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder // Reshape back to the original output shape for 3D input. 
if (input_shape.size() != 4) { std::vector output_shape = GetVecUint32FromVecInt64(input_shape); - output = model_builder.GetBuilder().call( - "reshape", output, emscripten::val::array(output_shape)); + emscripten::val reshape_output_options = emscripten::val::object(); + reshape_output_options.set("label", node.Name() + "reshape_output"); + output = model_builder.GetBuilder().call("reshape", + output, + emscripten::val::array(output_shape), + reshape_output_options); } } else { return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "Unsupported normalization op: ", op_type); diff --git a/onnxruntime/core/providers/webnn/builders/impl/pad_op_builder.cc b/onnxruntime/core/providers/webnn/builders/impl/pad_op_builder.cc index bc90821ba4ed8..071155a2fb372 100644 --- a/onnxruntime/core/providers/webnn/builders/impl/pad_op_builder.cc +++ b/onnxruntime/core/providers/webnn/builders/impl/pad_op_builder.cc @@ -73,6 +73,7 @@ Status PadOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, ORT_RETURN_IF_NOT(GetShape(*input_defs[0], input_shape, logger), "Cannot get input shape"); emscripten::val options = emscripten::val::object(); + options.set("label", node.Name()); NodeAttrHelper helper(node); const auto pad_mode = helper.Get("mode", std::string("constant")); @@ -145,9 +146,12 @@ Status PadOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, starts.push_back(start_padding[i] >= 0 ? SafeInt(0) : SafeInt(-start_padding[i])); sizes.push_back(SafeInt(input_shape[i] + start_padding[i] + end_padding[i])); } + emscripten::val slice_options = emscripten::val::object(); + slice_options.set("label", node.Name() + "_slice_output"); output = model_builder.GetBuilder().call("slice", output, emscripten::val::array(starts), - emscripten::val::array(sizes)); + emscripten::val::array(sizes), + slice_options); } model_builder.AddOperand(node.OutputDefs()[0]->Name(), std::move(output)); return Status::OK(); diff --git a/onnxruntime/core/providers/webnn/builders/impl/pool_op_builder.cc b/onnxruntime/core/providers/webnn/builders/impl/pool_op_builder.cc index 8b3eecf35fcc8..0af62dacedbd5 100644 --- a/onnxruntime/core/providers/webnn/builders/impl/pool_op_builder.cc +++ b/onnxruntime/core/providers/webnn/builders/impl/pool_op_builder.cc @@ -59,6 +59,7 @@ Status PoolOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, } emscripten::val options = emscripten::val::object(); + options.set("label", node.Name()); NodeAttrHelper helper(node); const auto kernel_shape = helper.Get("kernel_shape", std::vector{0, 0}); diff --git a/onnxruntime/core/providers/webnn/builders/impl/reduction_op_builder.cc b/onnxruntime/core/providers/webnn/builders/impl/reduction_op_builder.cc index 461050849385a..3e6d4d9820e9a 100644 --- a/onnxruntime/core/providers/webnn/builders/impl/reduction_op_builder.cc +++ b/onnxruntime/core/providers/webnn/builders/impl/reduction_op_builder.cc @@ -57,6 +57,7 @@ Status ReductionOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, NodeAttrHelper helper(node); const auto keep_dims = helper.Get("keepdims", 1); emscripten::val options = emscripten::val::object(); + options.set("label", node.Name()); options.set("keepDimensions", keep_dims == 1); std::vector axes_data; diff --git a/onnxruntime/core/providers/webnn/builders/impl/reshape_op_builder.cc b/onnxruntime/core/providers/webnn/builders/impl/reshape_op_builder.cc index b5005269b96a7..a7911683f0355 100644 --- a/onnxruntime/core/providers/webnn/builders/impl/reshape_op_builder.cc +++ 
b/onnxruntime/core/providers/webnn/builders/impl/reshape_op_builder.cc @@ -58,8 +58,13 @@ Status ReshapeOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, std::transform(target_shape.cbegin(), target_shape.cend(), std::back_inserter(new_shape), [](int64_t dim) -> uint32_t { return SafeInt(dim); }); + + emscripten::val options = emscripten::val::object(); + options.set("label", node.Name()); emscripten::val output = model_builder.GetBuilder().call("reshape", - input, emscripten::val::array(new_shape)); + input, + emscripten::val::array(new_shape), + options); model_builder.AddOperand(node.OutputDefs()[0]->Name(), std::move(output)); return Status::OK(); } diff --git a/onnxruntime/core/providers/webnn/builders/impl/resize_op_builder.cc b/onnxruntime/core/providers/webnn/builders/impl/resize_op_builder.cc index c4ca980fec715..2218c858951d3 100644 --- a/onnxruntime/core/providers/webnn/builders/impl/resize_op_builder.cc +++ b/onnxruntime/core/providers/webnn/builders/impl/resize_op_builder.cc @@ -106,6 +106,7 @@ Status ResizeOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, const Node& node, const logging::Logger& logger) const { emscripten::val options = emscripten::val::object(); + options.set("label", node.Name()); NodeAttrHelper helper(node); const auto mode = helper.Get("mode", "nearest"); if (mode == "linear") { diff --git a/onnxruntime/core/providers/webnn/builders/impl/shape_op_builder.cc b/onnxruntime/core/providers/webnn/builders/impl/shape_op_builder.cc index 1552023d3f876..0eb7dafdffe4d 100644 --- a/onnxruntime/core/providers/webnn/builders/impl/shape_op_builder.cc +++ b/onnxruntime/core/providers/webnn/builders/impl/shape_op_builder.cc @@ -55,8 +55,15 @@ Status ShapeOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, emscripten::val sizes = emscripten::val::array(); sizes.call("push", slice_length); + emscripten::val options = emscripten::val::object(); + options.set("label", node.Name()); + // Since WebNN doesn't support Shape op, we use constant + slice ops as workaround. 
- emscripten::val output = model_builder.GetBuilder().call("slice", shape_constant, starts, sizes); + emscripten::val output = model_builder.GetBuilder().call("slice", + shape_constant, + starts, + sizes, + options); model_builder.AddOperand(node.OutputDefs()[0]->Name(), std::move(output)); return Status::OK(); diff --git a/onnxruntime/core/providers/webnn/builders/impl/slice_op_builder.cc b/onnxruntime/core/providers/webnn/builders/impl/slice_op_builder.cc index fb452aec1c929..bef13841c646c 100644 --- a/onnxruntime/core/providers/webnn/builders/impl/slice_op_builder.cc +++ b/onnxruntime/core/providers/webnn/builders/impl/slice_op_builder.cc @@ -97,9 +97,12 @@ Status SliceOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, sizes.begin(), [](int64_t i, int64_t j) { return SafeInt(i - j); }); + emscripten::val options = emscripten::val::object(); + options.set("label", node.Name()); emscripten::val output = model_builder.GetBuilder().call("slice", inputs, emscripten::val::array(starts), - emscripten::val::array(sizes)); + emscripten::val::array(sizes), + options); model_builder.AddOperand(node.OutputDefs()[0]->Name(), std::move(output)); return Status::OK(); diff --git a/onnxruntime/core/providers/webnn/builders/impl/softmax_op_builder.cc b/onnxruntime/core/providers/webnn/builders/impl/softmax_op_builder.cc index 95c1dbd518061..798cfabae65db 100644 --- a/onnxruntime/core/providers/webnn/builders/impl/softmax_op_builder.cc +++ b/onnxruntime/core/providers/webnn/builders/impl/softmax_op_builder.cc @@ -42,7 +42,9 @@ Status SoftmaxOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, int32_t axis = helper.Get("axis", default_axis); axis = static_cast(HandleNegativeAxis(axis, input_size)); - emscripten::val output = model_builder.GetBuilder().call("softmax", input, axis); + emscripten::val options = emscripten::val::object(); + options.set("label", node.Name()); + emscripten::val output = model_builder.GetBuilder().call("softmax", input, axis, options); model_builder.AddOperand(node.OutputDefs()[0]->Name(), std::move(output)); return Status::OK(); } diff --git a/onnxruntime/core/providers/webnn/builders/impl/split_op_builder.cc b/onnxruntime/core/providers/webnn/builders/impl/split_op_builder.cc index ea3b8ef384ddc..4c59b694d690a 100644 --- a/onnxruntime/core/providers/webnn/builders/impl/split_op_builder.cc +++ b/onnxruntime/core/providers/webnn/builders/impl/split_op_builder.cc @@ -49,6 +49,7 @@ Status SplitOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, ORT_RETURN_IF_NOT(GetShape(*input_defs[0], input_shape, logger), "Cannot get shape"); const size_t rank = input_shape.size(); emscripten::val options = emscripten::val::object(); + options.set("label", node.Name()); NodeAttrHelper helper(node); int32_t axis = helper.Get("axis", 0); diff --git a/onnxruntime/core/providers/webnn/builders/impl/squeeze_unsqueeze_op_builder.cc b/onnxruntime/core/providers/webnn/builders/impl/squeeze_unsqueeze_op_builder.cc index 8e6feb62fa8c4..5eff96873b8c4 100644 --- a/onnxruntime/core/providers/webnn/builders/impl/squeeze_unsqueeze_op_builder.cc +++ b/onnxruntime/core/providers/webnn/builders/impl/squeeze_unsqueeze_op_builder.cc @@ -54,7 +54,6 @@ Status SqueezeUnsqueezeOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_buil ORT_RETURN_IF_NOT(GetShape(*input_defs[0], input_shape, logger), "Cannot get input shape"); const auto input_rank = input_shape.size(); - emscripten::val options = emscripten::val::object(); std::vector axes_data; auto rank = input_rank; @@ -111,7 +110,12 @@ 
Status SqueezeUnsqueezeOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_buil "SqueezeUnsqueezeOpBuilder::AddToModelBuilderImpl, unknown op: ", op_type); } - output = model_builder.GetBuilder().call("reshape", input, emscripten::val::array(new_shape)); + emscripten::val options = emscripten::val::object(); + options.set("label", node.Name()); + output = model_builder.GetBuilder().call("reshape", + input, + emscripten::val::array(new_shape), + options); model_builder.AddOperand(node.OutputDefs()[0]->Name(), std::move(output)); return Status::OK(); } diff --git a/onnxruntime/core/providers/webnn/builders/impl/ternary_op_builder.cc b/onnxruntime/core/providers/webnn/builders/impl/ternary_op_builder.cc index 841e2d18244d5..2ed8330bf25be 100644 --- a/onnxruntime/core/providers/webnn/builders/impl/ternary_op_builder.cc +++ b/onnxruntime/core/providers/webnn/builders/impl/ternary_op_builder.cc @@ -32,9 +32,11 @@ Status TernaryOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, cons emscripten::val input0 = model_builder.GetOperand(node.InputDefs()[0]->Name()); emscripten::val input1 = model_builder.GetOperand(node.InputDefs()[1]->Name()); emscripten::val input2 = model_builder.GetOperand(node.InputDefs()[2]->Name()); + emscripten::val options = emscripten::val::object(); + options.set("label", node.Name()); emscripten::val output = emscripten::val::object(); if (op_type == "Where") { - output = model_builder.GetBuilder().call("where", input0, input1, input2); + output = model_builder.GetBuilder().call("where", input0, input1, input2, options); } else { return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "TernaryOpBuilder::AddToModelBuilderImpl, unknown op: ", op_type); diff --git a/onnxruntime/core/providers/webnn/builders/impl/transpose_op_builder.cc b/onnxruntime/core/providers/webnn/builders/impl/transpose_op_builder.cc index 3921b1da188c3..03c88ad9db88a 100644 --- a/onnxruntime/core/providers/webnn/builders/impl/transpose_op_builder.cc +++ b/onnxruntime/core/providers/webnn/builders/impl/transpose_op_builder.cc @@ -42,6 +42,7 @@ Status TransposeOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, emscripten::val input = model_builder.GetOperand(input_defs[0]->Name()); emscripten::val options = emscripten::val::object(); + options.set("label", node.Name()); std::vector permutation = GetVecUint32FromVecInt64(perm); options.set("permutation", emscripten::val::array(permutation)); emscripten::val output = model_builder.GetBuilder().call("transpose", input, options); diff --git a/onnxruntime/core/providers/webnn/builders/impl/triangular_op_builder.cc b/onnxruntime/core/providers/webnn/builders/impl/triangular_op_builder.cc index e4b7021d49b30..0c818533918a4 100644 --- a/onnxruntime/core/providers/webnn/builders/impl/triangular_op_builder.cc +++ b/onnxruntime/core/providers/webnn/builders/impl/triangular_op_builder.cc @@ -46,6 +46,7 @@ Status TriangularOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, emscripten::val output = emscripten::val::object(); NodeAttrHelper helper(node); emscripten::val options = emscripten::val::object(); + options.set("label", node.Name()); const bool upper = helper.Get("upper", 1); options.set("upper", upper); diff --git a/onnxruntime/core/providers/webnn/builders/impl/unary_op_builder.cc b/onnxruntime/core/providers/webnn/builders/impl/unary_op_builder.cc index e0016de8e69b7..061404c8a9ce0 100644 --- a/onnxruntime/core/providers/webnn/builders/impl/unary_op_builder.cc +++ 
b/onnxruntime/core/providers/webnn/builders/impl/unary_op_builder.cc @@ -30,35 +30,37 @@ Status UnaryOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, const emscripten::val input = model_builder.GetOperand(node.InputDefs()[0]->Name()); emscripten::val output = emscripten::val::object(); + emscripten::val options = emscripten::val::object(); + options.set("label", node.Name()); if (op_type == "Abs") { - output = model_builder.GetBuilder().call("abs", input); + output = model_builder.GetBuilder().call("abs", input, options); } else if (op_type == "Ceil") { - output = model_builder.GetBuilder().call("ceil", input); + output = model_builder.GetBuilder().call("ceil", input, options); } else if (op_type == "Cos") { - output = model_builder.GetBuilder().call("cos", input); + output = model_builder.GetBuilder().call("cos", input, options); } else if (op_type == "Erf") { - output = model_builder.GetBuilder().call("erf", input); + output = model_builder.GetBuilder().call("erf", input, options); } else if (op_type == "Exp") { - output = model_builder.GetBuilder().call("exp", input); + output = model_builder.GetBuilder().call("exp", input, options); } else if (op_type == "Floor") { - output = model_builder.GetBuilder().call("floor", input); + output = model_builder.GetBuilder().call("floor", input, options); } else if (op_type == "Identity") { - output = model_builder.GetBuilder().call("identity", input); + output = model_builder.GetBuilder().call("identity", input, options); } else if (op_type == "Log") { - output = model_builder.GetBuilder().call("log", input); + output = model_builder.GetBuilder().call("log", input, options); } else if (op_type == "Neg") { - output = model_builder.GetBuilder().call("neg", input); + output = model_builder.GetBuilder().call("neg", input, options); } else if (op_type == "Not") { - output = model_builder.GetBuilder().call("logicalNot", input); + output = model_builder.GetBuilder().call("logicalNot", input, options); } else if (op_type == "Reciprocal") { - output = model_builder.GetBuilder().call("reciprocal", input); + output = model_builder.GetBuilder().call("reciprocal", input, options); } else if (op_type == "Sin") { - output = model_builder.GetBuilder().call("sin", input); + output = model_builder.GetBuilder().call("sin", input, options); } else if (op_type == "Sqrt") { - output = model_builder.GetBuilder().call("sqrt", input); + output = model_builder.GetBuilder().call("sqrt", input, options); } else if (op_type == "Tan") { - output = model_builder.GetBuilder().call("tan", input); + output = model_builder.GetBuilder().call("tan", input, options); } else { return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "UnaryOpBuilder::AddToModelBuilderImpl, unknown op: ", op_type); From d8888136e3cdf29fa63d3b0a08a58683a7c9f0a0 Mon Sep 17 00:00:00 2001 From: mingyueliuh <131847423+mingyueliuh@users.noreply.github.com> Date: Mon, 29 Jul 2024 12:45:52 -0400 Subject: [PATCH 44/57] Add support tensor element type for register custom op shape infer function (#21387) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ### Description Functionality extension for the SetOutputShape method in custom op shape inference. ### Motivation and Context - **SetOutputShape** Interface enhancement Actually, the shape infer function need set the tensor type and shape ,Add a parameter **type** to allow users to specify the tensor type, and set **ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT** as default value to ensure compatibility. 
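For illustration, a minimal sketch of how the extended signature can be used from a custom op's shape-inference callback. The op and callback wiring here are hypothetical; only SetOutputShape and its new `type` parameter come from this change.

```cpp
// Hypothetical shape-inference callback for a custom op whose output keeps the
// input shape but is int64 typed. Without the third argument the output element
// type would default to ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT, as before.
Ort::Status InferMyCustomOpShape(Ort::ShapeInferContext& ctx) {
  const auto& input_shape = ctx.GetInputShape(0);
  return ctx.SetOutputShape(0, input_shape, ONNX_TENSOR_ELEMENT_DATA_TYPE_INT64);
}
```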
Co-authored-by: mingyue --- include/onnxruntime/core/session/onnxruntime_cxx_api.h | 2 +- include/onnxruntime/core/session/onnxruntime_cxx_inline.h | 3 ++- onnxruntime/core/session/custom_ops.cc | 1 + 3 files changed, 4 insertions(+), 2 deletions(-) diff --git a/include/onnxruntime/core/session/onnxruntime_cxx_api.h b/include/onnxruntime/core/session/onnxruntime_cxx_api.h index 5d974e1ff5185..29a229f427163 100644 --- a/include/onnxruntime/core/session/onnxruntime_cxx_api.h +++ b/include/onnxruntime/core/session/onnxruntime_cxx_api.h @@ -2216,7 +2216,7 @@ struct ShapeInferContext { size_t GetInputCount() const { return input_shapes_.size(); } - Status SetOutputShape(size_t indice, const Shape& shape); + Status SetOutputShape(size_t indice, const Shape& shape, ONNXTensorElementDataType type = ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT); int64_t GetAttrInt(const char* attr_name); diff --git a/include/onnxruntime/core/session/onnxruntime_cxx_inline.h b/include/onnxruntime/core/session/onnxruntime_cxx_inline.h index aaef111b9f15b..9b9dd81a749c0 100644 --- a/include/onnxruntime/core/session/onnxruntime_cxx_inline.h +++ b/include/onnxruntime/core/session/onnxruntime_cxx_inline.h @@ -1998,9 +1998,10 @@ inline ShapeInferContext::ShapeInferContext(const OrtApi* ort_api, } } -inline Status ShapeInferContext::SetOutputShape(size_t indice, const Shape& shape) { +inline Status ShapeInferContext::SetOutputShape(size_t indice, const Shape& shape, ONNXTensorElementDataType type) { OrtTensorTypeAndShapeInfo* info = {}; ORT_CXX_RETURN_ON_API_FAIL(ort_api_->CreateTensorTypeAndShapeInfo(&info)); + ORT_CXX_RETURN_ON_API_FAIL(ort_api_->SetTensorElementType(info, type)); using InfoPtr = std::unique_ptr>; diff --git a/onnxruntime/core/session/custom_ops.cc b/onnxruntime/core/session/custom_ops.cc index 4c782f647371e..33d2a0244b453 100644 --- a/onnxruntime/core/session/custom_ops.cc +++ b/onnxruntime/core/session/custom_ops.cc @@ -105,6 +105,7 @@ struct OrtShapeInferContext { } } ONNX_NAMESPACE::updateOutputShape(ctx_, index, shape_proto); + ONNX_NAMESPACE::updateOutputElemType(ctx_, index, info->type); return onnxruntime::Status::OK(); } From 05cef469e81e3695667f122beecf97600094d09b Mon Sep 17 00:00:00 2001 From: Yi Zhang Date: Tue, 30 Jul 2024 00:59:46 +0800 Subject: [PATCH 45/57] Move on-device training packages publish step (#21539) ### Description Since the onedevice training cpu packaging has been a separated pipeline, it's nuget package publishing step must be moved as well. ### Motivation and Context Fixes the exception in Nuget Publishing Packaging Pipeline caused by #21485 --- .../c-api-training-packaging-pipelines.yml | 27 +++++++++++++++++-- .../github/azure-pipelines/publish-nuget.yml | 7 +---- 2 files changed, 26 insertions(+), 8 deletions(-) diff --git a/tools/ci_build/github/azure-pipelines/c-api-training-packaging-pipelines.yml b/tools/ci_build/github/azure-pipelines/c-api-training-packaging-pipelines.yml index aecece05a0e58..22ee7de8a5de0 100644 --- a/tools/ci_build/github/azure-pipelines/c-api-training-packaging-pipelines.yml +++ b/tools/ci_build/github/azure-pipelines/c-api-training-packaging-pipelines.yml @@ -32,13 +32,25 @@ parameters: displayName: Number added to pre-release package version. Only used if IsReleaseBuild is true. Denotes the sequence of a pre-release package. type: number default: 0 - + +# these 2 parameters are used for debugging. 
+- name: SpecificArtifact + displayName: Use Specific Artifact (Debugging only) + type: boolean + default: false + +- name: BuildId + displayName: Pipeline BuildId, you could find it in the URL + type: string + default: '0' + stages: - template: stages/set_packaging_variables_stage.yml parameters: IsReleaseBuild: ${{ parameters.IsReleaseBuild }} PreReleaseVersionSuffixString: ${{ parameters.PreReleaseVersionSuffixString }} PreReleaseVersionSuffixNumber: ${{ parameters.PreReleaseVersionSuffixNumber }} + - template: templates/ondevice-training-cpu-packaging-pipeline.yml parameters: RunOnnxRuntimeTests: ${{ parameters.RunOnnxRuntimeTests }} @@ -48,4 +60,15 @@ stages: OrtNugetPackageId: 'Microsoft.ML.OnnxRuntime.Training' AdditionalBuildFlags: '--enable_training_apis' AdditionalWinBuildFlags: '--enable_onnx_tests --enable_wcos' - BuildVariant: 'default' \ No newline at end of file + BuildVariant: 'default' + +- template: templates/publish-nuget-steps.yml + parameters: + download_artifacts_steps: + - template: templates/flex-downloadPipelineArtifact.yml + parameters: + StepName: 'Download Pipeline Artifact - Signed NuGet Training Package' + ArtifactName: 'drop-signed-nuget-Training-CPU' + targetPath: '$(Build.BinariesDirectory)/nuget-artifact/final-package' + SpecificArtifact: ${{ parameters.specificArtifact }} + BuildId: ${{ parameters.BuildId }} diff --git a/tools/ci_build/github/azure-pipelines/publish-nuget.yml b/tools/ci_build/github/azure-pipelines/publish-nuget.yml index 206a9464de6ef..b78d586288ba3 100644 --- a/tools/ci_build/github/azure-pipelines/publish-nuget.yml +++ b/tools/ci_build/github/azure-pipelines/publish-nuget.yml @@ -32,11 +32,6 @@ stages: artifact: 'drop-signed-nuget-dml' - script: move "$(Pipeline.Workspace)\build\drop-signed-nuget-dml\*" $(Build.BinariesDirectory)\nuget-artifact\final-package - - download: build - displayName: 'Download Pipeline Artifact - Signed NuGet Package' - artifact: 'drop-signed-nuget-Training-CPU' - - script: move "$(Pipeline.Workspace)\build\drop-signed-nuget-Training-CPU\*" $(Build.BinariesDirectory)\nuget-artifact\final-package - # Publish CUDA 11 Nuget/Java pkgs to ADO feed - template: stages/nuget-cuda-publishing-stage.yml parameters: @@ -44,4 +39,4 @@ stages: - template: stages/java-cuda-publishing-stage.yml parameters: - artifact_feed: $(ArtifactFeed) \ No newline at end of file + artifact_feed: $(ArtifactFeed) From bc3713206dc1d6c7e5062389ef7db42ac2051a30 Mon Sep 17 00:00:00 2001 From: Jian Chen Date: Mon, 29 Jul 2024 10:00:21 -0700 Subject: [PATCH 46/57] Update QNN pipeline pool (#21482) ### Description Update QNN pipeline pool ### Motivation and Context Let all our pipelines are using the latest NDK version --- ...droid-arm64-v8a-QNN-crosscompile-ci-pipeline.yml | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/tools/ci_build/github/azure-pipelines/android-arm64-v8a-QNN-crosscompile-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/android-arm64-v8a-QNN-crosscompile-ci-pipeline.yml index 6649206c0d79c..c80092fc82ed5 100644 --- a/tools/ci_build/github/azure-pipelines/android-arm64-v8a-QNN-crosscompile-ci-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/android-arm64-v8a-QNN-crosscompile-ci-pipeline.yml @@ -35,7 +35,7 @@ parameters: jobs: - job: Build_QNN_EP - pool: onnxruntime-qnn-ubuntu-2004-cpu + pool: onnxruntime-Ubuntu2204-AMD-CPU timeoutInMinutes: 30 workspace: clean: all @@ -46,6 +46,10 @@ jobs: inputs: versionSpec: $(pythonVersion) + - script: | + env | grep ANDROID + displayName: View 
Android ENVs + - script: sudo apt-get update -y && sudo apt-get install -y coreutils ninja-build displayName: Install coreutils and ninja @@ -56,13 +60,6 @@ jobs: parameters: QnnSDKVersion: ${{ parameters.QnnSdk }} - - script: | - export ANDROID_SDK_ROOT=/usr/local/lib/android/sdk - export ANDROID_HOME=/usr/local/lib/android/sdk - export ANDROID_NDK_HOME=/usr/local/lib/android/sdk/ndk-bundle - export ANDROID_NDK_ROOT=/usr/local/lib/android/sdk/ndk-bundle - displayName: set Android ENVs - - script: | set -e -x rm -rf /tmp/scripts From 79537d0523a7c215ef1685bf46efbd423242c4c1 Mon Sep 17 00:00:00 2001 From: Jian Chen Date: Mon, 29 Jul 2024 10:00:52 -0700 Subject: [PATCH 47/57] Remove tools/ci_build/github/android/run_nnapi_code_coverage.sh (#21371) ### Description Remove tools/ci_build/github/android/run_nnapi_code_coverage.sh ### Motivation and Context This file is no longer needed --- .../github/android/run_nnapi_code_coverage.sh | 36 ------------------- 1 file changed, 36 deletions(-) delete mode 100755 tools/ci_build/github/android/run_nnapi_code_coverage.sh diff --git a/tools/ci_build/github/android/run_nnapi_code_coverage.sh b/tools/ci_build/github/android/run_nnapi_code_coverage.sh deleted file mode 100755 index 472e824eaa47a..0000000000000 --- a/tools/ci_build/github/android/run_nnapi_code_coverage.sh +++ /dev/null @@ -1,36 +0,0 @@ -#!/bin/bash - -# This script will run ORT build for Android with code coverage option - -set -e -set -x - -if [ $# -ne 1 ]; then - echo "One command line argument, the ROOT root directory, is expected" -fi - -ORT_ROOT=$1 -# Build and run onnxruntime using NNAPI execution provider targeting android emulator -python3 ${ORT_ROOT}/tools/ci_build/build.py \ - --android \ - --build_dir build_nnapi \ - --android_sdk_path $ANDROID_HOME \ - --android_ndk_path $ANDROID_NDK_HOME \ - --android_abi=x86_64 \ - --android_api=29 \ - --skip_submodule_sync \ - --parallel \ - --use_nnapi \ - --cmake_generator=Ninja \ - --build_java \ - --path_to_protoc_exe $ORT_ROOT/protobuf_install/bin/protoc \ - --code_coverage - -# Install gcovr -python3 -m pip install gcovr - -# Retrieve runtime code coverage files from the emulator and analyze -python3 ${ORT_ROOT}/tools/ci_build/coverage.py \ - --build_dir build_nnapi \ - --android_sdk_path $ANDROID_HOME - From 0d7cf301a1e0ea784edcdf2242e973643f0bb9c9 Mon Sep 17 00:00:00 2001 From: Xu Xing Date: Tue, 30 Jul 2024 02:05:34 +0800 Subject: [PATCH 48/57] [js/webgpu] Add activation Tanh (#21540) Bug:https://github.com/microsoft/onnxruntime/issues/21467 ### Description ### Motivation and Context --- js/web/lib/wasm/jsep/webgpu/ops/fuse-utils.ts | 4 +++ js/web/test/data/ops/fused-conv.jsonc | 33 +++++++++++++++++++ 2 files changed, 37 insertions(+) diff --git a/js/web/lib/wasm/jsep/webgpu/ops/fuse-utils.ts b/js/web/lib/wasm/jsep/webgpu/ops/fuse-utils.ts index 6e66abacf3471..cfa0b42ef9eeb 100644 --- a/js/web/lib/wasm/jsep/webgpu/ops/fuse-utils.ts +++ b/js/web/lib/wasm/jsep/webgpu/ops/fuse-utils.ts @@ -30,6 +30,10 @@ export const getActivationSnippet = baseType}(uniforms.beta)));`; case 'LeakyRelu': return `value = select(${baseType}(uniforms.alpha) * value, value, value >= ${valueType}(0.0));`; + case 'Tanh': + return `let e2x = exp(-2.0 * abs(value)); + value = sign(value) * (1.0 - e2x) / (1.0 + e2x); + `; case '': return ''; // TODO: adding other activations that can be fused. 
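For context (not part of the diff): the Tanh snippet added above relies on the overflow-safe identity tanh(x) = (e^{2x} - 1) / (e^{2x} + 1) = sign(x) * (1 - e^{-2|x|}) / (1 + e^{-2|x|}), so `exp` is only ever evaluated on a non-positive argument and cannot overflow for large |x|; since sign(0) = 0, tanh(0) also comes out as 0.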
diff --git a/js/web/test/data/ops/fused-conv.jsonc b/js/web/test/data/ops/fused-conv.jsonc index 6a10e3b96a26a..d88c91ebc9de7 100644 --- a/js/web/test/data/ops/fused-conv.jsonc +++ b/js/web/test/data/ops/fused-conv.jsonc @@ -430,5 +430,38 @@ ] } ] + }, + { + "name": "fused conv with tanh", + "operator": "FusedConv", + "attributes": [ + { "name": "activation", "data": "Tanh", "type": "string" }, + { "name": "kernel_shape", "data": [2, 2], "type": "ints" } + ], + "opset": { "domain": "com.microsoft", "version": 1 }, + "cases": [ + { + "name": "T[0]", + "inputs": [ + { + "data": [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9], + "dims": [1, 1, 3, 3], + "type": "float32" + }, + { + "data": [0.11, 0.12, 0.13, 0.14], + "dims": [1, 1, 2, 2], + "type": "float32" + } + ], + "outputs": [ + { + "data": [0.15572261810302734, 0.20409323275089264, 0.29770541191101074, 0.3425688147544861], + "dims": [1, 1, 2, 2], + "type": "float32" + } + ] + } + ] } ] From b03c9496aa081fa6c07c5b266800694c830afd60 Mon Sep 17 00:00:00 2001 From: Yulong Wang <7679871+fs-eire@users.noreply.github.com> Date: Mon, 29 Jul 2024 13:39:38 -0700 Subject: [PATCH 49/57] [js/web] allow load WebAssembly binary from buffer (#21534) ### Description This PR adds a new option `ort.env.wasm.wasmBinary`, which allows user to set to a buffer containing preload .wasm file content. This PR should resolve the problem from latest discussion in #20876. --- cmake/onnxruntime_webassembly.cmake | 2 +- js/common/lib/env.ts | 6 +++++ js/web/lib/wasm/wasm-factory.ts | 8 ++++++- .../e2e/browser-test-wasm-binary-override.js | 22 +++++++++++++++++++ js/web/test/e2e/run-data.js | 3 +++ 5 files changed, 39 insertions(+), 2 deletions(-) create mode 100644 js/web/test/e2e/browser-test-wasm-binary-override.js diff --git a/cmake/onnxruntime_webassembly.cmake b/cmake/onnxruntime_webassembly.cmake index 7a49e90c00bce..0686b66876d9f 100644 --- a/cmake/onnxruntime_webassembly.cmake +++ b/cmake/onnxruntime_webassembly.cmake @@ -225,7 +225,7 @@ else() "SHELL:-s EXPORT_ALL=0" "SHELL:-s VERBOSE=0" "SHELL:-s FILESYSTEM=0" - "SHELL:-s INCOMING_MODULE_JS_API=[preRun,locateFile,arguments,onExit,wasmMemory,buffer,instantiateWasm,mainScriptUrlOrBlob]" + "SHELL:-s INCOMING_MODULE_JS_API=[locateFile,instantiateWasm,wasmBinary]" "SHELL:-s WASM_BIGINT=1" ${WASM_API_EXCEPTION_CATCHING} --no-entry diff --git a/js/common/lib/env.ts b/js/common/lib/env.ts index dbb5f8118363f..1a87569a115a6 100644 --- a/js/common/lib/env.ts +++ b/js/common/lib/env.ts @@ -74,6 +74,12 @@ export declare namespace Env { */ wasmPaths?: WasmPrefixOrFilePaths; + /** + * Set a custom buffer which contains the WebAssembly binary. If this property is set, the `wasmPaths` property will + * be ignored. + */ + wasmBinary?: ArrayBufferLike|Uint8Array; + /** * Set or get a boolean value indicating whether to proxy the execution of main thread to a worker thread. * diff --git a/js/web/lib/wasm/wasm-factory.ts b/js/web/lib/wasm/wasm-factory.ts index fb068ab42d04c..0f5f10716a00b 100644 --- a/js/web/lib/wasm/wasm-factory.ts +++ b/js/web/lib/wasm/wasm-factory.ts @@ -108,6 +108,7 @@ export const initializeWebAssembly = async(flags: Env.WebAssemblyFlags): Promise const mjsPathOverride = (mjsPathOverrideFlag as URL)?.href ?? mjsPathOverrideFlag; const wasmPathOverrideFlag = (wasmPaths as Env.WasmFilePaths)?.wasm; const wasmPathOverride = (wasmPathOverrideFlag as URL)?.href ?? 
wasmPathOverrideFlag; + const wasmBinaryOverride = flags.wasmBinary; const [objectUrl, ortWasmFactory] = (await importWasmModule(mjsPathOverride, wasmPrefixOverride, numThreads > 1)); @@ -135,7 +136,12 @@ export const initializeWebAssembly = async(flags: Env.WebAssemblyFlags): Promise numThreads, }; - if (wasmPathOverride || wasmPrefixOverride) { + if (wasmBinaryOverride) { + /** + * Set a custom buffer which contains the WebAssembly binary. This will skip the wasm file fetching. + */ + config.wasmBinary = wasmBinaryOverride; + } else if (wasmPathOverride || wasmPrefixOverride) { /** * A callback function to locate the WebAssembly file. The function should return the full path of the file. * diff --git a/js/web/test/e2e/browser-test-wasm-binary-override.js b/js/web/test/e2e/browser-test-wasm-binary-override.js new file mode 100644 index 0000000000000..35d427fa3b722 --- /dev/null +++ b/js/web/test/e2e/browser-test-wasm-binary-override.js @@ -0,0 +1,22 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +'use strict'; + +const documentUrl = document.currentScript.src; + +it('Browser E2E testing - WebAssembly backend', async function() { + // preload .wasm file binary + const wasmUrl = new URL('./node_modules/onnxruntime-web/dist/ort-wasm-simd-threaded.wasm', documentUrl).href; + const response = await fetch(wasmUrl); + + // make sure the .wasm file is loaded successfully + assert(response.ok); + assert(response.headers.get('Content-Type') === 'application/wasm'); + + // override wasm binary + const binary = await response.arrayBuffer(); + ort.env.wasm.wasmBinary = binary; + + await testFunction(ort, {executionProviders: ['wasm']}); +}); diff --git a/js/web/test/e2e/run-data.js b/js/web/test/e2e/run-data.js index 507192f29be9c..856f29eac6ddf 100644 --- a/js/web/test/e2e/run-data.js +++ b/js/web/test/e2e/run-data.js @@ -36,6 +36,9 @@ const BROWSER_TEST_CASES = [ [true, false, './browser-test-wasm.js', 'ort.bundle.min.mjs', ['num_threads=2', 'proxy=1']], // 2 threads, proxy [true, false, './browser-test-wasm.js', 'ort.bundle.min.mjs', ['num_threads=1', 'proxy=1']], // 1 thread, proxy + // wasm binary override: + [true, false, './browser-test-wasm-binary-override.js', 'ort.min.js'], + // path override: // wasm, path override filenames for both mjs and wasm, same origin [true, false, './browser-test-wasm-path-override-filename.js', 'ort.min.js', ['port=9876', 'files=mjs,wasm']], From c39f1c4fd80668fd7619719ebe7a374f4ae11a5e Mon Sep 17 00:00:00 2001 From: Preetha Veeramalai Date: Mon, 29 Jul 2024 14:12:36 -0700 Subject: [PATCH 50/57] ORT- OVEP 1.19 PR-follow up (#21546) ### Description Follow up PR for bug fixes on 1.19 ### Motivation and Context - Handles 1.19 docker file fixes. - Sets the default file naming of epctx onnx model with _ctx.onnx as suffix. - Create epctx model directories if it doesn't exist. 
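A hedged usage sketch of the EPContext dump path handling described above. It assumes the C++ API's AppendExecutionProvider_OpenVINO_V2 overload that takes a provider-options map and the ep.context_* session option keys; the path and device below are examples only.

```cpp
// Sketch: export an OpenVINO EPContext model. Per this change, the path must
// end in ".onnx" (its parent directory is created if missing); when no path is
// given, the dump falls back to "<graph_name>_ctx.onnx".
#include <onnxruntime_cxx_api.h>

Ort::SessionOptions MakeOpenVINOEpCtxOptions() {
  Ort::SessionOptions so;
  so.AddConfigEntry("ep.context_enable", "1");
  so.AddConfigEntry("ep.context_file_path", "./ov_cache/model_ctx.onnx");
  so.AppendExecutionProvider_OpenVINO_V2({{"device_type", "CPU"}});
  return so;
}
```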
--------- Co-authored-by: jatinwadhwa921 <110383850+jatinwadhwa921@users.noreply.github.com> --- dockerfiles/Dockerfile.openvino | 10 ++++------ .../providers/openvino/backend_manager.cc | 9 ++++++++- .../openvino/openvino_execution_provider.cc | 5 ----- .../openvino/openvino_provider_factory.cc | 20 ++++++++++++++++++- 4 files changed, 31 insertions(+), 13 deletions(-) diff --git a/dockerfiles/Dockerfile.openvino b/dockerfiles/Dockerfile.openvino index 75898770acf28..39e75a68a369f 100644 --- a/dockerfiles/Dockerfile.openvino +++ b/dockerfiles/Dockerfile.openvino @@ -3,11 +3,11 @@ # SPDX-License-Identifier: MIT #-------------------------------------------------------------------------- -ARG OPENVINO_VERSION=2024.0.0 +ARG OPENVINO_VERSION=2024.2.0 # Build stage -FROM openvino/ubuntu20_runtime:${OPENVINO_VERSION} AS builder +FROM openvino/ubuntu22_runtime:${OPENVINO_VERSION} AS builder ENV WORKDIR_PATH=/home/openvino WORKDIR $WORKDIR_PATH @@ -34,20 +34,18 @@ RUN cat /etc/apt/sources.list | sed 's/^# deb-src/deb-src/g' > ./temp; mv temp / RUN apt update; apt install dpkg-dev RUN mkdir /sources WORKDIR /sources -RUN apt-get source cron iso-codes lsb-release powermgmt-base python-apt-common python3-apt python3-dbus python3-gi unattended-upgrades libapt-pkg6.0 libhogweed5 libnettle7 +RUN apt-get source cron iso-codes lsb-release powermgmt-base python-apt-common python3-apt python3-dbus python3-gi libapt-pkg6.0 libhogweed6 libnettle8 WORKDIR / RUN tar cvf GPL_sources.tar.gz /sources # Deploy stage -FROM openvino/ubuntu20_runtime:${OPENVINO_VERSION} +FROM openvino/ubuntu22_runtime:${OPENVINO_VERSION} ENV DEBIAN_FRONTEND noninteractive USER root COPY --from=builder /home/openvino/onnxruntime/build/Linux/Release/dist/*.whl ./ COPY --from=builder /GPL_sources.tar.gz ./ RUN python3 -m pip install ./*.whl && rm ./*.whl -RUN apt update; apt install -y unattended-upgrades && \ - unattended-upgrade ARG BUILD_UID=1001 ARG BUILD_USER=onnxruntimedev RUN adduser --uid $BUILD_UID $BUILD_USER diff --git a/onnxruntime/core/providers/openvino/backend_manager.cc b/onnxruntime/core/providers/openvino/backend_manager.cc index 8f3658df0d09d..18a6257910a56 100644 --- a/onnxruntime/core/providers/openvino/backend_manager.cc +++ b/onnxruntime/core/providers/openvino/backend_manager.cc @@ -128,6 +128,13 @@ BackendManager::BackendManager(const GlobalContext& global_context, #endif } } + if (global_context_.export_ep_ctx_blob && !ep_ctx_handle_.IsValidOVEPCtxGraph()) { + auto status = onnxruntime::openvino_ep::BackendManager::ExportCompiledBlobAsEPCtxNode(subgraph, + logger); + if ((!status.IsOK())) { + ORT_THROW(status); + } + } } // Call EPContext model exporter here if the provider option for exporting @@ -158,7 +165,7 @@ Status BackendManager::ExportCompiledBlobAsEPCtxNode(const onnxruntime::GraphVie if (dot == std::string::npos) return graph_name; return graph_name.substr(0, dot); }(); - graph_name = graph_name + "-ov_" + GetGlobalContext().device_type + "_blob.onnx"; + graph_name = graph_name + "_ctx.onnx"; } // If embed_mode, then pass on the serialized blob // If not embed_mode, dump the blob here and only pass on the path to the blob diff --git a/onnxruntime/core/providers/openvino/openvino_execution_provider.cc b/onnxruntime/core/providers/openvino/openvino_execution_provider.cc index 5627cb2c122fb..29c45916795d3 100644 --- a/onnxruntime/core/providers/openvino/openvino_execution_provider.cc +++ b/onnxruntime/core/providers/openvino/openvino_execution_provider.cc @@ -147,11 +147,6 @@ common::Status 
OpenVINOExecutionProvider::Compile( *GetLogger(), ep_ctx_handle_); - if (global_context_->export_ep_ctx_blob && !ep_ctx_handle_.IsValidOVEPCtxGraph()) { - ORT_RETURN_IF_ERROR(backend_manager->ExportCompiledBlobAsEPCtxNode(graph_body_viewer, - *GetLogger())); - } - compute_info.create_state_func = [backend_manager](ComputeContext* context, FunctionState* state) { OpenVINOEPFunctionState* p = new OpenVINOEPFunctionState(); diff --git a/onnxruntime/core/providers/openvino/openvino_provider_factory.cc b/onnxruntime/core/providers/openvino/openvino_provider_factory.cc index 716a7cd936405..3738f2a534154 100644 --- a/onnxruntime/core/providers/openvino/openvino_provider_factory.cc +++ b/onnxruntime/core/providers/openvino/openvino_provider_factory.cc @@ -192,6 +192,10 @@ struct OpenVINO_Provider : Provider { } if (provider_options_map.find("num_of_threads") != provider_options_map.end()) { + if (!std::all_of(provider_options_map.at("num_of_threads").begin(), + provider_options_map.at("num_of_threads").end(), ::isdigit)) { + ORT_THROW("[ERROR] [OpenVINO-EP] Number of threads should be a number. \n"); + } num_of_threads = std::stoi(provider_options_map.at("num_of_threads")); if (num_of_threads <= 0) { num_of_threads = 1; @@ -298,7 +302,21 @@ struct OpenVINO_Provider : Provider { // The path to dump epctx model is valid only when epctx is enabled. // Overrides the cache_dir option to dump model cache files from OV. if (export_ep_ctx_blob) { - cache_dir = provider_options_map.at("so_epctx_path").c_str(); + auto ep_context_file_path_ = provider_options_map.at("so_epctx_path"); + auto file_path = std::filesystem::path(ep_context_file_path_); + // ep_context_file_path_ file extension must be .onnx + if (!ep_context_file_path_.empty() && + file_path.extension().generic_string() == ".onnx") { + // ep_context_file_path_ must be provided as a directory, create it if doesn't exist + auto parent_path = file_path.parent_path(); + if (!std::filesystem::is_directory(parent_path) && + !std::filesystem::create_directory(parent_path)) { + ORT_THROW("[ERROR] [OpenVINO] Failed to create directory : " + file_path.parent_path().generic_string() + " \n"); + } + cache_dir = ep_context_file_path_.c_str(); + } else { + ORT_THROW("[ERROR] [OpenVINO] Invalid ep_ctx_file_path" + ep_context_file_path_ + " \n"); + } } } From 7543dd040b2d32109a2718d7276d3aca1edadaae Mon Sep 17 00:00:00 2001 From: Adam Reeve Date: Tue, 30 Jul 2024 10:50:13 +1200 Subject: [PATCH 51/57] Propagate NaNs in the CPU min and max operators (#21492) ### Description Propagates NaN values in the min and max operators so that min or max with a NaN in either input always produces NaN. 
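As a minimal sketch of the intended semantics (illustrative only, assuming Eigen 3.4+; this is not the kernel code itself), Eigen's coefficient-wise min/max must opt in to NaN propagation explicitly:

```cpp
#include <Eigen/Dense>
#include <iostream>
#include <limits>

int main() {
  const float nan = std::numeric_limits<float>::quiet_NaN();
  Eigen::Array3f a(1.0f, nan, 3.0f);
  Eigen::Array3f b(2.0f, 2.0f, 2.0f);

  // Default coefficient-wise min: the result for NaN inputs is unspecified
  // and the NaN is typically dropped, which is the behavior being fixed.
  std::cout << a.min(b) << "\n";

  // NaN-propagating variant: any NaN input yields NaN in the output,
  // matching "min or max with a NaN in either input always produces NaN".
  std::cout << a.min<Eigen::PropagateNaN>(b) << "\n";
  return 0;
}
```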
### Motivation and Context Fixes #21455 --- .../providers/cpu/math/element_wise_ops.cc | 18 +- onnxruntime/test/providers/checkers.cc | 2 +- .../cpu/math/element_wise_ops_test.cc | 188 ++++++++++++++++-- 3 files changed, 187 insertions(+), 21 deletions(-) diff --git a/onnxruntime/core/providers/cpu/math/element_wise_ops.cc b/onnxruntime/core/providers/cpu/math/element_wise_ops.cc index 1d524a90302e7..5ea6000da1cba 100644 --- a/onnxruntime/core/providers/cpu/math/element_wise_ops.cc +++ b/onnxruntime/core/providers/cpu/math/element_wise_ops.cc @@ -705,7 +705,7 @@ Status Min_6::Compute(OpKernelContext* ctx) const { for (int index = 1; index < inputCount; index++) { auto& data_n = *ctx->Input(index); ORT_ENFORCE(data_n.Shape() == shape, "All inputs must have the same shape"); - min = min.array().min(EigenMap(data_n).array()); + min = min.array().template min(EigenMap(data_n).array()); } return Status::OK(); @@ -721,15 +721,16 @@ struct Min_8::ComputeImpl { ProcessBroadcastSpanFuncs funcs{ [](BroadcastHelper& per_iter_bh) { per_iter_bh.OutputEigen() = - per_iter_bh.EigenInput1().array().min(per_iter_bh.ScalarInput0()); + per_iter_bh.EigenInput1().array().template min(per_iter_bh.ScalarInput0()); }, [](BroadcastHelper& per_iter_bh) { per_iter_bh.OutputEigen() = - per_iter_bh.EigenInput0().array().min(per_iter_bh.ScalarInput1()); + per_iter_bh.EigenInput0().array().template min(per_iter_bh.ScalarInput1()); }, [](BroadcastHelper& per_iter_bh) { per_iter_bh.OutputEigen() = - per_iter_bh.EigenInput0().array().min(per_iter_bh.EigenInput1().array()); + per_iter_bh.EigenInput0().array().template min( + per_iter_bh.EigenInput1().array()); }}; int input_count = inst.Node().InputArgCount().front(); @@ -827,7 +828,7 @@ Status Max_6::Compute(OpKernelContext* ctx) const { for (int index = 1; index < inputCount; index++) { auto& data_n = *ctx->Input(index); ORT_ENFORCE(data_n.Shape() == shape, "All inputs must have the same shape"); - max = max.array().max(EigenMap(data_n).array()); + max = max.array().template max(EigenMap(data_n).array()); } return Status::OK(); @@ -843,15 +844,16 @@ struct Max_8::ComputeImpl { ProcessBroadcastSpanFuncs funcs{ [](BroadcastHelper& per_iter_bh) { per_iter_bh.OutputEigen() = - per_iter_bh.EigenInput1().array().max(per_iter_bh.ScalarInput0()); + per_iter_bh.EigenInput1().array().template max(per_iter_bh.ScalarInput0()); }, [](BroadcastHelper& per_iter_bh) { per_iter_bh.OutputEigen() = - per_iter_bh.EigenInput0().array().max(per_iter_bh.ScalarInput1()); + per_iter_bh.EigenInput0().array().template max(per_iter_bh.ScalarInput1()); }, [](BroadcastHelper& per_iter_bh) { per_iter_bh.OutputEigen() = - per_iter_bh.EigenInput0().array().max(per_iter_bh.EigenInput1().array()); + per_iter_bh.EigenInput0().array().template max( + per_iter_bh.EigenInput1().array()); }}; int input_count = inst.Node().InputArgCount().front(); diff --git a/onnxruntime/test/providers/checkers.cc b/onnxruntime/test/providers/checkers.cc index 5f332ddcddb8d..182fa4729a88f 100644 --- a/onnxruntime/test/providers/checkers.cc +++ b/onnxruntime/test/providers/checkers.cc @@ -427,7 +427,7 @@ struct TensorCheck { for (int64_t i = 0; i < size; ++i) { if (std::isnan(f_expected[i])) { - EXPECT_TRUE(std::isnan(f_expected[i])) << "Expected NaN. i:" << i; + EXPECT_TRUE(std::isnan(f_actual[i])) << "Expected NaN. i:" << i; } else if (std::isinf(f_expected[i])) { // Test infinity for equality EXPECT_EQ(f_expected[i], f_actual[i]) << "Expected infinity. 
i:" << i; } else { diff --git a/onnxruntime/test/providers/cpu/math/element_wise_ops_test.cc b/onnxruntime/test/providers/cpu/math/element_wise_ops_test.cc index eb3575f2cde88..bd3d21d4929f3 100644 --- a/onnxruntime/test/providers/cpu/math/element_wise_ops_test.cc +++ b/onnxruntime/test/providers/cpu/math/element_wise_ops_test.cc @@ -1553,6 +1553,47 @@ TEST(MathOpTest, Min_12_Float_Nan) { } } +TEST(MathOpTest, Min_12_Float_Nan_with_scalar) { + OpTester test("Min", 12); + test.AddInput("data_1", {3, 1}, + {std::numeric_limits::quiet_NaN(), -0.5f, 0.5f}); + test.AddInput("data_2", {1}, {0.25f}); + test.AddOutput("min", {3, 1}, + {std::numeric_limits::quiet_NaN(), -0.5f, 0.25f}); + if (nullptr != DefaultCpuExecutionProvider()) { + std::vector> execution_providers; + execution_providers.push_back(DefaultCpuExecutionProvider()); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", {}, nullptr, &execution_providers); + } + if (nullptr != DefaultCudaExecutionProvider()) { + std::vector> execution_providers; + execution_providers.push_back(DefaultCudaExecutionProvider()); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", {}, nullptr, &execution_providers); + } +} + +TEST(MathOpTest, Min_12_Float_with_scalar_Nan) { + OpTester test("Min", 12); + test.AddInput("data_1", {2, 2}, + {0.25f, -0.25f, -0.5f, 0.5f}); + test.AddInput("data_2", {1}, {std::numeric_limits::quiet_NaN()}); + test.AddOutput("min", {2, 2}, + {std::numeric_limits::quiet_NaN(), + std::numeric_limits::quiet_NaN(), + std::numeric_limits::quiet_NaN(), + std::numeric_limits::quiet_NaN()}); + if (nullptr != DefaultCpuExecutionProvider()) { + std::vector> execution_providers; + execution_providers.push_back(DefaultCpuExecutionProvider()); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", {}, nullptr, &execution_providers); + } + if (nullptr != DefaultCudaExecutionProvider()) { + std::vector> execution_providers; + execution_providers.push_back(DefaultCudaExecutionProvider()); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", {}, nullptr, &execution_providers); + } +} + TEST(MathOpTest, Min_12_Double) { OpTester test("Min", 12); test.AddInput("data_0", {1, 3}, @@ -1586,12 +1627,53 @@ TEST(MathOpTest, Min_12_Double_Nan) { std::numeric_limits::quiet_NaN(), -1.0, -1.0, -2.0, 0.5, 0.0, 1.0}); - if (nullptr != DefaultCpuExecutionProvider().get()) { + if (nullptr != DefaultCpuExecutionProvider()) { std::vector> execution_providers; execution_providers.push_back(DefaultCpuExecutionProvider()); test.Run(OpTester::ExpectResult::kExpectSuccess, "", {}, nullptr, &execution_providers); } - if (nullptr != DefaultCudaExecutionProvider().get()) { + if (nullptr != DefaultCudaExecutionProvider()) { + std::vector> execution_providers; + execution_providers.push_back(DefaultCudaExecutionProvider()); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", {}, nullptr, &execution_providers); + } +} + +TEST(MathOpTest, Min_12_Double_Nan_with_scalar) { + OpTester test("Min", 12); + test.AddInput("data_1", {3, 1}, + {std::numeric_limits::quiet_NaN(), -0.5, 0.5}); + test.AddInput("data_2", {1}, {0.25}); + test.AddOutput("min", {3, 1}, + {std::numeric_limits::quiet_NaN(), -0.5, 0.25}); + if (nullptr != DefaultCpuExecutionProvider()) { + std::vector> execution_providers; + execution_providers.push_back(DefaultCpuExecutionProvider()); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", {}, nullptr, &execution_providers); + } + if (nullptr != DefaultCudaExecutionProvider()) { + std::vector> execution_providers; + 
execution_providers.push_back(DefaultCudaExecutionProvider()); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", {}, nullptr, &execution_providers); + } +} + +TEST(MathOpTest, Min_12_Double_with_scalar_Nan) { + OpTester test("Min", 12); + test.AddInput("data_1", {2, 2}, + {0.25, -0.25, -0.5, 0.5}); + test.AddInput("data_2", {1}, {std::numeric_limits::quiet_NaN()}); + test.AddOutput("min", {2, 2}, + {std::numeric_limits::quiet_NaN(), + std::numeric_limits::quiet_NaN(), + std::numeric_limits::quiet_NaN(), + std::numeric_limits::quiet_NaN()}); + if (nullptr != DefaultCpuExecutionProvider()) { + std::vector> execution_providers; + execution_providers.push_back(DefaultCpuExecutionProvider()); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", {}, nullptr, &execution_providers); + } + if (nullptr != DefaultCudaExecutionProvider()) { std::vector> execution_providers; execution_providers.push_back(DefaultCudaExecutionProvider()); test.Run(OpTester::ExpectResult::kExpectSuccess, "", {}, nullptr, &execution_providers); @@ -1666,7 +1748,7 @@ TEST(MathOpTest, Min_12_UInt64) { test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider}); // TensorRT: Input batch size is inconsistent } -TEST(MathOpTest, Min_12_MLFLoat16) { +TEST(MathOpTest, Min_12_MLFloat16) { OpTester test("Min", 12); test.AddInput("data_0", {1, 3}, MakeMLFloat16({1.f, 1.f, 1.f})); @@ -1679,7 +1761,7 @@ TEST(MathOpTest, Min_12_MLFLoat16) { test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider}); // TensorRT: Input batch size is inconsistent } -TEST(MathOpTest, Min_12_MLFLoat16_Scalar0) { +TEST(MathOpTest, Min_12_MLFloat16_Scalar0) { OpTester test("Min", 12); test.AddInput("data_0", {}, MakeMLFloat16({-10.f})); @@ -1692,7 +1774,7 @@ TEST(MathOpTest, Min_12_MLFLoat16_Scalar0) { test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider}); // TensorRT: Input batch size is inconsistent } -TEST(MathOpTest, Min_12_MLFLoat16_Scalar1) { +TEST(MathOpTest, Min_12_MLFloat16_Scalar1) { OpTester test("Min", 12); test.AddInput("data_0", {1, 3}, MakeMLFloat16({2.f, 3.f, 4.f})); @@ -1809,12 +1891,53 @@ TEST(MathOpTest, Max_12_Float_Nan) { std::numeric_limits::quiet_NaN(), -0.5f, 0.0f, -1.0f, 1.0f, 1.0f, 2.0f}); - if (nullptr != DefaultCpuExecutionProvider().get()) { + if (nullptr != DefaultCpuExecutionProvider()) { std::vector> execution_providers; execution_providers.push_back(DefaultCpuExecutionProvider()); test.Run(OpTester::ExpectResult::kExpectSuccess, "", {}, nullptr, &execution_providers); } - if (nullptr != DefaultCudaExecutionProvider().get()) { + if (nullptr != DefaultCudaExecutionProvider()) { + std::vector> execution_providers; + execution_providers.push_back(DefaultCudaExecutionProvider()); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", {}, nullptr, &execution_providers); + } +} + +TEST(MathOpTest, Max_12_Float_Nan_with_scalar) { + OpTester test("Max", 12); + test.AddInput("data_1", {3, 1}, + {std::numeric_limits::quiet_NaN(), -0.5f, 0.5f}); + test.AddInput("data_2", {1}, {0.25f}); + test.AddOutput("max", {3, 1}, + {std::numeric_limits::quiet_NaN(), 0.25f, 0.5f}); + if (nullptr != DefaultCpuExecutionProvider()) { + std::vector> execution_providers; + execution_providers.push_back(DefaultCpuExecutionProvider()); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", {}, nullptr, &execution_providers); + } + if (nullptr != DefaultCudaExecutionProvider()) { + std::vector> execution_providers; + 
execution_providers.push_back(DefaultCudaExecutionProvider()); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", {}, nullptr, &execution_providers); + } +} + +TEST(MathOpTest, Max_12_Float_with_scalar_Nan) { + OpTester test("Max", 12); + test.AddInput("data_1", {2, 2}, + {0.25f, -0.25f, -0.5f, 0.5f}); + test.AddInput("data_2", {1}, {std::numeric_limits::quiet_NaN()}); + test.AddOutput("max", {2, 2}, + {std::numeric_limits::quiet_NaN(), + std::numeric_limits::quiet_NaN(), + std::numeric_limits::quiet_NaN(), + std::numeric_limits::quiet_NaN()}); + if (nullptr != DefaultCpuExecutionProvider()) { + std::vector> execution_providers; + execution_providers.push_back(DefaultCpuExecutionProvider()); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", {}, nullptr, &execution_providers); + } + if (nullptr != DefaultCudaExecutionProvider()) { std::vector> execution_providers; execution_providers.push_back(DefaultCudaExecutionProvider()); test.Run(OpTester::ExpectResult::kExpectSuccess, "", {}, nullptr, &execution_providers); @@ -1854,12 +1977,53 @@ TEST(MathOpTest, Max_12_Double_Nan) { std::numeric_limits::quiet_NaN(), -0.5, 0.0, -1.0, 1.0, 1.0, 2.0}); - if (nullptr != DefaultCpuExecutionProvider().get()) { + if (nullptr != DefaultCpuExecutionProvider()) { std::vector> execution_providers; execution_providers.push_back(DefaultCpuExecutionProvider()); test.Run(OpTester::ExpectResult::kExpectSuccess, "", {}, nullptr, &execution_providers); } - if (nullptr != DefaultCudaExecutionProvider().get()) { + if (nullptr != DefaultCudaExecutionProvider()) { + std::vector> execution_providers; + execution_providers.push_back(DefaultCudaExecutionProvider()); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", {}, nullptr, &execution_providers); + } +} + +TEST(MathOpTest, Max_12_Double_Nan_with_scalar) { + OpTester test("Max", 12); + test.AddInput("data_1", {3, 1}, + {std::numeric_limits::quiet_NaN(), -0.5, 0.5}); + test.AddInput("data_2", {1}, {0.25}); + test.AddOutput("max", {3, 1}, + {std::numeric_limits::quiet_NaN(), 0.25, 0.5}); + if (nullptr != DefaultCpuExecutionProvider()) { + std::vector> execution_providers; + execution_providers.push_back(DefaultCpuExecutionProvider()); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", {}, nullptr, &execution_providers); + } + if (nullptr != DefaultCudaExecutionProvider()) { + std::vector> execution_providers; + execution_providers.push_back(DefaultCudaExecutionProvider()); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", {}, nullptr, &execution_providers); + } +} + +TEST(MathOpTest, Max_12_Double_with_scalar_Nan) { + OpTester test("Max", 12); + test.AddInput("data_1", {2, 2}, + {0.25, -0.25, -0.5, 0.5}); + test.AddInput("data_2", {1}, {std::numeric_limits::quiet_NaN()}); + test.AddOutput("max", {2, 2}, + {std::numeric_limits::quiet_NaN(), + std::numeric_limits::quiet_NaN(), + std::numeric_limits::quiet_NaN(), + std::numeric_limits::quiet_NaN()}); + if (nullptr != DefaultCpuExecutionProvider()) { + std::vector> execution_providers; + execution_providers.push_back(DefaultCpuExecutionProvider()); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", {}, nullptr, &execution_providers); + } + if (nullptr != DefaultCudaExecutionProvider()) { std::vector> execution_providers; execution_providers.push_back(DefaultCudaExecutionProvider()); test.Run(OpTester::ExpectResult::kExpectSuccess, "", {}, nullptr, &execution_providers); @@ -1934,7 +2098,7 @@ TEST(MathOpTest, Max_12_UInt64) { test.Run(OpTester::ExpectResult::kExpectSuccess, "", 
{kTensorrtExecutionProvider}); // TensorRT: Input batch size is inconsistent } -TEST(MathOpTest, Max_12_MLFLoat16) { +TEST(MathOpTest, Max_12_MLFloat16) { OpTester test("Max", 12); test.AddInput("data_0", {1, 3}, MakeMLFloat16({-1.f, -1.f, -1.f})); @@ -1947,7 +2111,7 @@ TEST(MathOpTest, Max_12_MLFLoat16) { test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider}); // TensorRT: Input batch size is inconsistent } -TEST(MathOpTest, Max_12_MLFLoat16_Scalar0) { +TEST(MathOpTest, Max_12_MLFloat16_Scalar0) { OpTester test("Max", 12); test.AddInput("data_0", {}, MakeMLFloat16({-1.f})); @@ -1960,7 +2124,7 @@ TEST(MathOpTest, Max_12_MLFLoat16_Scalar0) { test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider}); // TensorRT: Input batch size is inconsistent } -TEST(MathOpTest, Max_12_MLFLoat16_Scalar1) { +TEST(MathOpTest, Max_12_MLFloat16_Scalar1) { OpTester test("Max", 12); test.AddInput("data_0", {1, 3}, MakeMLFloat16({-1.f, -2.f, -3.f})); From d98581495f996084af65ae1e6600378bed949460 Mon Sep 17 00:00:00 2001 From: Sophie Schoenmeyer <107952697+sophies927@users.noreply.github.com> Date: Mon, 29 Jul 2024 16:06:03 -0700 Subject: [PATCH 52/57] Update labeling bot (#21548) Current labeling bot over-applies many of the labels (e.g., ep:CUDA and platform:windows) and is missing some of the APIs + EPs Working on migrating this workflow to GitHub policies but would like to use this fix in the meantime to avoid causing any issues w/ ORT 1.19 ### Description ### Motivation and Context --- .github/labeler.yml | 31 ++++++++++++++---------- .github/title-only-labeler.yml | 4 +++ .github/workflows/title-only-labeler.yml | 20 +++++++++++++++ 3 files changed, 42 insertions(+), 13 deletions(-) create mode 100644 .github/title-only-labeler.yml create mode 100644 .github/workflows/title-only-labeler.yml diff --git a/.github/labeler.yml b/.github/labeler.yml index 526d8a643e713..c14e2a213bc60 100644 --- a/.github/labeler.yml +++ b/.github/labeler.yml @@ -1,20 +1,25 @@ -api:javascript: '/\bjavascript\b/i' +api:CSharp: '/(\bc\s*sharp\b|\bc#)/i' api:java: '/\bjava\b/i' +api:javascript: '/\bjavascript\b/i' ep:ACL: '/\bacl\b/i' ep:ArmNN: '/\barmnn\b/i' -ep:CUDA: '/\bcuda\b/i' -ep:DML: '/(\bdirectml\b|\bdml\b)/i' -ep:MIGraphX: '/\bmigraphx\b/i' -ep:oneDNN: '/\bonednn\b/i' +ep:CANN: '/\bcann\b/i' +ep:CoreML: '/\bcore\s*ml\b/i' +ep:DML: '/(\bdirect\s*ml\b|\bdml\b)/i' +ep:MIGraphX: '/\bmi\s*graph\s*x\b/i' +ep:oneDNN: '/\bone\s*dnn\b/i' ep:OpenVINO: '/\bopen\s*vino\b/i' -ep:RockchipNPU: '/\brockchip\b/i' +ep:QNN: '/\bqnn\b/i' +ep:RockchipNPU: '/\brockchip(?:npu)?\b/i' ep:ROCm: '/\brocm\b/i' -ep:TensorRT: '/(\btensor\s*rt\b|\btrt\b)/i' +ep:SNPE: '/\bsnpe\b/i' ep:tvm: '/\btvm\b/i' ep:VitisAI: '/\bvitis(?:ai)?\b/i' -platform:jetson: '/\bjetson\b/i' -platform:mobile: '/(\bobj(?:ective)?-?c\b|\bnnapi\b|\bcore-?ml\b|\bmobile\b|\bandroid\b|\bios\b|\bxamarin\b|\bmaui\b)/i' -platform:web: '/(\bwebgl\b|\bweb-?gpu\b|\bwasm\b|\bonnxruntime-node\b|\bonnxruntime-web\b)/i' -platform:windows: '/(\bwindows\b|\bwinrt\b|\bwinml\b)/i' -model:transformer: '/(\bbert\b|\bgpt-?2\b|\bhugging-?face\b|\blong-?former\b|\bt5\b)/i' -quantization: '/(is this a quantized model\?\n\nYes|\bquantization\b)/i' +ep:WebGPU: '/\bwebgpu\b/i' +ep:WebNN: '/\bwebnn\b/i' +ep:Xnnpack: '/\bxnn\s*pack\b/i' +.NET: '/(\bdot\s*net\b|\bnuget\b|\.net\b)/i' +platform:jetson: '/(\bjetson\b|\bjetpack\b)/i' +platform:mobile: '/(\bobj(?:ective)?-?c\b|\bnnapi\b|\bmobile\b|\bandroid\b|\bios\b|\bxamarin\b|\bmaui\b)/i' +platform:web: 
'/(\bwebgl\b|\bweb-?gpu\b|\bwasm\b|\bonnxruntime-node\b|\bonnxruntime-web\b|\bonnxruntime-react-native\b|\bnpm\b|\btransformers\.js\b)/i' +model:transformer: '/\btransformers(?!\.js)\b/i' diff --git a/.github/title-only-labeler.yml b/.github/title-only-labeler.yml new file mode 100644 index 0000000000000..4980f7251bcb4 --- /dev/null +++ b/.github/title-only-labeler.yml @@ -0,0 +1,4 @@ +ep:CUDA: '/\bcuda\b/i' +ep:TensorRT: '/(\btensor\s*rt\b|\btrt\b)/i' +platform:windows: '/(\bwindows\b|\bwinrt\b|\bwinml\b)/i' +quantization: '/(quant|\bqdq\b)/i' diff --git a/.github/workflows/title-only-labeler.yml b/.github/workflows/title-only-labeler.yml new file mode 100644 index 0000000000000..e0af2dd06b1b7 --- /dev/null +++ b/.github/workflows/title-only-labeler.yml @@ -0,0 +1,20 @@ +name: "Title Only Issue Labeler" +on: + issues: + types: [opened, edited] + +permissions: + issues: write + +jobs: + triage: + runs-on: ubuntu-latest + steps: + - uses: github/issue-labeler@v3.4 + with: + repo-token: "${{ secrets.GITHUB_TOKEN }}" + configuration-path: .github/title-only-labeler.yml + not-before: 2020-01-15T02:54:32Z + enable-versioned-regex: 0 + include-title: 1 + include-body: 0 From 8417c325ec160dc8ee62edaf6d1daf91ad979d56 Mon Sep 17 00:00:00 2001 From: mcollinswisc Date: Mon, 29 Jul 2024 16:06:51 -0700 Subject: [PATCH 53/57] Keep QDQ nodes w/ nonpositive scale around MaxPool (#21182) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ### Description This change adds a check for whether the scale in the QuantizeLinear (or DequantizeLinear) is a positive scalar, and a new selector to disallow removing the QDQ around MaxPool if it is not. ### Motivation and Context Currently, the DropQDQNodesRules optimization removes QuantizeLinear and DequantizeLinear nodes from DequantizeLinear ∘ MaxPool ∘ QuantizeLinear. However, if the x_scale/y_scale values are non-positive, the (de-)quantization changes the ordering of the elements in the input value, so this optimization is changing the results. 
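A tiny numeric sketch of that ordering problem (illustrative only, not code from this change): with a negative scale, larger real values map to smaller quantized values, so a max taken over quantized data picks the wrong element.

```cpp
#include <cmath>
#include <cstdio>

int main() {
  const float scale = -0.1f;  // non-positive x_scale/y_scale, as described above
  const int zero_point = 0;
  const float x[2] = {0.3f, 0.9f};  // real-valued max is 0.9
  const long q0 = std::lround(x[0] / scale) + zero_point;  // -3
  const long q1 = std::lround(x[1] / scale) + zero_point;  // -9
  const long q_max = q0 > q1 ? q0 : q1;                    // -3, i.e. the quantized 0.3
  std::printf("%g\n", (q_max - zero_point) * scale);       // prints 0.3, not 0.9
  return 0;
}
```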
https://github.com/microsoft/onnxruntime/issues/21176 --------- Co-authored-by: Edward Chen <18449977+edgchen1@users.noreply.github.com> --- .../optimizer/qdq_transformer/qdq_util.cc | 35 ++++++++++++++ .../core/optimizer/qdq_transformer/qdq_util.h | 4 ++ .../qdq_selector_action_transformer.cc | 27 +++++++++-- .../selectors_actions/qdq_selectors.cc | 7 +++ .../selectors_actions/qdq_selectors.h | 10 ++-- .../test/optimizer/qdq_transformer_test.cc | 46 +++++++++++++++++++ 6 files changed, 120 insertions(+), 9 deletions(-) diff --git a/onnxruntime/core/optimizer/qdq_transformer/qdq_util.cc b/onnxruntime/core/optimizer/qdq_transformer/qdq_util.cc index a4d1ea3c7cf56..7ef4ced1835f0 100644 --- a/onnxruntime/core/optimizer/qdq_transformer/qdq_util.cc +++ b/onnxruntime/core/optimizer/qdq_transformer/qdq_util.cc @@ -166,6 +166,41 @@ bool QOrDQNodeHasConstantScalarScaleAndZeroPoint( return true; } +bool IsQOrDQScalePositiveConstantScalar( + const Node& q_or_dq_node, const GetConstantInitializerFn& get_const_initializer, + const std::filesystem::path& model_path) { + auto q_or_dq_input_defs = q_or_dq_node.InputDefs(); + + ORT_ENFORCE(q_or_dq_input_defs.size() >= 2); + + if (!optimizer_utils::IsScalar(*q_or_dq_input_defs[InputIndex::SCALE_ID])) { + return false; + } + + const ONNX_NAMESPACE::TensorProto* q_or_dq_scale_tensor_proto = + get_const_initializer(q_or_dq_input_defs[InputIndex::SCALE_ID]->Name()); + if (nullptr == q_or_dq_scale_tensor_proto) { + return false; + } + + Initializer q_or_dq_scale(*q_or_dq_scale_tensor_proto, model_path); + + switch (q_or_dq_scale.data_type()) { + case ONNX_NAMESPACE::TensorProto::FLOAT: + return q_or_dq_scale.data()[0] > 0; + + case ONNX_NAMESPACE::TensorProto::FLOAT16: + return q_or_dq_scale.data()[0] > 0; + + case ONNX_NAMESPACE::TensorProto::BFLOAT16: + return q_or_dq_scale.data()[0] > 0; + + default: + assert(false); + return false; + } +} + #if !defined(ORT_MINIMAL_BUILD) || defined(ORT_EXTENDED_MINIMAL_BUILD) bool MatchQNode(const Node& node) { diff --git a/onnxruntime/core/optimizer/qdq_transformer/qdq_util.h b/onnxruntime/core/optimizer/qdq_transformer/qdq_util.h index 5d11b8bfd5558..008f9972a143b 100644 --- a/onnxruntime/core/optimizer/qdq_transformer/qdq_util.h +++ b/onnxruntime/core/optimizer/qdq_transformer/qdq_util.h @@ -65,6 +65,10 @@ bool QOrDQNodeHasConstantScalarScaleAndZeroPoint( const GetConstantInitializerFn& get_const_initializer, bool& zero_point_exists); +// Checks that the y_scale/x_scale input to the QuantizeLinear/DequantizeLinear node is a positive scalar. +bool IsQOrDQScalePositiveConstantScalar(const Node& q_or_dq_node, const GetConstantInitializerFn& get_const_initializer, + const std::filesystem::path& model_path); + #if !defined(ORT_MINIMAL_BUILD) || defined(ORT_EXTENDED_MINIMAL_BUILD) // Check Q node op type, version, and domain. bool MatchQNode(const Node& node); diff --git a/onnxruntime/core/optimizer/qdq_transformer/selectors_actions/qdq_selector_action_transformer.cc b/onnxruntime/core/optimizer/qdq_transformer/selectors_actions/qdq_selector_action_transformer.cc index 17e66a3953b97..d81701fdf443b 100644 --- a/onnxruntime/core/optimizer/qdq_transformer/selectors_actions/qdq_selector_action_transformer.cc +++ b/onnxruntime/core/optimizer/qdq_transformer/selectors_actions/qdq_selector_action_transformer.cc @@ -35,6 +35,7 @@ void DropQDQNodesRules(SelectorActionRegistry& qdq_selector_action_registry) { // 3 nodes. DQ, target, Q. Merge into target and remove DQ and Q. 
const std::string drop_action_name{"drop"}; const std::string drop_action_no_int16_name{"drop_no_int16_support"}; + const std::string drop_action_no_int16_and_positive_scale_name{"drop_no_int16_support_and_positive_scale"}; NTO::NodeLocation dq{NTO::NodeType::kInput, 0}; NTO::NodeLocation q{NTO::NodeType::kOutput, 0}; @@ -46,19 +47,32 @@ void DropQDQNodesRules(SelectorActionRegistry& qdq_selector_action_registry) { std::unique_ptr drop_action_no_int16 = std::make_unique( std::vector(moves)); // Copy before std::move(moves) + std::unique_ptr drop_action_no_int16_and_positive_scale = std::make_unique( + std::vector(moves)); // Copy before std::move(moves) std::unique_ptr drop_action = std::make_unique(std::move(moves)); #if !defined(ORT_MINIMAL_BUILD) - // Use a separate selector + action that disallows 16-bit types for MaxPool and Resize. + // Use separate selectors & actions for MaxPool and Resize. + // + // They disallow 16-bit types for MaxPool and Resize: // int16 MaxPool is not supported by the ONNX specification. // int16 Resize is not supported by the ORT implementation (although allowed by ONNX). - std::unique_ptr selector_disallow_16bit = std::make_unique(false); + // + // And cannot eliminate the QDQ for MaxPool if the scale is not positive, as a negative + // scale will change the ordering of the elements between quantized & de-quantized values. + std::unique_ptr selector_no_16bit = std::make_unique(false); qdq_selector_action_registry.RegisterSelectorAndAction(drop_action_no_int16_name, - {{"MaxPool", {12}}, - {"Resize", {}}}, - std::move(selector_disallow_16bit), + {{"Resize", {}}}, + std::move(selector_no_16bit), std::move(drop_action_no_int16)); + std::unique_ptr selector_no_16bit_and_positive_scale = + std::make_unique(false, true, false); + qdq_selector_action_registry.RegisterSelectorAndAction(drop_action_no_int16_and_positive_scale_name, + {{"MaxPool", {12}}}, + std::move(selector_no_16bit_and_positive_scale), + std::move(drop_action_no_int16_and_positive_scale)); + std::unique_ptr selector = std::make_unique(true); qdq_selector_action_registry.RegisterSelectorAndAction(drop_action_name, {{"Gather", {}}, @@ -70,6 +84,9 @@ void DropQDQNodesRules(SelectorActionRegistry& qdq_selector_action_registry) { std::move(drop_action)); #else qdq_selector_action_registry.RegisterAction(drop_action_no_int16_name, std::move(drop_action_no_int16)); + qdq_selector_action_registry.RegisterAction( + drop_action_no_int16_and_positive_scale_name, + std::move(drop_action_no_int16_and_positive_scale)); qdq_selector_action_registry.RegisterAction(drop_action_name, std::move(drop_action)); #endif } diff --git a/onnxruntime/core/optimizer/qdq_transformer/selectors_actions/qdq_selectors.cc b/onnxruntime/core/optimizer/qdq_transformer/selectors_actions/qdq_selectors.cc index e271ae8df3356..203aba2c3dd91 100644 --- a/onnxruntime/core/optimizer/qdq_transformer/selectors_actions/qdq_selectors.cc +++ b/onnxruntime/core/optimizer/qdq_transformer/selectors_actions/qdq_selectors.cc @@ -150,6 +150,13 @@ bool DropQDQNodeGroupSelector::Check(const GraphViewer& graph_viewer, return graph_viewer.GetConstantInitializer(initializer_name, true); }; + if (!allow_nonpositive_scale_) { + // IsQDQPairSupported will check that the scale is the same between q_node and dq_node. 
+ if (!IsQOrDQScalePositiveConstantScalar(q_node, get_const_initializer, graph_viewer.ModelPath())) { + return false; + } + } + return IsQDQPairSupported(q_node, dq_node, get_const_initializer, graph_viewer.ModelPath()); } diff --git a/onnxruntime/core/optimizer/qdq_transformer/selectors_actions/qdq_selectors.h b/onnxruntime/core/optimizer/qdq_transformer/selectors_actions/qdq_selectors.h index 491a15b62cb03..7e009da39403b 100644 --- a/onnxruntime/core/optimizer/qdq_transformer/selectors_actions/qdq_selectors.h +++ b/onnxruntime/core/optimizer/qdq_transformer/selectors_actions/qdq_selectors.h @@ -48,8 +48,9 @@ class NodeGroupSelector { // Zero point and scale are constant scalars and must match class DropQDQNodeGroupSelector : public NodeGroupSelector { public: - explicit DropQDQNodeGroupSelector(bool allow_16bit = true, bool allow_4bit = true) - : allow_16bit_(allow_16bit), allow_4bit_(allow_4bit) {} + explicit DropQDQNodeGroupSelector(bool allow_16bit = true, bool allow_4bit = true, + bool allow_nonpositive_scale = true) + : allow_16bit_(allow_16bit), allow_4bit_(allow_4bit), allow_nonpositive_scale_(allow_nonpositive_scale) {} private: bool Check(const GraphViewer& graph_viewer, const Node& node, @@ -58,6 +59,7 @@ class DropQDQNodeGroupSelector : public NodeGroupSelector { bool allow_16bit_; bool allow_4bit_; + bool allow_nonpositive_scale_; }; // Single DQ -> node. @@ -300,8 +302,8 @@ class BaseSelector : public NodeSelector { class DropQDQNodesSelector : public BaseSelector { public: - explicit DropQDQNodesSelector(bool allow_16bit = false, bool allow_4bit = false) - : BaseSelector(std::make_unique(allow_16bit, allow_4bit)) {} + explicit DropQDQNodesSelector(bool allow_16bit = false, bool allow_4bit = false, bool allow_nonpositive_scale = true) + : BaseSelector(std::make_unique(allow_16bit, allow_4bit, allow_nonpositive_scale)) {} }; class DropDQNodesSelector : public BaseSelector { diff --git a/onnxruntime/test/optimizer/qdq_transformer_test.cc b/onnxruntime/test/optimizer/qdq_transformer_test.cc index 367b4a65e3b7b..a043d6553bdfd 100644 --- a/onnxruntime/test/optimizer/qdq_transformer_test.cc +++ b/onnxruntime/test/optimizer/qdq_transformer_test.cc @@ -980,6 +980,52 @@ TEST(QDQTransformerTests, ReshapeDropQDQ) { RunReshapeDropQDQTestCase({1, 3, 2, 2}, {1, 12}, false, 21); // Use int16 ONNX QDQ ops } +// Runs a test case that checks if Q/DQ nodes are *not* dropped from DQ -> MaxPool -> Q if the quantization scale is +// negative. 
+template +static void RunMaxPoolNegativeScaleDropQDQTestCase() { + auto build_test_case = [](ModelTestBuilder& builder) { + constexpr QuantType qmin = std::numeric_limits::min(); + constexpr QuantType qmax = std::numeric_limits::max(); + + const std::vector input_shape = {1, 17, 17, 3}; + auto* input_arg = builder.MakeInput(input_shape, qmin, qmax); + auto* output_arg = builder.MakeOutput(); + + constexpr float scale = -0.003f; + QuantType zero_point = 1 + (qmax + qmin) / 2; + + auto* input_arg_dq = builder.MakeIntermediate(); + auto* maxpool_output = builder.MakeIntermediate(); + + builder.AddDequantizeLinearNode(input_arg, scale, zero_point, input_arg_dq); + + Node& maxpool_node = builder.AddNode("MaxPool", {input_arg_dq}, {maxpool_output}); + maxpool_node.AddAttribute("auto_pad", "VALID"); + maxpool_node.AddAttribute("kernel_shape", std::vector({2, 2})); + + builder.AddQuantizeLinearNode(maxpool_output, scale, zero_point, output_arg); + }; + + auto check_graph = [](InferenceSessionWrapper& session) { + auto op_to_count = CountOpsInGraph(session.GetGraph()); + EXPECT_EQ(op_to_count["MaxPool"], 1); + EXPECT_EQ(op_to_count["QuantizeLinear"], 1); + EXPECT_EQ(op_to_count["DequantizeLinear"], 1); + }; + + constexpr int opset = 21; + TransformerTester(build_test_case, check_graph, TransformerLevel::Level1, TransformerLevel::Level2, opset); +} + +// Checks that Q/DQ nodes are *not* dropped from DQ -> MaxPool -> Q for negative scale. Uses 8-bit and 16-bit Q/DQ ops. +TEST(QDQTransformerTests, MaxpoolDontDropQDQForNegativeScale) { + RunMaxPoolNegativeScaleDropQDQTestCase(); + RunMaxPoolNegativeScaleDropQDQTestCase(); + RunMaxPoolNegativeScaleDropQDQTestCase(); + RunMaxPoolNegativeScaleDropQDQTestCase(); +} + // Runs a test case that checks if Q/DQ nodes are dropped from DQ -> (Un)Squeeze -> Q. 
template static void RunSqueezeUnsqueezeDropQDQTestCase(const std::string& squeeze_type, From 5d78b9a17bb6d126f8ae7fa7eef05cabe4a08dae Mon Sep 17 00:00:00 2001 From: Yifan Li <109183385+yf711@users.noreply.github.com> Date: Mon, 29 Jul 2024 17:27:38 -0700 Subject: [PATCH 54/57] [TensorRT EP] Update TRT OSS Parser to 10.2 (#21552) ### Description Update TRT OSS Parser to [latest 10.2-GA branch](https://github.com/onnx/onnx-tensorrt/commit/f161f95883b4ebd8cb789de5efc67b73c0a6e694) ### Motivation and Context --- cgmanifests/generated/cgmanifest.json | 2 +- cmake/deps.txt | 4 ++-- .../github/azure-pipelines/templates/download-deps.yml | 4 ++-- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/cgmanifests/generated/cgmanifest.json b/cgmanifests/generated/cgmanifest.json index 66b305a6d36de..7de3f346f6386 100644 --- a/cgmanifests/generated/cgmanifest.json +++ b/cgmanifests/generated/cgmanifest.json @@ -216,7 +216,7 @@ "component": { "type": "git", "git": { - "commitHash": "06adf4461ac84035bee658c6cf5df39f7ab6071d", + "commitHash": "f161f95883b4ebd8cb789de5efc67b73c0a6e694", "repositoryUrl": "https://github.com/onnx/onnx-tensorrt.git" }, "comments": "onnx_tensorrt" diff --git a/cmake/deps.txt b/cmake/deps.txt index 9d206b6bb3aeb..d0edf963451d5 100644 --- a/cmake/deps.txt +++ b/cmake/deps.txt @@ -37,8 +37,8 @@ mimalloc;https://github.com/microsoft/mimalloc/archive/refs/tags/v2.1.1.zip;d5ee mp11;https://github.com/boostorg/mp11/archive/refs/tags/boost-1.82.0.zip;9bc9e01dffb64d9e0773b2e44d2f22c51aace063 neural_speed;https://github.com/intel/neural-speed/archive/refs/tags/v0.3.zip;5ec64e3071edc7347ebd8a81679cf06e2bb9b851 onnx;https://github.com/onnx/onnx/archive/refs/tags/v1.16.1.zip;2eb9198bb352757d5ff13977cbe0634898e0837c -#use the latest commit of 10.0-GA -onnx_tensorrt;https://github.com/onnx/onnx-tensorrt/archive/06adf4461ac84035bee658c6cf5df39f7ab6071d.zip;46dceef659d75d276e7914a8057c2282269d5e7b +#use the latest commit of 10.2-GA +onnx_tensorrt;https://github.com/onnx/onnx-tensorrt/archive/f161f95883b4ebd8cb789de5efc67b73c0a6e694.zip;2148d0c79a171abf2b9451f3bfec164e85caf2ef protobuf;https://github.com/protocolbuffers/protobuf/archive/refs/tags/v21.12.zip;7cf2733949036c7d52fda017badcab093fe73bfa protoc_win64;https://github.com/protocolbuffers/protobuf/releases/download/v21.12/protoc-21.12-win64.zip;b4521f7ada5b260380f94c4bd7f1b7684c76969a protoc_win32;https://github.com/protocolbuffers/protobuf/releases/download/v21.12/protoc-21.12-win32.zip;3688010318192c46ce73213cdfb6b3e5656da874 diff --git a/tools/ci_build/github/azure-pipelines/templates/download-deps.yml b/tools/ci_build/github/azure-pipelines/templates/download-deps.yml index bf11730c2ce28..01965343c4592 100644 --- a/tools/ci_build/github/azure-pipelines/templates/download-deps.yml +++ b/tools/ci_build/github/azure-pipelines/templates/download-deps.yml @@ -11,7 +11,7 @@ steps: packageType: upack feed: '/7424c8e4-5c62-490e-95c4-79446f31017c' definition: '517c4f6f-5437-4392-a70d-4f15ec5be2f0' - version: 1.0.167 + version: 1.0.173 downloadPath: $(Build.BinariesDirectory)/deps # The private ADO project @@ -22,7 +22,7 @@ steps: packageType: upack feed: '/4c7631f5-24c0-4307-8822-1aa8f180c325' definition: 'fd9dd5ad-b73e-4678-890e-edcf680dbc1a' - version: 1.0.167 + version: 1.0.173 downloadPath: $(Build.BinariesDirectory)/deps # You can add more ADO accounts at here. 
From 07d3be5b0e037927c3defd8a7e389e59ec748ad8 Mon Sep 17 00:00:00 2001 From: vraspar Date: Mon, 29 Jul 2024 21:04:47 -0700 Subject: [PATCH 55/57] CoreML: Add ML Program Split Op (#21456) ### Description Add support for Split Op ### Motivation and Context Address operator gaps in high priority model. --------- Co-authored-by: Scott McKay Co-authored-by: Edward Chen <18449977+edgchen1@users.noreply.github.com> --- .../coreml/builders/impl/split_op_builder.cc | 138 ++++++++++++------ .../apple/coreml_supported_mlprogram_ops.md | 1 + 2 files changed, 94 insertions(+), 45 deletions(-) diff --git a/onnxruntime/core/providers/coreml/builders/impl/split_op_builder.cc b/onnxruntime/core/providers/coreml/builders/impl/split_op_builder.cc index 0497357c45c54..dbd0f48576f8b 100644 --- a/onnxruntime/core/providers/coreml/builders/impl/split_op_builder.cc +++ b/onnxruntime/core/providers/coreml/builders/impl/split_op_builder.cc @@ -5,6 +5,7 @@ #include "core/providers/common.h" #include "core/providers/coreml/builders/helper.h" #include "core/providers/coreml/builders/impl/base_op_builder.h" +#include "core/providers/coreml/builders/impl/builder_utils.h" #include "core/providers/coreml/builders/model_builder.h" #include "core/providers/coreml/builders/op_builder_factory.h" #include "core/providers/coreml/shape_utils.h" @@ -24,6 +25,8 @@ class SplitOpBuilder : public BaseOpBuilder { // Split opset 13- uses "split" as attribute. Currently it's not supported. int GetMinSupportedOpSet(const Node& /* node */) const override { return 13; } + + bool SupportsMLProgram() const override { return true; } }; void SplitOpBuilder::AddInitializersToSkip(ModelBuilder& model_builder, const Node& node) const { @@ -43,55 +46,98 @@ Status SplitOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, ORT_RETURN_IF_NOT(GetShape(*node.InputDefs()[0], data_shape, logger), "Failed to get input shape."); NodeAttrHelper helper(node); - const auto axis = helper.Get("axis", 0); + int64_t axis = helper.Get("axis", 0); - // attribute introduced since opset 18 - uint64_t num_outputs; - - std::unique_ptr layer = model_builder.CreateNNLayer(node); - auto* coreml_splitnd = layer->mutable_splitnd(); - coreml_splitnd->set_axis(axis); - - if (input_defs.size() > 1) { - // if "split" is explicitly provided as an input - const auto& split_tensor = *model_builder.GetInitializerTensors().at(input_defs[1]->Name()); - Initializer unpacked_tensor(split_tensor); - auto split_span = unpacked_tensor.DataAsSpan(); - auto split_sizes = split_span.size(); - num_outputs = narrow(split_sizes); - for (size_t i = 0; i < split_sizes; i++) { - coreml_splitnd->add_splitsizes(split_span[i]); - } - } else if (node.SinceVersion() < 18) { - num_outputs = narrow(node.OutputDefs().size()); - coreml_splitnd->set_numsplits(num_outputs); - } else { - // note: for opset 18+ 'num_outputs' is a required attribute - num_outputs = narrow(helper.GetInt64("num_outputs").value()); + auto calculate_remainder_and_chunk_size = [&](int32_t num_outputs) { // note: checked in IsOpSupportedImpl that ensures the dim value at splitting axis exists auto split_dim_size = data_shape[HandleNegativeAxis(axis, data_shape.size())]; - uint64_t chunk_size = narrow((split_dim_size + num_outputs - 1) / num_outputs); + uint64_t chunk_size = (split_dim_size + num_outputs - 1) / num_outputs; uint64_t remainder = split_dim_size % chunk_size; - if (remainder) { - // uneven - auto split_sizes = InlinedVector(num_outputs, chunk_size); - split_sizes.back() = remainder; - for (size_t i = 0; i < 
split_sizes.size(); i++) { - coreml_splitnd->add_splitsizes(split_sizes[i]); - } + return std::make_tuple(remainder, chunk_size); + }; + +#if defined(COREML_ENABLE_MLPROGRAM) + if (model_builder.CreateMLProgram()) { + using namespace CoreML::Specification::MILSpec; + std::unique_ptr split_op = model_builder.CreateOperation(node, "split"); + AddOperationInput(*split_op, "axis", model_builder.AddScalarConstant(split_op->type(), "axis", axis)); + + if (input_defs.size() > 1) { + // if "split" is explicitly provided as an input + Initializer unpacked_tensor(*model_builder.GetConstantInitializer(input_defs[1]->Name())); + auto split_span = unpacked_tensor.DataAsSpan(); + AddOperationInput(*split_op, "split_sizes", + model_builder.AddConstant(split_op->type(), "split_sizes", split_span)); + } else if (node.SinceVersion() < 18) { + int64_t num_outputs = narrow(node.OutputDefs().size()); + AddOperationInput(*split_op, "num_splits", + model_builder.AddScalarConstant(split_op->type(), "num_splits", num_outputs)); } else { - // even + // note: for opset 18+ 'num_outputs' is a required attribute + int64_t num_outputs = helper.GetInt64("num_outputs").value(); + auto [remainder, chunk_size] = calculate_remainder_and_chunk_size(static_cast(num_outputs)); + if (remainder) { + // uneven + std::vector split_sizes(num_outputs, chunk_size); + split_sizes.back() = remainder; + AddOperationInput(*split_op, "split_sizes", + model_builder.AddConstant(split_op->type(), "split_sizes", split_sizes)); + } else { + // even + AddOperationInput(*split_op, "num_splits", + model_builder.AddScalarConstant(split_op->type(), "num_splits", num_outputs)); + } + } + + AddOperationInput(*split_op, "x", input_defs[0]->Name()); + for (const auto& output_def : node.OutputDefs()) { + AddOperationOutput(*split_op, *output_def); + } + model_builder.AddOperation(std::move(split_op)); + + } else +#endif + { + std::unique_ptr layer = model_builder.CreateNNLayer(node); + auto* coreml_splitnd = layer->mutable_splitnd(); + coreml_splitnd->set_axis(axis); + + if (input_defs.size() > 1) { + // if "split" is explicitly provided as an input + // const auto& split_tensor = *model_builder.GetInitializerTensors().at(input_defs[1]->Name()); + Initializer unpacked_tensor(*model_builder.GetConstantInitializer(input_defs[1]->Name())); + auto split_span = unpacked_tensor.DataAsSpan(); + for (const auto& split_size : split_span) { + coreml_splitnd->add_splitsizes(split_size); + } + } else if (node.SinceVersion() < 18) { + uint64_t num_outputs = narrow(node.OutputDefs().size()); coreml_splitnd->set_numsplits(num_outputs); + } else { + // note: for opset 18+ 'num_outputs' is a required attribute + uint64_t num_outputs = narrow(helper.GetInt64("num_outputs").value()); + auto [remainder, chunk_size] = calculate_remainder_and_chunk_size(static_cast(num_outputs)); + if (remainder) { + // uneven + auto split_sizes = InlinedVector(num_outputs, chunk_size); + split_sizes.back() = remainder; + for (size_t i = 0; i < split_sizes.size(); i++) { + coreml_splitnd->add_splitsizes(split_sizes[i]); + } + } else { + // even + coreml_splitnd->set_numsplits(num_outputs); + } } - } - *layer->mutable_input()->Add() = node.InputDefs()[0]->Name(); - // variadic number of outputs. Calculated based on the length of the given splitSizes if provided. - // Otherwise, uses attribute value 'num_outputs'. 
- for (uint64_t i = 0; i < num_outputs; i++) { - *layer->mutable_output()->Add() = node.OutputDefs()[i]->Name(); + *layer->mutable_input()->Add() = node.InputDefs()[0]->Name(); + // variadic number of outputs. Calculated based on the length of the given splitSizes if provided. + // Otherwise, uses attribute value 'num_outputs'. + for (const auto& output_def : node.OutputDefs()) { + *layer->mutable_output()->Add() = output_def->Name(); + } + model_builder.AddLayer(std::move(layer)); } - model_builder.AddLayer(std::move(layer)); return Status::OK(); } @@ -99,7 +145,6 @@ Status SplitOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, bool SplitOpBuilder::IsOpSupportedImpl(const Node& node, const OpBuilderInputParams& input_params, const logging::Logger& logger) const { const auto& input_defs = node.InputDefs(); - const auto& initializers = input_params.graph_viewer.GetAllInitializedTensors(); NodeAttrHelper helper(node); const auto axis = helper.Get("axis", 0); @@ -110,16 +155,19 @@ bool SplitOpBuilder::IsOpSupportedImpl(const Node& node, const OpBuilderInputPar const auto split_dims_at_axis = input_shape[HandleNegativeAxis(axis, input_shape.size())]; if (input_defs.size() > 1 && input_defs[1]->Exists()) { - if (!CheckIsConstantInitializer(*input_defs[1], input_params.graph_viewer, logger, "'split'")) { + const auto* splits_tensor = input_params.graph_viewer.GetConstantInitializer(input_defs[1]->Name()); + if (!splits_tensor) { + LOGS(logger, VERBOSE) << "CoreML 'splits' input must be a constant initializer."; return false; } + const auto split_shape = *input_defs[1]->Shape(); if (split_shape.dim_size() < 2) { - LOGS(logger, VERBOSE) << "CoreML SplitND requires to produce at least 2 outputs."; + LOGS(logger, VERBOSE) << "CoreML Split must produce at least 2 outputs."; return false; } - const auto& splits_tensor = *initializers.at(input_defs[1]->Name()); - Initializer unpacked_tensor(splits_tensor); + + Initializer unpacked_tensor(*splits_tensor); auto splits_span = unpacked_tensor.DataAsSpan(); int64_t sum_of_splits = std::accumulate(splits_span.begin(), splits_span.end(), int64_t{0}); if (sum_of_splits != split_dims_at_axis) { diff --git a/tools/ci_build/github/apple/coreml_supported_mlprogram_ops.md b/tools/ci_build/github/apple/coreml_supported_mlprogram_ops.md index d2a961f17bd6a..b546c266c131b 100644 --- a/tools/ci_build/github/apple/coreml_supported_mlprogram_ops.md +++ b/tools/ci_build/github/apple/coreml_supported_mlprogram_ops.md @@ -24,6 +24,7 @@ Keep in sync with doco generated from /docs/execution-providers/CoreML-Execution |ai.onnx:Reshape|| |ai.onnx:Resize|See [resize_op_builder.cc](https://github.com/microsoft/onnxruntime/blob/main/onnxruntime/core/providers/coreml/builders/impl/resize_op_builder.cc) implementation. There are too many permutations to describe the valid combinations.| |ai.onnx.Slice|starts/ends/axes/steps must be constant initializers.| +|ai.onnx:Split|| |ai.onnx:Sub|| |ai.onnx:Sigmoid|| |ai:onnx:Tanh|| From 82036b04978b7930185996a70d2146c2895469ea Mon Sep 17 00:00:00 2001 From: Changming Sun Date: Mon, 29 Jul 2024 21:59:16 -0700 Subject: [PATCH 56/57] Remove references to the outdated CUDA EP factory method (#21549) The function "OrtSessionOptionsAppendExecutionProvider_CUDA" is deprecated. 
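For context, a minimal sketch of the non-deprecated pattern (assumptions: error handling reduced to `Ort::ThrowOnError`, the helper name is made up, and the options struct is released afterwards on the assumption that Append copies what it needs):

```cpp
#include <onnxruntime_cxx_api.h>

// Hypothetical helper: attach the CUDA EP through the V2 provider options
// instead of the deprecated OrtSessionOptionsAppendExecutionProvider_CUDA.
Ort::SessionOptions MakeCudaSessionOptions() {
  Ort::SessionOptions session_options;
  OrtCUDAProviderOptionsV2* cuda_options = nullptr;
  Ort::ThrowOnError(Ort::GetApi().CreateCUDAProviderOptions(&cuda_options));
  session_options.AppendExecutionProvider_CUDA_V2(*cuda_options);
  Ort::GetApi().ReleaseCUDAProviderOptions(cuda_options);
  return session_options;
}
```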
--- .../global_thread_pools/test_inference.cc | 4 +++- onnxruntime/test/shared_lib/test_inference.cc | 20 ++++++++++++++----- .../test/shared_lib/test_model_loading.cc | 5 +++-- 3 files changed, 21 insertions(+), 8 deletions(-) diff --git a/onnxruntime/test/global_thread_pools/test_inference.cc b/onnxruntime/test/global_thread_pools/test_inference.cc index f553682975f11..c6d958536f488 100644 --- a/onnxruntime/test/global_thread_pools/test_inference.cc +++ b/onnxruntime/test/global_thread_pools/test_inference.cc @@ -74,7 +74,9 @@ static Ort::Session GetSessionObj(Ort::Env& env, T model_uri, int provider_type) if (provider_type == 1) { #ifdef USE_CUDA - Ort::ThrowOnError(OrtSessionOptionsAppendExecutionProvider_CUDA(session_options, 0)); + OrtCUDAProviderOptionsV2* options; + Ort::ThrowOnError(Ort::GetApi().CreateCUDAProviderOptions(&options)); + session_options.AppendExecutionProvider_CUDA_V2(*options); std::cout << "Running simple inference with cuda provider" << std::endl; #else return Ort::Session(nullptr); diff --git a/onnxruntime/test/shared_lib/test_inference.cc b/onnxruntime/test/shared_lib/test_inference.cc index 52491a179c2ce..7a33bf8a527cd 100644 --- a/onnxruntime/test/shared_lib/test_inference.cc +++ b/onnxruntime/test/shared_lib/test_inference.cc @@ -1959,7 +1959,9 @@ TEST(CApiTest, get_allocator_cpu) { #ifdef USE_CUDA TEST(CApiTest, get_allocator_cuda) { Ort::SessionOptions session_options; - Ort::ThrowOnError(OrtSessionOptionsAppendExecutionProvider_CUDA(session_options, 0)); + OrtCUDAProviderOptionsV2* options; + Ort::ThrowOnError(Ort::GetApi().CreateCUDAProviderOptions(&options)); + session_options.AppendExecutionProvider_CUDA_V2(*options); Ort::Session session(*ort_env, NAMED_AND_ANON_DIM_PARAM_URI, session_options); Ort::MemoryInfo info_cuda("Cuda", OrtAllocatorType::OrtArenaAllocator, 0, OrtMemTypeDefault); @@ -2076,7 +2078,9 @@ TEST(CApiTest, io_binding_cuda) { #ifdef USE_TENSORRT Ort::ThrowOnError(OrtSessionOptionsAppendExecutionProvider_Tensorrt(session_options, 0)); #else - Ort::ThrowOnError(OrtSessionOptionsAppendExecutionProvider_CUDA(session_options, 0)); + OrtCUDAProviderOptionsV2* options; + Ort::ThrowOnError(Ort::GetApi().CreateCUDAProviderOptions(&options)); + session_options.AppendExecutionProvider_CUDA_V2(*options); #endif Ort::Session session(*ort_env, MODEL_URI, session_options); @@ -3438,7 +3442,9 @@ TEST(CApiTest, AllocateInitializersFromNonArenaMemory) { Ort::SessionOptions session_options; #ifdef USE_CUDA - Ort::ThrowOnError(OrtSessionOptionsAppendExecutionProvider_CUDA(session_options, 0)); + OrtCUDAProviderOptionsV2* options; + Ort::ThrowOnError(Ort::GetApi().CreateCUDAProviderOptions(&options)); + session_options.AppendExecutionProvider_CUDA_V2(*options); #else // arena is enabled but the sole initializer will still be allocated from non-arena memory Ort::ThrowOnError(OrtSessionOptionsAppendExecutionProvider_CPU(session_options, 1)); @@ -3890,7 +3896,9 @@ TEST(CApiTest, GitHubIssue10179) { try { const auto* model_path = MODEL_URI; Ort::SessionOptions session_options{}; - Ort::ThrowOnError(OrtSessionOptionsAppendExecutionProvider_CUDA(session_options, 0)); + OrtCUDAProviderOptionsV2* options; + Ort::ThrowOnError(Ort::GetApi().CreateCUDAProviderOptions(&options)); + session_options.AppendExecutionProvider_CUDA_V2(*options); Ort::Session session{*ort_env, model_path, session_options}; } catch (const std::exception& e) { std::cerr << "exception: " << e.what() << "\n"; @@ -3920,7 +3928,9 @@ TEST(CApiTest, GitHubIssue10179) { TEST(CApiTest, 
TestCudaMemcpyToHostWithSequenceTensors) { const auto* model_path = SEQUENCE_MODEL_URI_2; Ort::SessionOptions session_options{}; - Ort::ThrowOnError(OrtSessionOptionsAppendExecutionProvider_CUDA(session_options, 0)); + OrtCUDAProviderOptionsV2* options; + Ort::ThrowOnError(Ort::GetApi().CreateCUDAProviderOptions(&options)); + session_options.AppendExecutionProvider_CUDA_V2(*options); Ort::Session session{*ort_env, model_path, session_options}; Ort::MemoryInfo info("Cpu", OrtDeviceAllocator, 0, OrtMemTypeDefault); diff --git a/onnxruntime/test/shared_lib/test_model_loading.cc b/onnxruntime/test/shared_lib/test_model_loading.cc index b7f6f7f4b9a77..5694398b9cb10 100644 --- a/onnxruntime/test/shared_lib/test_model_loading.cc +++ b/onnxruntime/test/shared_lib/test_model_loading.cc @@ -60,8 +60,9 @@ TEST(CApiTest, model_from_array) { create_session(so); #ifdef USE_CUDA - // test with CUDA provider when using onnxruntime as dll - Ort::ThrowOnError(OrtSessionOptionsAppendExecutionProvider_CUDA(so, 0)); + OrtCUDAProviderOptionsV2* options; + Ort::ThrowOnError(Ort::GetApi().CreateCUDAProviderOptions(&options)); + so.AppendExecutionProvider_CUDA_V2(*options); create_session(so); #endif } From 530a2d7b41b0584f67ddfef6679a79e9dbeee556 Mon Sep 17 00:00:00 2001 From: Yi-Hong Lyu Date: Tue, 30 Jul 2024 03:49:14 -0700 Subject: [PATCH 57/57] Enable FP16 Clip and Handle Bias in FP16 Depthwise Conv (#21493) - Improved accuracy for face-detection, image-classification, and object-detection in the GeekBench ML benchmark on ARM64. - Fixed issue https://github.com/microsoft/onnxruntime/issues/18992 --- docs/OperatorKernels.md | 4 +- onnxruntime/core/mlas/inc/mlas.h | 2 + onnxruntime/core/mlas/lib/dwconv.cpp | 32 +-- onnxruntime/core/mlas/lib/fp16_common.h | 17 ++ .../core/providers/cpu/fp16/fp16_conv.cc | 4 +- onnxruntime/core/providers/cpu/math/clip.cc | 2 +- .../test/providers/cpu/math/clip_test.cc | 18 ++ .../test/providers/cpu/nn/conv_fp16_test.cc | 237 +++++++++++++++++- .../test/providers/cpu/nn/conv_op_test.cc | 235 +++++++++++++++++ 9 files changed, 531 insertions(+), 20 deletions(-) diff --git a/docs/OperatorKernels.md b/docs/OperatorKernels.md index 211c53d0fecc8..f265c9f985070 100644 --- a/docs/OperatorKernels.md +++ b/docs/OperatorKernels.md @@ -58,8 +58,8 @@ Do not modify directly.* |Ceil|*in* X:**T**
<br> *out* Y:**T**|13+|**T** = tensor(double), tensor(float)|
|||[6, 12]|**T** = tensor(double), tensor(float)|
|Celu|*in* X:**T**<br> *out* Y:**T**|12+|**T** = tensor(float)|
-|Clip|*in* input:**T**<br> *in* min:**T**<br> *in* max:**T**<br> *out* output:**T**<br><br>or<br><br>*in* input:**T**<br> *out* output:**T**|13+|**T** = tensor(double), tensor(float), tensor(int32), tensor(int64), tensor(int8), tensor(uint32), tensor(uint64), tensor(uint8)|
-|||12|**T** = tensor(double), tensor(float), tensor(int32), tensor(int64), tensor(int8), tensor(uint32), tensor(uint64), tensor(uint8)|
+|Clip|*in* input:**T**<br> *in* min:**T**<br> *in* max:**T**<br> *out* output:**T**<br><br>or<br><br>*in* input:**T**<br> *out* output:**T**|13+|**T** = tensor(double), tensor(float), tensor(float16), tensor(int32), tensor(int64), tensor(int8), tensor(uint32), tensor(uint64), tensor(uint8)|
+|||12|**T** = tensor(double), tensor(float), tensor(float16), tensor(int32), tensor(int64), tensor(int8), tensor(uint32), tensor(uint64), tensor(uint8)|
|||11|**T** = tensor(float)|
|||[6, 10]|**T** = tensor(float)|
|Col2Im|*in* input:**T**<br> *in* image_shape:**tensor(int64)**<br> *in* block_shape:**tensor(int64)**<br>
*out* output:**T**|18+|**T** = tensor(float)| diff --git a/onnxruntime/core/mlas/inc/mlas.h b/onnxruntime/core/mlas/inc/mlas.h index 675f7c7a13e8c..e46105324a7fb 100644 --- a/onnxruntime/core/mlas/inc/mlas.h +++ b/onnxruntime/core/mlas/inc/mlas.h @@ -1751,6 +1751,7 @@ MlasSBGemmConvertPackB(size_t N, size_t K, const float* B, size_t ldb, void* Pac * @brief Indirect Depthwise convolution for fp16 * @param Input Supplies the indirect buffer for NHWC input * @param Filter Supplies the address for filter tensor + * @param Bias Supplies the address for 1D bias tensor B, has size of M * @param Output Supplies the address for the result tensor * @param Channels # of input channels * @param OutputCount # of output pixels @@ -1762,6 +1763,7 @@ MLASCALL MlasConvDepthwise( const MLAS_FP16* const* Input, const MLAS_FP16* Filter, + const MLAS_FP16* Bias, MLAS_FP16* Output, size_t Channels, size_t OutputCount, diff --git a/onnxruntime/core/mlas/lib/dwconv.cpp b/onnxruntime/core/mlas/lib/dwconv.cpp index 15511d2d8ceac..d48d9cbb17502 100644 --- a/onnxruntime/core/mlas/lib/dwconv.cpp +++ b/onnxruntime/core/mlas/lib/dwconv.cpp @@ -14,7 +14,6 @@ Module Name: --*/ - #include "fp16_common.h" #ifdef MLAS_F16VEC_INTRINSICS_SUPPORTED @@ -24,19 +23,20 @@ void MlasConvDepthwiseKernel( const _mlas_fp16_* const* Input, const _mlas_fp16_* Filter, + const _mlas_fp16_* Bias, _mlas_fp16_* Output, size_t Channels, size_t OutputCount, size_t KernelSize, MLAS_HALF_GEMM_POSTPROCESSOR* PostProc - ) +) { while (OutputCount > 0) { size_t ChannelOffset = 0; size_t c = Channels; while (c >= 8) { - MLAS_FLOAT16X8 Accumulator = MlasZeroFloat16x8(); + MLAS_FLOAT16X8 Accumulator = Bias == nullptr ? MlasZeroFloat16x8() : MlasLoadFloat16x8(&Bias[ChannelOffset]); size_t ChannelKernelOffset = ChannelOffset; for (size_t k = 0; k < KernelSize; k++) { @@ -54,7 +54,7 @@ MlasConvDepthwiseKernel( } if (c >= 4) { - MLAS_FLOAT16X4 Accumulator = MlasZeroFloat16x4(); + MLAS_FLOAT16X4 Accumulator = Bias == nullptr ? MlasZeroFloat16x4() : MlasLoadFloat16x4(&Bias[ChannelOffset]); size_t ChannelKernelOffset = ChannelOffset; for (size_t k = 0; k < KernelSize; k++) { @@ -72,7 +72,8 @@ MlasConvDepthwiseKernel( } if (c > 0) { - MLAS_FLOAT16X4 Accumulator = MlasZeroFloat16x4(); + MLAS_FLOAT16X4 Accumulator = + Bias == nullptr ? MlasZeroFloat16x4() : MlasLoadPartialFloat16x4(&Bias[ChannelOffset], c); size_t ChannelKernelOffset = ChannelOffset; for (size_t k = 0; k < KernelSize; k++) { @@ -86,8 +87,7 @@ MlasConvDepthwiseKernel( Output += c; } if (PostProc) { - PostProc->Process(reinterpret_cast(Output - Channels), 0, 0, 1, Channels, - Channels); + PostProc->Process(reinterpret_cast(Output - Channels), 0, 0, 1, Channels, Channels); } Input += KernelSize; OutputCount -= 1; @@ -101,16 +101,17 @@ void MlasConvDepthwiseKernel( const _mlas_fp16_* const* Input, const _mlas_fp16_* Filter, + const _mlas_fp16_* Bias, _mlas_fp16_* Output, size_t Channels, size_t OutputCount, size_t KernelSize, MLAS_HALF_GEMM_POSTPROCESSOR* PostProc - ) +) { while (OutputCount > 0) { for (size_t ChannelOffset = 0; ChannelOffset < Channels; ChannelOffset++) { - float Accumulator = 0.0f; + float Accumulator = Bias == nullptr ? 
0.0f : MLAS_Half2Float(Bias[ChannelOffset]); size_t ChannelKernelOffset = ChannelOffset; for (size_t k = 0; k < KernelSize; k++) { @@ -120,35 +121,36 @@ MlasConvDepthwiseKernel( *Output++ = MLAS_Float2Half(Accumulator); } if (PostProc) { - PostProc->Process(reinterpret_cast(Output - Channels), 0, 0, 1, Channels, - Channels); + PostProc->Process(reinterpret_cast(Output - Channels), 0, 0, 1, Channels, Channels); } Input += KernelSize; OutputCount -= 1; } } -#endif // MLAS_F16VEC_INTRINSICS_SUPPORTED - +#endif // MLAS_F16VEC_INTRINSICS_SUPPORTED void MLASCALL MlasConvDepthwise( const MLAS_FP16* const* Input, const MLAS_FP16* Filter, + const MLAS_FP16* Bias, MLAS_FP16* Output, size_t Channels, size_t OutputCount, size_t KernelSize, MLAS_HALF_GEMM_POSTPROCESSOR* PostProc - ) +) { MlasConvDepthwiseKernel( reinterpret_cast(Input), reinterpret_cast(Filter), + reinterpret_cast(Bias), reinterpret_cast<_mlas_fp16_*>(Output), Channels, OutputCount, KernelSize, - PostProc); + PostProc + ); } diff --git a/onnxruntime/core/mlas/lib/fp16_common.h b/onnxruntime/core/mlas/lib/fp16_common.h index 1fcab870af64f..30b66cdb2ea78 100644 --- a/onnxruntime/core/mlas/lib/fp16_common.h +++ b/onnxruntime/core/mlas/lib/fp16_common.h @@ -64,6 +64,23 @@ MLAS_FORCEINLINE MLAS_FLOAT16X4 MlasLoadFloat16x4(const _mlas_fp16_* Buffer) { return vreinterpret_f16_u16(vld1_u16(Buffer)); } +MLAS_FORCEINLINE +MLAS_FLOAT16X4 +MlasLoadPartialFloat16x4(const _mlas_fp16_* Buffer, size_t len) +{ + MLAS_FLOAT16X4 Vector = MlasZeroFloat16x4(); + if ((len & 1) != 0) { + Vector = vreinterpret_f16_u16(vld1_lane_u16(Buffer + (len - 1), vreinterpret_u16_f16(Vector), 0)); + } + if ((len & 2) != 0) { + Vector = vreinterpret_f16_f32(vdup_lane_f32(vreinterpret_f32_f16(Vector), 0)); + Vector = vreinterpret_f16_f32( + vld1_lane_f32(reinterpret_cast(Buffer), vreinterpret_f32_f16(Vector), 0) + ); + } + return Vector; +} + MLAS_FORCEINLINE void MlasStoreFloat16x8(_mlas_fp16_* Buffer, MLAS_FLOAT16X8 Vector) diff --git a/onnxruntime/core/providers/cpu/fp16/fp16_conv.cc b/onnxruntime/core/providers/cpu/fp16/fp16_conv.cc index e6867f10819ae..37db095e92570 100644 --- a/onnxruntime/core/providers/cpu/fp16/fp16_conv.cc +++ b/onnxruntime/core/providers/cpu/fp16/fp16_conv.cc @@ -139,8 +139,9 @@ Status FusedConvFp16::PrePack(const Tensor& tensor, int input_idx, AllocatorPtr bool share_prepacked_weights = (prepacked_weights != nullptr); + const bool is_depthwise_conv = (group_input_channels == 1 && group_output_channels == 1); // Don't pack the filter buffer if the MlasConvDepthwise path is used. 
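As a reading aid for the MLAS changes above (the PrePack hunk continues directly below), here is a hedged fp32 reference, not the MLAS kernel itself, of what the depthwise path computes per output pixel now that the bias pointer is threaded through; the function name and the use of fp32 are illustrative only:

#include <cstddef>

// Hedged reference of the MlasConvDepthwise contract after this change:
// "input" is the indirection buffer, one pointer per (output pixel, kernel position),
// each pointing at a row of `channels` values; "filter" is laid out [kernel_size][channels];
// "bias" has `channels` entries or is nullptr. The accumulator now starts at bias[c] instead of zero.
static void DepthwiseConvReference(const float* const* input, const float* filter,
                                   const float* bias, float* output,
                                   size_t channels, size_t output_count, size_t kernel_size) {
  for (size_t o = 0; o < output_count; ++o) {
    for (size_t c = 0; c < channels; ++c) {
      float acc = (bias == nullptr) ? 0.0f : bias[c];
      for (size_t k = 0; k < kernel_size; ++k) {
        acc += input[o * kernel_size + k][c] * filter[k * channels + c];
      }
      output[o * channels + c] = acc;
    }
  }
}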
- if (!(group_input_channels == 1 && group_output_channels == 1)) { + if (!is_depthwise_conv) { packed_W_size_ = MlasHalfGemmPackBSize(group_output_channels, kernel_dim, false); if (packed_W_size_ != 0) { size_t packed_W_data_size = SafeInt(group_count) * packed_W_size_; @@ -472,6 +473,7 @@ Status FusedConvFp16::Compute(OpKernelContext* context) const { MlasConvDepthwise( worker_indirection_buffer, reordered_W, + Bdata, worker_output, static_cast(M), static_cast(output_count), diff --git a/onnxruntime/core/providers/cpu/math/clip.cc b/onnxruntime/core/providers/cpu/math/clip.cc index ddb64a5a0e461..200469bc47835 100644 --- a/onnxruntime/core/providers/cpu/math/clip.cc +++ b/onnxruntime/core/providers/cpu/math/clip.cc @@ -23,7 +23,7 @@ ORT_SPECIFY_OP_KERNEL_ARG_DEFAULT_TYPES( float); ORT_SPECIFY_OP_KERNEL_ARG_DEFAULT_TYPES( kCpuExecutionProvider, kOnnxDomain, Clip, 12, Input, 0, - float, double, int8_t, uint8_t, int32_t, uint32_t, int64_t, uint64_t); + float, MLFloat16, double, int8_t, uint8_t, int32_t, uint32_t, int64_t, uint64_t); } // namespace op_kernel_type_control using EnabledClip11Types = ORT_OP_KERNEL_ARG_ENABLED_TYPE_LIST( diff --git a/onnxruntime/test/providers/cpu/math/clip_test.cc b/onnxruntime/test/providers/cpu/math/clip_test.cc index 6f81bbbe31d54..9948a6cc8a681 100644 --- a/onnxruntime/test/providers/cpu/math/clip_test.cc +++ b/onnxruntime/test/providers/cpu/math/clip_test.cc @@ -119,6 +119,24 @@ TEST(MathOpTest, Clip_Default_uint64) { test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider}); } +TEST(MathOpTest, Clip_MLFloat16) { + OpTester test("Clip", 12); + + std::vector dims{3, 3}; + test.AddInput("X", dims, + {MLFloat16(-1.0f), MLFloat16(-2.0f), MLFloat16(-3.0f), + MLFloat16(-4.0f), MLFloat16(0.0f), MLFloat16(2.0f), + MLFloat16(4.0f), MLFloat16(6.0f), MLFloat16(8.0f)}); + test.AddInput("min", {}, {MLFloat16(0.0f)}); + test.AddInput("max", {}, {MLFloat16(6.0f)}); + test.AddOutput("Y", dims, + {MLFloat16(0.0f), MLFloat16(0.0f), MLFloat16(0.0f), + MLFloat16(0.0f), MLFloat16(0.0f), MLFloat16(2.0f), + MLFloat16(4.0f), MLFloat16(6.0f), MLFloat16(6.0f)}); + + test.Run(); +} + TEST(MathOpTest, Clip_int32) { OpTester test("Clip", 12); diff --git a/onnxruntime/test/providers/cpu/nn/conv_fp16_test.cc b/onnxruntime/test/providers/cpu/nn/conv_fp16_test.cc index cb5fc8095982c..95b274966fbbb 100644 --- a/onnxruntime/test/providers/cpu/nn/conv_fp16_test.cc +++ b/onnxruntime/test/providers/cpu/nn/conv_fp16_test.cc @@ -714,6 +714,241 @@ TEST(ConvFp16Test, Conv2D_group) { TestConvFp16Op(attrs, {X, W}, {X_shape, W_shape}, expected_vals, Y_shape, true); } +TEST(ConvFp16Test, Depthwise2D_Bias_Group1_Issue18992) { + ConvOpAndTestAttributes attrs = { + "", // auto_pad + vector{1, 1}, // dilations + 1, // group + vector{1, 1}, // kernel_shape + vector{0, 0, 0, 0}, // pads + vector{1, 1}, // strides + {} // excluded EPs + }; + + vector X = {MLFloat16(1.0f)}; + vector X_shape = {1, 1, 1, 1}; + vector W = {MLFloat16(0.5f)}; + vector W_shape = {1, 1, 1, 1}; + vector B = {MLFloat16(0.5f)}; + vector B_shape = {1}; + vector Y_shape = {1, 1, 1, 1}; + auto expected_vals = {MLFloat16(1.0f)}; + + TestConvFp16Op(attrs, {X, W, B}, {X_shape, W_shape, B_shape}, expected_vals, Y_shape); + TestConvFp16Op(attrs, {X, W, B}, {X_shape, W_shape, B_shape}, expected_vals, Y_shape, true); +} + +TEST(ConvFp16Test, Depthwise2D_Bias_Group2) { + ConvOpAndTestAttributes attrs = { + "", // auto_pad + vector{1, 1}, // dilations + 2, // group + vector{1, 1}, // kernel_shape + vector{0, 0, 0, 0}, // 
pads + vector{1, 1}, // strides + {} // excluded EPs + }; + + vector X = { + MLFloat16(0.0f), MLFloat16(1.0f), MLFloat16(2.0f), + MLFloat16(3.0f), MLFloat16(4.0f), MLFloat16(5.0f), + MLFloat16(6.0f), MLFloat16(7.0f), MLFloat16(8.0f), + + MLFloat16(9.0f), MLFloat16(10.0f), MLFloat16(11.0f), + MLFloat16(12.0f), MLFloat16(13.0f), MLFloat16(14.0f), + MLFloat16(15.0f), MLFloat16(16.0f), MLFloat16(17.0f)}; + vector X_shape = {1, 2, 3, 3}; + vector W = {MLFloat16(1.0f), MLFloat16(2.0f)}; + vector W_shape = {2, 1, 1, 1}; + vector B = {MLFloat16(1.0f), MLFloat16(-1.0f)}; + vector B_shape = {2}; + vector Y_shape = {1, 2, 3, 3}; + auto expected_vals = { + MLFloat16(1.0f), MLFloat16(2.0f), MLFloat16(3.0f), + MLFloat16(4.0f), MLFloat16(5.0f), MLFloat16(6.0f), + MLFloat16(7.0f), MLFloat16(8.0f), MLFloat16(9.0f), + + MLFloat16(17.0f), MLFloat16(19.0f), MLFloat16(21.0f), + MLFloat16(23.0f), MLFloat16(25.0f), MLFloat16(27.0f), + MLFloat16(29.0f), MLFloat16(31.0f), MLFloat16(33.0f)}; + + TestConvFp16Op(attrs, {X, W, B}, {X_shape, W_shape, B_shape}, expected_vals, Y_shape); + TestConvFp16Op(attrs, {X, W, B}, {X_shape, W_shape, B_shape}, expected_vals, Y_shape, true); +} + +TEST(ConvFp16Test, Depthwise2D_Bias_Group15) { + ConvOpAndTestAttributes attrs = { + "", // auto_pad + vector{1, 1}, // dilations + 15, // group + vector{2, 2}, // kernel_shape + vector{0, 0, 0, 0}, // pads + vector{1, 1}, // strides + {} // excluded EPs + }; + + vector X = { + // C = 0 + MLFloat16(0.0f), MLFloat16(1.0f), + MLFloat16(2.0f), MLFloat16(3.0f), + + // C = 1 + MLFloat16(4.0f), MLFloat16(5.0f), + MLFloat16(6.0f), MLFloat16(7.0f), + + // C = 2 + MLFloat16(8.0f), MLFloat16(9.0f), + MLFloat16(10.0f), MLFloat16(11.0f), + + // C = 3 + MLFloat16(12.0f), MLFloat16(13.0f), + MLFloat16(14.0f), MLFloat16(15.0f), + + // C = 4 + MLFloat16(16.0f), MLFloat16(17.0f), + MLFloat16(18.0f), MLFloat16(19.0f), + + // C = 5 + MLFloat16(20.0f), MLFloat16(21.0f), + MLFloat16(22.0f), MLFloat16(23.0f), + + // C = 6 + MLFloat16(24.0f), MLFloat16(25.0f), + MLFloat16(26.0f), MLFloat16(27.0f), + + // C = 7 + MLFloat16(28.0f), MLFloat16(29.0f), + MLFloat16(30.0f), MLFloat16(31.0f), + + // C = 8 + MLFloat16(32.0f), MLFloat16(33.0f), + MLFloat16(34.0f), MLFloat16(35.0f), + + // C = 9 + MLFloat16(36.0f), MLFloat16(37.0f), + MLFloat16(38.0f), MLFloat16(39.0f), + + // C = 10 + MLFloat16(40.0f), MLFloat16(41.0f), + MLFloat16(42.0f), MLFloat16(43.0f), + + // C = 11 + MLFloat16(44.0f), MLFloat16(45.0f), + MLFloat16(46.0f), MLFloat16(47.0f), + + // C = 12 + MLFloat16(48.0f), MLFloat16(49.0f), + MLFloat16(50.0f), MLFloat16(51.0f), + + // C = 13 + MLFloat16(52.0f), MLFloat16(53.0f), + MLFloat16(54.0f), MLFloat16(55.0f), + + // C = 14 + MLFloat16(56.0f), MLFloat16(57.0f), + MLFloat16(58.0f), MLFloat16(59.0f)}; + vector X_shape = {1, 15, 2, 2}; + vector W = { + // M = 0 + MLFloat16(0.0f), MLFloat16(1.0f), + MLFloat16(2.0f), MLFloat16(3.0f), + + // M = 1 + MLFloat16(4.0f), MLFloat16(5.0f), + MLFloat16(6.0f), MLFloat16(7.0f), + + // M = 2 + MLFloat16(8.0f), MLFloat16(9.0f), + MLFloat16(10.0f), MLFloat16(11.0f), + + // M = 3 + MLFloat16(12.0f), MLFloat16(13.0f), + MLFloat16(14.0f), MLFloat16(15.0f), + + // M = 4 + MLFloat16(16.0f), MLFloat16(17.0f), + MLFloat16(18.0f), MLFloat16(19.0f), + + // M = 5 + MLFloat16(20.0f), MLFloat16(21.0f), + MLFloat16(22.0f), MLFloat16(23.0f), + + // M = 6 + MLFloat16(24.0f), MLFloat16(25.0f), + MLFloat16(26.0f), MLFloat16(27.0f), + + // M = 7 + MLFloat16(28.0f), MLFloat16(29.0f), + MLFloat16(30.0f), MLFloat16(31.0f), + + // M = 8 + 
MLFloat16(32.0f), MLFloat16(33.0f), + MLFloat16(34.0f), MLFloat16(35.0f), + + // M = 9 + MLFloat16(36.0f), MLFloat16(37.0f), + MLFloat16(38.0f), MLFloat16(39.0f), + + // M = 10 + MLFloat16(40.0f), MLFloat16(41.0f), + MLFloat16(42.0f), MLFloat16(43.0f), + + // M = 11 + MLFloat16(44.0f), MLFloat16(45.0f), + MLFloat16(46.0f), MLFloat16(47.0f), + + // M = 12 + MLFloat16(48.0f), MLFloat16(49.0f), + MLFloat16(50.0f), MLFloat16(51.0f), + + // M = 13 + MLFloat16(52.0f), MLFloat16(53.0f), + MLFloat16(54.0f), MLFloat16(55.0f), + + // M = 14 + MLFloat16(56.0f), MLFloat16(57.0f), + MLFloat16(58.0f), MLFloat16(59.0f)}; + vector W_shape = {15, 1, 2, 2}; + vector B = { + MLFloat16(101.0f), + MLFloat16(102.0f), + MLFloat16(103.0f), + MLFloat16(104.0f), + MLFloat16(105.0f), + MLFloat16(106.0f), + MLFloat16(107.0f), + MLFloat16(108.0f), + MLFloat16(109.0f), + MLFloat16(110.0f), + MLFloat16(111.0f), + MLFloat16(112.0f), + MLFloat16(113.0f), + MLFloat16(114.0f), + MLFloat16(115.0f)}; + vector B_shape = {15}; + vector Y_shape = {1, 15, 1, 1}; + auto expected_vals = { + MLFloat16(115.0f), // 0.0*0.0 + 1.0*1.0 + 2.0*2.0 + 3.0*3.0 + 101.0 + MLFloat16(228.0f), + MLFloat16(469.0f), + MLFloat16(838.0f), + MLFloat16(1335.0f), + MLFloat16(1960.0f), + MLFloat16(2713.0f), // 24.0*24.0 + 25.0*25.0 + 26.0*26.0 + 27.0*27.0 + 107.0 + MLFloat16(3594.0f), + MLFloat16(4603.0f), + MLFloat16(5740.0f), + MLFloat16(7005.0f), + MLFloat16(8398.0f), + MLFloat16(9919.0f), // 48.0*48.0 + 49.0*49.0 + 50.0*50.0 + 51.0*51.0 + 113.0 + MLFloat16(11568.0f), // 52.0*52.0 + 53.0*53.0 + 54.0*54.0 + 55.0*55.0 + 114.0 + MLFloat16(13345.0f) // 56.0*56.0 + 57.0*57.0 + 58.0*58.0 + 59.0*59.0 + 115.0 + }; + + TestConvFp16Op(attrs, {X, W, B}, {X_shape, W_shape, B_shape}, expected_vals, Y_shape); + TestConvFp16Op(attrs, {X, W, B}, {X_shape, W_shape, B_shape}, expected_vals, Y_shape, true); +} + TEST(ConvFp16Test, ConvDimWithZero) { ConvOpAndTestAttributes attrs = { "", // auto_pad @@ -1074,4 +1309,4 @@ TEST(ConvFp16Test, SharedPrepackedWeights) { } // namespace test } // namespace onnxruntime -#endif // MLAS_F16VEC_INTRINSICS_SUPPORTED \ No newline at end of file +#endif // MLAS_F16VEC_INTRINSICS_SUPPORTED diff --git a/onnxruntime/test/providers/cpu/nn/conv_op_test.cc b/onnxruntime/test/providers/cpu/nn/conv_op_test.cc index 0efa78af2795c..2d885ee9d479f 100644 --- a/onnxruntime/test/providers/cpu/nn/conv_op_test.cc +++ b/onnxruntime/test/providers/cpu/nn/conv_op_test.cc @@ -647,6 +647,241 @@ TEST(ConvTest, Conv2D_group) { TestConvOp(attrs, {X, W}, {X_shape, W_shape}, expected_vals, Y_shape, true); } +TEST(ConvTest, Depthwise2D_Bias_Group1_Issue18992) { + ConvOpAndTestAttributes attrs = { + "", // auto_pad + vector{1, 1}, // dilations + 1, // group + vector{1, 1}, // kernel_shape + vector{0, 0, 0, 0}, // pads + vector{1, 1}, // strides + {} // excluded EPs + }; + + vector X = {1.0f}; + vector X_shape = {1, 1, 1, 1}; + vector W = {0.5f}; + vector W_shape = {1, 1, 1, 1}; + vector B = {0.5f}; + vector B_shape = {1}; + vector Y_shape = {1, 1, 1, 1}; + auto expected_vals = {1.0f}; + + TestConvOp(attrs, {X, W, B}, {X_shape, W_shape, B_shape}, expected_vals, Y_shape); + TestConvOp(attrs, {X, W, B}, {X_shape, W_shape, B_shape}, expected_vals, Y_shape, true); +} + +TEST(ConvTest, Depthwise2D_Bias_Group2) { + ConvOpAndTestAttributes attrs = { + "", // auto_pad + vector{1, 1}, // dilations + 2, // group + vector{1, 1}, // kernel_shape + vector{0, 0, 0, 0}, // pads + vector{1, 1}, // strides + {} // excluded EPs + }; + + vector X = { + 0.0f, 1.0f, 2.0f, + 3.0f, 
4.0f, 5.0f, + 6.0f, 7.0f, 8.0f, + + 9.0f, 10.0f, 11.0f, + 12.0f, 13.0f, 14.0f, + 15.0f, 16.0f, 17.0f}; + vector X_shape = {1, 2, 3, 3}; + vector W = {1.0f, 2.0f}; + vector W_shape = {2, 1, 1, 1}; + vector B = {1.0f, -1.0f}; + vector B_shape = {2}; + vector Y_shape = {1, 2, 3, 3}; + auto expected_vals = { + 1.0f, 2.0f, 3.0f, + 4.0f, 5.0f, 6.0f, + 7.0f, 8.0f, 9.0f, + + 17.0f, 19.0f, 21.0f, + 23.0f, 25.0f, 27.0f, + 29.0f, 31.0f, 33.0f}; + + TestConvOp(attrs, {X, W, B}, {X_shape, W_shape, B_shape}, expected_vals, Y_shape); + TestConvOp(attrs, {X, W, B}, {X_shape, W_shape, B_shape}, expected_vals, Y_shape, true); +} + +TEST(ConvTest, Depthwise2D_Bias_Group15) { + ConvOpAndTestAttributes attrs = { + "", // auto_pad + vector{1, 1}, // dilations + 15, // group + vector{2, 2}, // kernel_shape + vector{0, 0, 0, 0}, // pads + vector{1, 1}, // strides + {} // excluded EPs + }; + + vector X = { + // C = 0 + 0.0f, 1.0f, + 2.0f, 3.0f, + + // C = 1 + 4.0f, 5.0f, + 6.0f, 7.0f, + + // C = 2 + 8.0f, 9.0f, + 10.0f, 11.0f, + + // C = 3 + 12.0f, 13.0f, + 14.0f, 15.0f, + + // C = 4 + 16.0f, 17.0f, + 18.0f, 19.0f, + + // C = 5 + 20.0f, 21.0f, + 22.0f, 23.0f, + + // C = 6 + 24.0f, 25.0f, + 26.0f, 27.0f, + + // C = 7 + 28.0f, 29.0f, + 30.0f, 31.0f, + + // C = 8 + 32.0f, 33.0f, + 34.0f, 35.0f, + + // C = 9 + 36.0f, 37.0f, + 38.0f, 39.0f, + + // C = 10 + 40.0f, 41.0f, + 42.0f, 43.0f, + + // C = 11 + 44.0f, 45.0f, + 46.0f, 47.0f, + + // C = 12 + 48.0f, 49.0f, + 50.0f, 51.0f, + + // C = 13 + 52.0f, 53.0f, + 54.0f, 55.0f, + + // C = 14 + 56.0f, 57.0f, + 58.0f, 59.0f}; + vector X_shape = {1, 15, 2, 2}; + vector W = { + // M = 0 + 0.0f, 1.0f, + 2.0f, 3.0f, + + // M = 1 + 4.0f, 5.0f, + 6.0f, 7.0f, + + // M = 2 + 8.0f, 9.0f, + 10.0f, 11.0f, + + // M = 3 + 12.0f, 13.0f, + 14.0f, 15.0f, + + // M = 4 + 16.0f, 17.0f, + 18.0f, 19.0f, + + // M = 5 + 20.0f, 21.0f, + 22.0f, 23.0f, + + // M = 6 + 24.0f, 25.0f, + 26.0f, 27.0f, + + // M = 7 + 28.0f, 29.0f, + 30.0f, 31.0f, + + // M = 8 + 32.0f, 33.0f, + 34.0f, 35.0f, + + // M = 9 + 36.0f, 37.0f, + 38.0f, 39.0f, + + // M = 10 + 40.0f, 41.0f, + 42.0f, 43.0f, + + // M = 11 + 44.0f, 45.0f, + 46.0f, 47.0f, + + // M = 12 + 48.0f, 49.0f, + 50.0f, 51.0f, + + // M = 13 + 52.0f, 53.0f, + 54.0f, 55.0f, + + // M = 14 + 56.0f, 57.0f, + 58.0f, 59.0f}; + vector W_shape = {15, 1, 2, 2}; + vector B = { + 101.0f, + 102.0f, + 103.0f, + 104.0f, + 105.0f, + 106.0f, + 107.0f, + 108.0f, + 109.0f, + 110.0f, + 111.0f, + 112.0f, + 113.0f, + 114.0f, + 115.0f}; + vector B_shape = {15}; + vector Y_shape = {1, 15, 1, 1}; + auto expected_vals = { + 115.0f, // 0.0*0.0 + 1.0*1.0 + 2.0*2.0 + 3.0*3.0 + 101.0 + 228.0f, + 469.0f, + 838.0f, + 1335.0f, + 1960.0f, + 2713.0f, // 24.0*24.0 + 25.0*25.0 + 26.0*26.0 + 27.0*27.0 + 107.0 + 3594.0f, + 4603.0f, + 5740.0f, + 7005.0f, + 8398.0f, + 9919.0f, // 48.0*48.0 + 49.0*49.0 + 50.0*50.0 + 51.0*51.0 + 113.0 + 11568.0f, // 52.0*52.0 + 53.0*53.0 + 54.0*54.0 + 55.0*55.0 + 114.0 + 13345.0f // 56.0*56.0 + 57.0*57.0 + 58.0*58.0 + 59.0*59.0 + 115.0 + }; + + TestConvOp(attrs, {X, W, B}, {X_shape, W_shape, B_shape}, expected_vals, Y_shape); + TestConvOp(attrs, {X, W, B}, {X_shape, W_shape, B_shape}, expected_vals, Y_shape, true); +} + TEST(ConvTest, ConvDimWithZero) { ConvOpAndTestAttributes attrs = { "", // auto_pad