From 9c2b85ad5890030b154d832ff17b620f32a0ffc4 Mon Sep 17 00:00:00 2001
From: Edward Chen <18449977+edgchen1@users.noreply.github.com>
Date: Mon, 15 Jul 2024 12:29:02 -0700
Subject: [PATCH 01/35] Fix Android build on Windows (#21304)

- Pass a list of files instead of a path separator-delimited string to project.files(). See this issue: https://github.com/gradle/gradle/issues/19817
- Check for the host (instead of the target) being Windows when using the fallback patch program.
---
 cmake/CMakeLists.txt | 2 +-
 java/build-android.gradle | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/cmake/CMakeLists.txt b/cmake/CMakeLists.txt
index e2fc3da9de35e..fb9f2f8f32b34 100644
--- a/cmake/CMakeLists.txt
+++ b/cmake/CMakeLists.txt
@@ -551,7 +551,7 @@ if(NOT WIN32 AND NOT CMAKE_SYSTEM_NAME STREQUAL "Android")
 endif()
 find_package(Patch)
-if (WIN32 AND NOT Patch_FOUND)
+if (CMAKE_HOST_WIN32 AND NOT Patch_FOUND)
   # work around CI machines missing patch from the git install by falling back to the binary in this repo.
   # replicate what happens in https://github.com/Kitware/CMake/blob/master/Modules/FindPatch.cmake but without
   # the hardcoded suffixes in the path to the patch binary.
diff --git a/java/build-android.gradle b/java/build-android.gradle
index afbad9f03d08d..fd22fa27e8db9 100644
--- a/java/build-android.gradle
+++ b/java/build-android.gradle
@@ -105,7 +105,7 @@ task sourcesJar(type: Jar) {
 task javadoc(type: Javadoc) {
     source = android.sourceSets.main.java.srcDirs
-    classpath += project.files(android.getBootClasspath().join(File.pathSeparator))
+    classpath += project.files(android.getBootClasspath())
 }
 task javadocJar(type: Jar, dependsOn: javadoc) {

From e5f18ba2c14ced91e5f483fde0a7ef4b3b04abbe Mon Sep 17 00:00:00 2001
From: Changming Sun
Date: Mon, 15 Jul 2024 14:21:34 -0700
Subject: [PATCH 02/35] Change libonnxruntime.so's SONAME: remove the minor and patch version. (#21339)

### Description
Resolve #21281 and #10589.
1. Change libonnxruntime.so's SONAME: remove the minor and patch version. By default, when creating an ELF shared object, the linker sets the file's internal DT_SONAME field to the specified name, which is the file name plus the SOVERSION. For example, the file name for our library is libonnxruntime.so, and by default SOVERSION is the lib's VERSION number, which is something like 1.19.0. So the DT_SONAME field in libonnxruntime.so is something like libonnxruntime.so.1.18.0. You can use the readelf tool to examine it.
```
readelf -d libonnxruntime.so | grep SONAME
0x000000000000000e (SONAME) Library soname: [libonnxruntime.so.1.18.0]
```
When an executable is linked with a shared object that has a DT_SONAME field, then when the executable is run, the dynamic linker will attempt to load the shared object specified by the DT_SONAME field rather than using the file name (which is libonnxruntime.so) given to the linker. After this change, the SONAME will be shortened to "libonnxruntime.so.1" instead.
2. 
Set default version strings for Windows DLLs, to resolve #10589 --- cmake/CMakeLists.txt | 22 ++++++++++++++++ cmake/onnxruntime.cmake | 30 ++++++++++++---------- cmake/onnxruntime_providers_cpu.cmake | 5 ---- cmake/onnxruntime_providers_openvino.cmake | 5 ---- cmake/winml.cmake | 5 ---- 5 files changed, 39 insertions(+), 28 deletions(-) diff --git a/cmake/CMakeLists.txt b/cmake/CMakeLists.txt index fb9f2f8f32b34..5db0466497ad1 100644 --- a/cmake/CMakeLists.txt +++ b/cmake/CMakeLists.txt @@ -1140,6 +1140,13 @@ endfunction() function(onnxruntime_add_shared_library target_name) add_library(${target_name} SHARED ${ARGN}) onnxruntime_configure_target(${target_name}) + if(WIN32) + target_compile_definitions(${target_name} PRIVATE VER_MAJOR=${VERSION_MAJOR_PART}) + target_compile_definitions(${target_name} PRIVATE VER_MINOR=${VERSION_MINOR_PART}) + target_compile_definitions(${target_name} PRIVATE VER_BUILD=${VERSION_BUILD_PART}) + target_compile_definitions(${target_name} PRIVATE VER_PRIVATE=${VERSION_PRIVATE_PART}) + target_compile_definitions(${target_name} PRIVATE VER_STRING=\"${VERSION_STRING}\") + endif() endfunction() function(onnxruntime_add_static_library target_name) @@ -1154,6 +1161,13 @@ function(onnxruntime_add_shared_library_module target_name) else() #On Windows, this target shouldn't generate an import lib, but I don't know how to disable it. add_library(${target_name} MODULE ${ARGN}) + if(WIN32) + target_compile_definitions(${target_name} PRIVATE VER_MAJOR=${VERSION_MAJOR_PART}) + target_compile_definitions(${target_name} PRIVATE VER_MINOR=${VERSION_MINOR_PART}) + target_compile_definitions(${target_name} PRIVATE VER_BUILD=${VERSION_BUILD_PART}) + target_compile_definitions(${target_name} PRIVATE VER_PRIVATE=${VERSION_PRIVATE_PART}) + target_compile_definitions(${target_name} PRIVATE VER_STRING=\"${VERSION_STRING}\") + endif() endif() onnxruntime_configure_target(${target_name}) @@ -1636,6 +1650,14 @@ set(VERSION_MINOR_PART 0 CACHE STRING "Second part of numeric file/product ver set(VERSION_BUILD_PART 0 CACHE STRING "Third part of numeric file/product version.") set(VERSION_PRIVATE_PART 0 CACHE STRING "Fourth part of numeric file/product version.") set(VERSION_STRING "Internal Build" CACHE STRING "String representation of file/product version.") +if(VERSION_MAJOR_PART STREQUAL "0" AND VERSION_MINOR_PART STREQUAL "0" AND VERSION_BUILD_PART STREQUAL "0" AND VERSION_PRIVATE_PART STREQUAL "0") + string(REPLACE "." ";" ORT_VERSION_STRING_LIST ${ORT_VERSION}) + list(GET ORT_VERSION_STRING_LIST 0 VERSION_MAJOR_PART) + list(GET ORT_VERSION_STRING_LIST 1 VERSION_MINOR_PART) + list(GET ORT_VERSION_STRING_LIST 2 VERSION_BUILD_PART) + set(VERSION_STRING ORT_VERSION) +endif() + if (WIN32) list(APPEND onnxruntime_EXTERNAL_LIBRARIES ${SYS_PATH_LIB}) diff --git a/cmake/onnxruntime.cmake b/cmake/onnxruntime.cmake index ec98047750a91..aebf9b53f8f05 100644 --- a/cmake/onnxruntime.cmake +++ b/cmake/onnxruntime.cmake @@ -95,7 +95,6 @@ elseif(onnxruntime_BUILD_APPLE_FRAMEWORK) FRAMEWORK TRUE FRAMEWORK_VERSION A MACOSX_FRAMEWORK_INFO_PLIST ${INFO_PLIST_PATH} - SOVERSION ${ORT_VERSION} # Note: The PUBLIC_HEADER and VERSION properties for the 'onnxruntime' target will be set later in this file. 
) else() @@ -108,11 +107,7 @@ endif() add_dependencies(onnxruntime onnxruntime_generate_def ${onnxruntime_EXTERNAL_DEPENDENCIES}) target_include_directories(onnxruntime PRIVATE ${ONNXRUNTIME_ROOT} PUBLIC "$") -target_compile_definitions(onnxruntime PRIVATE VER_MAJOR=${VERSION_MAJOR_PART}) -target_compile_definitions(onnxruntime PRIVATE VER_MINOR=${VERSION_MINOR_PART}) -target_compile_definitions(onnxruntime PRIVATE VER_BUILD=${VERSION_BUILD_PART}) -target_compile_definitions(onnxruntime PRIVATE VER_PRIVATE=${VERSION_PRIVATE_PART}) -target_compile_definitions(onnxruntime PRIVATE VER_STRING=\"${VERSION_STRING}\") + target_compile_definitions(onnxruntime PRIVATE FILE_NAME=\"onnxruntime.dll\") if(UNIX) @@ -130,7 +125,6 @@ if (NOT WIN32) set(ONNXRUNTIME_SO_LINK_FLAG " -Wl,-exported_symbols_list,${SYMBOL_FILE}") if (${CMAKE_SYSTEM_NAME} STREQUAL "iOS") set_target_properties(onnxruntime PROPERTIES - SOVERSION ${ORT_VERSION} MACOSX_RPATH TRUE INSTALL_RPATH_USE_LINK_PATH FALSE BUILD_WITH_INSTALL_NAME_DIR TRUE @@ -222,13 +216,23 @@ target_link_libraries(onnxruntime PRIVATE ) set_property(TARGET onnxruntime APPEND_STRING PROPERTY LINK_FLAGS ${ONNXRUNTIME_SO_LINK_FLAG} ${onnxruntime_DELAYLOAD_FLAGS}) -set_target_properties(onnxruntime PROPERTIES - PUBLIC_HEADER "${ONNXRUNTIME_PUBLIC_HEADERS}" - LINK_DEPENDS ${SYMBOL_FILE} - VERSION ${ORT_VERSION} - FOLDER "ONNXRuntime" -) +#See: https://cmake.org/cmake/help/latest/prop_tgt/SOVERSION.html +if(NOT APPLE AND NOT WIN32) + set_target_properties(onnxruntime PROPERTIES + PUBLIC_HEADER "${ONNXRUNTIME_PUBLIC_HEADERS}" + LINK_DEPENDS ${SYMBOL_FILE} + VERSION ${ORT_VERSION} + SOVERSION 1 + FOLDER "ONNXRuntime") +else() + # Omit the SOVERSION setting in Windows/macOS/iOS/.. build + set_target_properties(onnxruntime PROPERTIES + PUBLIC_HEADER "${ONNXRUNTIME_PUBLIC_HEADERS}" + LINK_DEPENDS ${SYMBOL_FILE} + VERSION ${ORT_VERSION} + FOLDER "ONNXRuntime") +endif() install(TARGETS onnxruntime EXPORT ${PROJECT_NAME}Targets PUBLIC_HEADER DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/onnxruntime diff --git a/cmake/onnxruntime_providers_cpu.cmake b/cmake/onnxruntime_providers_cpu.cmake index b211c02f712bd..d9fba6f564037 100644 --- a/cmake/onnxruntime_providers_cpu.cmake +++ b/cmake/onnxruntime_providers_cpu.cmake @@ -236,11 +236,6 @@ if (NOT onnxruntime_MINIMAL_BUILD AND NOT onnxruntime_EXTENDED_MINIMAL_BUILD set_target_properties(onnxruntime_providers_shared PROPERTIES FOLDER "ONNXRuntime") set_target_properties(onnxruntime_providers_shared PROPERTIES LINKER_LANGUAGE CXX) - target_compile_definitions(onnxruntime_providers_shared PRIVATE VER_MAJOR=${VERSION_MAJOR_PART}) - target_compile_definitions(onnxruntime_providers_shared PRIVATE VER_MINOR=${VERSION_MINOR_PART}) - target_compile_definitions(onnxruntime_providers_shared PRIVATE VER_BUILD=${VERSION_BUILD_PART}) - target_compile_definitions(onnxruntime_providers_shared PRIVATE VER_PRIVATE=${VERSION_PRIVATE_PART}) - target_compile_definitions(onnxruntime_providers_shared PRIVATE VER_STRING=\"${VERSION_STRING}\") target_compile_definitions(onnxruntime_providers_shared PRIVATE FILE_NAME=\"onnxruntime_providers_shared.dll\") diff --git a/cmake/onnxruntime_providers_openvino.cmake b/cmake/onnxruntime_providers_openvino.cmake index 5876b2b5c448b..d738e29101cfe 100644 --- a/cmake/onnxruntime_providers_openvino.cmake +++ b/cmake/onnxruntime_providers_openvino.cmake @@ -45,11 +45,6 @@ target_include_directories(onnxruntime_providers_openvino SYSTEM PUBLIC ${ONNXRUNTIME_ROOT} ${CMAKE_CURRENT_BINARY_DIR} ${eigen_INCLUDE_DIRS} 
${OpenVINO_INCLUDE_DIR} ${OPENVINO_INCLUDE_DIR_LIST} ${PYTHON_INCLUDE_DIRS} $ENV{OPENCL_INCS} $ENV{OPENCL_INCS}/../../cl_headers/) target_link_libraries(onnxruntime_providers_openvino ${ONNXRUNTIME_PROVIDERS_SHARED} Boost::mp11 ${OPENVINO_LIB_LIST} ${ABSEIL_LIBS}) - target_compile_definitions(onnxruntime_providers_openvino PRIVATE VER_MAJOR=${VERSION_MAJOR_PART}) - target_compile_definitions(onnxruntime_providers_openvino PRIVATE VER_MINOR=${VERSION_MINOR_PART}) - target_compile_definitions(onnxruntime_providers_openvino PRIVATE VER_BUILD=${VERSION_BUILD_PART}) - target_compile_definitions(onnxruntime_providers_openvino PRIVATE VER_PRIVATE=${VERSION_PRIVATE_PART}) - target_compile_definitions(onnxruntime_providers_openvino PRIVATE VER_STRING=\"${VERSION_STRING}\") target_compile_definitions(onnxruntime_providers_openvino PRIVATE FILE_NAME=\"onnxruntime_providers_openvino.dll\") if(MSVC) diff --git a/cmake/winml.cmake b/cmake/winml.cmake index d74250b962628..ff6b71217ad87 100644 --- a/cmake/winml.cmake +++ b/cmake/winml.cmake @@ -718,11 +718,6 @@ target_compile_definitions(winml_dll PRIVATE ONNX_ML) target_compile_definitions(winml_dll PRIVATE LOTUS_LOG_THRESHOLD=2) target_compile_definitions(winml_dll PRIVATE LOTUS_ENABLE_STDERR_LOGGING) target_compile_definitions(winml_dll PRIVATE PLATFORM_WINDOWS) -target_compile_definitions(winml_dll PRIVATE VER_MAJOR=${VERSION_MAJOR_PART}) -target_compile_definitions(winml_dll PRIVATE VER_MINOR=${VERSION_MINOR_PART}) -target_compile_definitions(winml_dll PRIVATE VER_BUILD=${VERSION_BUILD_PART}) -target_compile_definitions(winml_dll PRIVATE VER_PRIVATE=${VERSION_PRIVATE_PART}) -target_compile_definitions(winml_dll PRIVATE VER_STRING=\"${VERSION_STRING}\") target_compile_definitions(winml_dll PRIVATE BINARY_NAME=\"${BINARY_NAME}\") if (onnxruntime_WINML_NAMESPACE_OVERRIDE STREQUAL "Windows") From db9ee359637c5560640688e8476f31bf2c11f1f4 Mon Sep 17 00:00:00 2001 From: Yifan Li <109183385+yf711@users.noreply.github.com> Date: Mon, 15 Jul 2024 14:30:02 -0700 Subject: [PATCH 03/35] [TensorRT EP] c4996 suppression to build with trt10.2ga on Windows (#21358) ### Description Supress C4996 deprecated api warning as errors as a walkaround to build ORT with TRT10.2GA on Windows ### Motivation and Context Four apis were recently declared as deprecated, which are being used by core code of TRT EP. 
Temporally suppress deprecated api warnings before updating these apis --- .../tensorrt/tensorrt_execution_provider.cc | 70 +++++++++++++++++++ 1 file changed, 70 insertions(+) diff --git a/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.cc b/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.cc index 8a601c156bd0a..67cbc8f5d6f13 100644 --- a/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.cc +++ b/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.cc @@ -70,7 +70,14 @@ bool SetDynamicRange(nvinfer1::INetworkDefinition& network, std::unordered_mapgetName(); auto dynamic_range_iter = dynamic_range_map.find(tensor_name); if (dynamic_range_iter != dynamic_range_map.end()) { +#if defined(_MSC_VER) +#pragma warning(push) +#pragma warning(disable : 4996) +#endif if (!network.getInput(i)->setDynamicRange(-dynamic_range_iter->second, dynamic_range_iter->second)) { +#if defined(_MSC_VER) +#pragma warning(pop) +#endif LOGS_DEFAULT(ERROR) << "Failed to set dynamic range for network input " << tensor_name; return false; } @@ -84,7 +91,14 @@ bool SetDynamicRange(nvinfer1::INetworkDefinition& network, std::unordered_mapgetOutput(j)->getName(); auto dynamic_range_iter = dynamic_range_map.find(tensor_name); if (dynamic_range_iter != dynamic_range_map.end()) { +#if defined(_MSC_VER) +#pragma warning(push) +#pragma warning(disable : 4996) +#endif if (!trt_layer->getOutput(j)->setDynamicRange(-dynamic_range_iter->second, dynamic_range_iter->second)) { +#if defined(_MSC_VER) +#pragma warning(pop) +#endif LOGS_DEFAULT(ERROR) << "Failed to set dynamic range for tensor " << tensor_name; return false; } @@ -122,7 +136,14 @@ bool SetDynamicRange(nvinfer1::INetworkDefinition& network, std::unordered_mapgetOutput(j)->setDynamicRange(static_cast(-max_weight), static_cast(max_weight))) { +#if defined(_MSC_VER) +#pragma warning(pop) +#endif LOGS_DEFAULT(ERROR) << "Failed to set dynamic range for layer " << const_layer_name; return false; } @@ -2232,7 +2253,14 @@ SubGraphCollection_t TensorrtExecutionProvider::GetSupportedList(SubGraphCollect auto trt_network = std::unique_ptr(trt_builder->createNetworkV2(network_flags)); auto trt_parser = tensorrt_ptr::unique_pointer(nvonnxparser::createParser(*trt_network, trt_logger)); +#if defined(_MSC_VER) +#pragma warning(push) +#pragma warning(disable : 4996) +#endif trt_parser->supportsModel(string_buf.data(), string_buf.size(), parser_nodes_list, model_path_); +#if defined(_MSC_VER) +#pragma warning(pop) +#endif SubGraphCollection_t next_nodes_list; const std::vector& subgraph_node_index = graph_viewer->GetNodesInTopologicalOrder(1 /*priority-based topological sort*/); @@ -3074,7 +3102,14 @@ Status TensorrtExecutionProvider::CreateNodeComputeInfoFromGraph(const GraphView } else { // Set INT8 per tensor dynamic range if (int8_enable_ && trt_builder->platformHasFastInt8() && int8_calibration_cache_available_) { +#if defined(_MSC_VER) +#pragma warning(push) +#pragma warning(disable : 4996) +#endif trt_config->setInt8Calibrator(nullptr); +#if defined(_MSC_VER) +#pragma warning(pop) +#endif if (!SetDynamicRange(*trt_network, dynamic_range_map)) { return ORT_MAKE_STATUS(ONNXRUNTIME, EP_FAIL, "TensorRT EP could not set INT8 dynamic range for fused node: " + fused_node.Name()); @@ -3193,7 +3228,14 @@ Status TensorrtExecutionProvider::CreateNodeComputeInfoFromGraph(const GraphView // Note: Creating an execution context from an engine is thread safe per TRT doc // 
https://docs.nvidia.com/deeplearning/tensorrt/developer-guide/index.html#threading if (context_memory_sharing_enable_) { +#if defined(_MSC_VER) +#pragma warning(push) +#pragma warning(disable : 4996) +#endif size_t mem_size = trt_engine->getDeviceMemorySize(); +#if defined(_MSC_VER) +#pragma warning(pop) +#endif if (mem_size > max_ctx_mem_size_) { max_ctx_mem_size_ = mem_size; } @@ -3466,7 +3508,14 @@ Status TensorrtExecutionProvider::CreateNodeComputeInfoFromGraph(const GraphView // Set INT8 Per Tensor Dynamic range if (trt_state->int8_enable && trt_builder->platformHasFastInt8() && trt_state->int8_calibration_cache_available) { +#if defined(_MSC_VER) +#pragma warning(push) +#pragma warning(disable : 4996) +#endif trt_config->setInt8Calibrator(nullptr); +#if defined(_MSC_VER) +#pragma warning(pop) +#endif if (!SetDynamicRange(*trt_state->network->get(), trt_state->dynamic_range_map)) { return ORT_MAKE_STATUS(ONNXRUNTIME, EP_FAIL, "TensorRT EP failed to set INT8 dynamic range."); } @@ -3734,7 +3783,14 @@ Status TensorrtExecutionProvider::CreateNodeComputeInfoFromGraph(const GraphView // Set execution context memory if (trt_state->context_memory_sharing_enable) { +#if defined(_MSC_VER) +#pragma warning(push) +#pragma warning(disable : 4996) +#endif size_t mem_size = trt_engine->getDeviceMemorySize(); +#if defined(_MSC_VER) +#pragma warning(pop) +#endif if (mem_size > *max_context_mem_size_ptr) { *max_context_mem_size_ptr = mem_size; } @@ -3865,7 +3921,14 @@ Status TensorrtExecutionProvider::CreateNodeComputeInfoFromPrecompiledEngine(con // Note: Creating an execution context from an engine is thread safe per TRT doc // https://docs.nvidia.com/deeplearning/tensorrt/developer-guide/index.html#threading if (context_memory_sharing_enable_) { +#if defined(_MSC_VER) +#pragma warning(push) +#pragma warning(disable : 4996) +#endif size_t mem_size = trt_engine->getDeviceMemorySize(); +#if defined(_MSC_VER) +#pragma warning(pop) +#endif if (mem_size > max_ctx_mem_size_) { max_ctx_mem_size_ = mem_size; } @@ -4038,7 +4101,14 @@ Status TensorrtExecutionProvider::CreateNodeComputeInfoFromPrecompiledEngine(con // Set execution context memory if (trt_state->context_memory_sharing_enable) { +#if defined(_MSC_VER) +#pragma warning(push) +#pragma warning(disable : 4996) +#endif size_t mem_size = trt_engine->getDeviceMemorySize(); +#if defined(_MSC_VER) +#pragma warning(pop) +#endif if (mem_size > *max_context_mem_size_ptr) { *max_context_mem_size_ptr = mem_size; } From c03e6fff4c2d9d54aca41a5061bd79c97cd014f5 Mon Sep 17 00:00:00 2001 From: Jian Chen Date: Mon, 15 Jul 2024 14:44:03 -0700 Subject: [PATCH 04/35] Combining android build and test step into one job (#21340) ### Description Combining android build and test step into one job ### Motivation and Context Reduce runtime by removing additional machine allocation, and artifact uploading and downloading. 
--------- Co-authored-by: Edward Chen <18449977+edgchen1@users.noreply.github.com> --- ...ndroid-x86_64-crosscompile-ci-pipeline.yml | 321 +++--------------- .../templates/use-android-emulator.yml | 19 ++ 2 files changed, 68 insertions(+), 272 deletions(-) diff --git a/tools/ci_build/github/azure-pipelines/android-x86_64-crosscompile-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/android-x86_64-crosscompile-ci-pipeline.yml index 72f236ec2e6cc..10d9a9a24d88a 100644 --- a/tools/ci_build/github/azure-pipelines/android-x86_64-crosscompile-ci-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/android-x86_64-crosscompile-ci-pipeline.yml @@ -48,12 +48,12 @@ parameters: stages: # Separate stage for building CPU vs NNAPI as we only want CodeQL to run on one of them so we don't get duplicate # issues for code that is built in both. We pick NNAPI as that includes the NNAPI EP code. -- stage: BUILD_CPU_STAGE +- stage: BUILD_AND_TEST_CPU dependsOn: [] variables: Codeql.Enabled: false jobs: - - job: Build_CPU_EP + - job: BUILD_AND_TEST_CPU pool: onnxruntime-Ubuntu2204-AMD-CPU workspace: clean: all @@ -78,12 +78,14 @@ stages: - script: sudo apt-get update -y && sudo apt-get install -y coreutils ninja-build displayName: Install coreutils and ninja - - template: "templates/use-android-ndk.yml" - + - template: templates/use-android-ndk.yml + - template: templates/use-android-emulator.yml + parameters: + create: true + start: true - script: | env | grep ANDROID displayName: View Android ENVs - - script: | python3 tools/ci_build/build.py \ --enable_lto \ @@ -96,42 +98,17 @@ stages: --skip_submodule_sync \ --parallel \ --cmake_generator=Ninja \ - --build_java \ - --skip_tests - displayName: CPU EP, Build - - - task: CopyFiles@2 - displayName: Copy apks - inputs: - contents: 'build/**/*.apk' - targetFolder: $(Build.ArtifactStagingDirectory) - overWrite: true - - - task: CopyFiles@2 - displayName: Copy test data - inputs: - contents: 'build/**/testdata/**' - targetFolder: $(Build.ArtifactStagingDirectory) - overWrite: true - - - task: CopyFiles@2 - displayName: Copy test executables - inputs: - contents: | - build/Debug/* - build/Debug/java/androidtest/android/** - targetFolder: $(Build.ArtifactStagingDirectory) - overWrite: true - - - task: PublishBuildArtifacts@1 - inputs: - pathToPublish: $(Build.ArtifactStagingDirectory) - artifactName: CPUBuildOutput + --build_java + displayName: CPU EP, Build and Test + - template: templates/use-android-emulator.yml + parameters: + stop: true - template: templates/clean-agent-build-directory-step.yml -- stage: BUILD_NNAPI_STAGE +- stage: BUILD_AND_TEST_NNAPI_EP dependsOn: [] + condition: notIn(variables['Build.Reason'], 'IndividualCI', 'BatchedCI') variables: Codeql.ProjectConfigPath: .github/workflows Codeql.Enabled: true @@ -140,14 +117,12 @@ stages: JobsTimeout: 120 ${{ else }}: JobsTimeout: 60 - jobs: - - job: Build_NNAPI_EP + - job: BUILD_AND_TEST_NNAPI_EP pool: onnxruntime-Ubuntu2204-AMD-CPU timeoutInMinutes: ${{ variables.JobsTimeout }} workspace: clean: all - condition: notIn(variables['Build.Reason'], 'IndividualCI', 'BatchedCI') steps: - task: UsePythonVersion@0 displayName: Use Python $(pythonVersion) @@ -163,8 +138,10 @@ stages: - script: sudo apt-get update -y && sudo apt-get install -y coreutils ninja-build displayName: Install coreutils and ninja - - - template: "templates/use-android-ndk.yml" + - template: templates/use-android-emulator.yml + parameters: + create: true + start: true - script: | env | grep ANDROID @@ -172,194 +149,31 @@ 
stages: - script: | python3 tools/ci_build/build.py \ - --enable_lto \ - --android \ - --build_dir build_nnapi \ - --android_sdk_path $ANDROID_HOME \ - --android_ndk_path $ANDROID_NDK_HOME \ - --android_abi=x86_64 \ - --android_api=29 \ - --skip_submodule_sync \ - --parallel \ - --use_nnapi \ - --cmake_generator=Ninja \ - --build_java \ - --skip_tests - displayName: NNAPI EP, Build - - - task: CopyFiles@2 - displayName: Copy apks - inputs: - contents: 'build_nnapi/**/*.apk' - targetFolder: $(Build.ArtifactStagingDirectory) - overWrite: true - - - task: CopyFiles@2 - displayName: Copy test data - inputs: - contents: 'build_nnapi/**/testdata/**' - targetFolder: $(Build.ArtifactStagingDirectory) - overWrite: true - - - task: CopyFiles@2 - displayName: Copy Test Executables - inputs: - contents: | - build_nnapi/Debug/* - build_nnapi/Debug/java/androidtest/android/** - targetFolder: $(Build.ArtifactStagingDirectory) - overWrite: true - - - task: PublishBuildArtifacts@1 - inputs: - pathToPublish: $(Build.ArtifactStagingDirectory) - artifactName: NNAPIBuildOutput + --enable_lto \ + --android \ + --build_dir build_nnapi \ + --android_sdk_path $ANDROID_HOME \ + --android_ndk_path $ANDROID_NDK_HOME \ + --android_abi=x86_64 \ + --android_api=29 \ + --skip_submodule_sync \ + --parallel \ + --use_nnapi \ + --build_shared_lib \ + --cmake_generator=Ninja \ + --build_java + displayName: NNAPI EP, Build, Test on Android Emulator + + - script: /bin/bash tools/ci_build/github/linux/ort_minimal/nnapi_minimal_build_minimal_ort_and_run_tests.sh $(pwd) + # Build Minimal ORT with NNAPI and reduced Ops, run unit tests on Android Emulator + displayName: Build Minimal ORT with NNAPI and run tests + + - template: templates/use-android-emulator.yml + parameters: + stop: true - template: templates/clean-agent-build-directory-step.yml -- stage: TEST_STAGE - dependsOn: [BUILD_CPU_STAGE, BUILD_NNAPI_STAGE] - jobs: - - job: Test_CPU_EP - pool: - # We need macOS-12 to run the Android emulator for now. 
- # https://github.com/actions/runner-images/issues/7671 - vmImage: 'macOS-12' - workspace: - clean: all - condition: succeeded() - steps: - - script: | - set -ex - system_profiler SPSoftwareDataType SPHardwareDataType - displayName: 'Mac Agent Info' - - - task: DownloadPipelineArtifact@2 - inputs: - ${{ if eq(parameters.specificArtifact, true) }}: - source: 'specific' - project: 'onnxruntime' - pipeline: $(Build.DefinitionName) - runVersion: 'specific' - runId: ${{ parameters.runId }} - ${{ if ne(parameters.specificArtifact, true) }}: - source: 'current' - artifact: 'CPUBuildOutput' - path: $(Build.SourcesDirectory) - - - task: UsePythonVersion@0 - displayName: Use Python $(pythonVersion) - inputs: - versionSpec: $(pythonVersion) - - - task: JavaToolInstaller@0 - displayName: Use jdk 11 - inputs: - versionSpec: '11' - jdkArchitectureOption: 'x64' - jdkSourceOption: 'PreInstalled' - - - template: "templates/use-android-ndk.yml" - - - template: templates/use-android-emulator.yml - parameters: - create: true - start: true - - - script: | - python3 tools/ci_build/build.py \ - --enable_lto \ - --android \ - --build_dir build \ - --android_sdk_path $ANDROID_HOME \ - --android_ndk_path $ANDROID_NDK_HOME \ - --android_abi=x86_64 \ - --android_api=30 \ - --build_java \ - --test - displayName: CPU EP, Test on Android Emulator - - - template: templates/use-android-emulator.yml - parameters: - stop: true - - - template: templates/clean-agent-build-directory-step.yml - - - job: Test_NNAPI_EP - pool: - # We need macOS-12 to run the Android emulator for now. - # https://github.com/actions/runner-images/issues/7671 - vmImage: 'macOS-12' - timeoutInMinutes: 90 - workspace: - clean: all - condition: and(succeeded(), notIn(variables['Build.Reason'], 'IndividualCI', 'BatchedCI')) - steps: - - script: | - set -ex - system_profiler SPSoftwareDataType SPHardwareDataType - displayName: 'Mac Agent Info' - - - task: DownloadPipelineArtifact@2 - inputs: - ${{ if eq(parameters.specificArtifact, true) }}: - source: 'specific' - project: 'onnxruntime' - pipeline: $(Build.DefinitionName) - runVersion: 'specific' - runId: ${{ parameters.runId }} - ${{ if ne(parameters.specificArtifact, true) }}: - source: 'current' - artifact: 'NNAPIBuildOutput' - path: $(Build.SourcesDirectory) - - - task: UsePythonVersion@0 - displayName: Use Python $(pythonVersion) - inputs: - versionSpec: $(pythonVersion) - - - task: JavaToolInstaller@0 - displayName: Use jdk 11 - inputs: - versionSpec: '11' - jdkArchitectureOption: 'x64' - jdkSourceOption: 'PreInstalled' - - - template: "templates/use-android-ndk.yml" - - - template: templates/use-android-emulator.yml - parameters: - create: true - start: true - - - script: | - python3 tools/ci_build/build.py \ - --enable_lto \ - --android \ - --build_dir build_nnapi \ - --android_sdk_path $ANDROID_HOME \ - --android_ndk_path $ANDROID_NDK_HOME \ - --android_abi=x86_64 \ - --android_api=29 \ - --build_java \ - --use_nnapi \ - --test - displayName: NNAPI EP, Test, CodeCoverage on Android Emulator - - # used by Build Minimal ORT - - script: brew install coreutils ninja - displayName: Install coreutils and ninja - - - script: /bin/bash tools/ci_build/github/linux/ort_minimal/nnapi_minimal_build_minimal_ort_and_run_tests.sh $(pwd) - # Build Minimal ORT with NNAPI and reduced Ops, run unit tests on Android Emulator - displayName: Build Minimal ORT with NNAPI and run tests - - - template: templates/use-android-emulator.yml - parameters: - stop: true - - - template: 
templates/clean-agent-build-directory-step.yml - - stage: MASTER_BUILD_STAGE # The below jobs only run on master build. # because coverage report is hard to support in cross machines. @@ -368,20 +182,12 @@ stages: condition: in(variables['Build.Reason'], 'IndividualCI', 'BatchedCI') jobs: - job: NNAPI_EP_MASTER - pool: - # We need macOS-12 to run the Android emulator for now. - # https://github.com/actions/runner-images/issues/7671 - vmImage: 'macOS-12' + pool: onnxruntime-Ubuntu2204-AMD-CPU timeoutInMinutes: 180 workspace: clean: all condition: in(variables['Build.Reason'], 'IndividualCI', 'BatchedCI') steps: - - script: | - set -ex - system_profiler SPSoftwareDataType SPHardwareDataType - displayName: 'Mac Agent Info' - - task: UsePythonVersion@0 displayName: Use Python $(pythonVersion) inputs: @@ -394,11 +200,7 @@ stages: jdkArchitectureOption: 'x64' jdkSourceOption: 'PreInstalled' - - template: "templates/use-android-ndk.yml" - - # used by Build Minimal ORT - - script: brew install coreutils ninja - displayName: Install coreutils and ninja + - template: templates/use-android-ndk.yml - template: templates/use-android-emulator.yml parameters: @@ -429,50 +231,25 @@ stages: --build_dir build_nnapi \ --android_sdk_path $ANDROID_HOME displayName: Retrieve runtime code coverage files from the emulator and analyze + - script: cat '$(Build.SourcesDirectory)/build_nnapi/Debug/coverage_rpt.txt' displayName: Print coverage report - - task: PublishPipelineArtifact@0 - displayName: 'Publish code coverage report' - inputs: - artifactName: "coverage_rpt.txt" - targetPath: '$(Build.SourcesDirectory)/build_nnapi/Debug/coverage_rpt.txt' - publishLocation: 'pipeline' - - script: /bin/bash tools/ci_build/github/linux/ort_minimal/nnapi_minimal_build_minimal_ort_and_run_tests.sh $(pwd) # Build Minimal ORT with NNAPI and reduced Ops, run unit tests on Android Emulator displayName: Build Minimal ORT with NNAPI and run tests - - template: templates/use-android-emulator.yml - parameters: - stop: true - - - template: templates/clean-agent-build-directory-step.yml - - - job: Update_Dashboard - workspace: - clean: all - variables: - - name: skipComponentGovernanceDetection - value: true - pool: 'onnxruntime-Ubuntu2204-AMD-CPU' - condition: and(succeeded(), in(variables['Build.Reason'], 'IndividualCI', 'BatchedCI')) - dependsOn: - - NNAPI_EP_MASTER - steps: - - task: DownloadPipelineArtifact@0 - displayName: 'Download code coverage report' - inputs: - artifactName: 'coverage_rpt.txt' - targetPath: '$(Build.BinariesDirectory)' - - task: AzureCLI@2 displayName: 'Post Android Code Coverage To DashBoard' inputs: azureSubscription: AIInfraBuild scriptType: bash scriptPath: $(Build.SourcesDirectory)/tools/ci_build/github/linux/upload_code_coverage_data.sh - arguments: '"$(Build.BinariesDirectory)/coverage_rpt.txt" "https://dev.azure.com/onnxruntime/onnxruntime/_build/results?buildId=$(Build.BuildId)" arm android nnapi' + arguments: '"$(Build.SourcesDirectory)/build_nnapi/Debug/coverage_rpt.txt" "https://dev.azure.com/onnxruntime/onnxruntime/_build/results?buildId=$(Build.BuildId)" arm android nnapi' workingDirectory: '$(Build.BinariesDirectory)' + - template: templates/use-android-emulator.yml + parameters: + stop: true + - template: templates/clean-agent-build-directory-step.yml diff --git a/tools/ci_build/github/azure-pipelines/templates/use-android-emulator.yml b/tools/ci_build/github/azure-pipelines/templates/use-android-emulator.yml index b31882c8da18f..4251a8401f8f0 100644 --- 
a/tools/ci_build/github/azure-pipelines/templates/use-android-emulator.yml +++ b/tools/ci_build/github/azure-pipelines/templates/use-android-emulator.yml @@ -15,6 +15,25 @@ parameters: steps: - ${{ if eq(parameters.create, true) }}: + - script: | + if [[ ":$PATH:" == *":${ANDROID_SDK_ROOT}/emulator:"* ]]; then + echo "${ANDROID_SDK_ROOT}/emulator is in PATH" + else + ${ANDROID_SDK_ROOT}/cmdline-tools/latest/bin/sdkmanager --install "emulator" + echo "##vso[task.prependpath]${ANDROID_SDK_ROOT}/emulator" + fi + displayName: Check if emulator are installed and add to PATH + + - script: | + if [[ ":$PATH:" == *":${ANDROID_SDK_ROOT}/platform-tools:"* ]]; then + echo "${ANDROID_SDK_ROOT}/platform-tools is in PATH" + else + ${ANDROID_SDK_ROOT}/cmdline-tools/latest/bin/sdkmanager --install "platform-tools" + echo "##vso[task.prependpath]${ANDROID_SDK_ROOT}/platform-tools" + fi + ls -R ${ANDROID_SDK_ROOT}/platform-tools + displayName: Check if platform tools are installed and add to PATH + - script: | set -e -x python3 tools/python/run_android_emulator.py \

From 50170c697e5261d7f517368fa8ef3c4f3205227b Mon Sep 17 00:00:00 2001
From: Jing Fang <126209182+fajin-corp@users.noreply.github.com>
Date: Mon, 15 Jul 2024 15:25:40 -0700
Subject: [PATCH 05/35] [Optimizer] DQ + MatMul to MatMulNBits support: kernel changes (#21342)

### Description
This is a partial change ported from fajin/qdqmatmulnbitstoolchain. That branch has issues resolving the web CI.

MatMulNBits is a heavily optimized matmul operation. Currently a MatMul can be converted to MatMulNBits to speed up model inference. However, MatMulNBits is an ORT-only op. To make the graph compatible with ONNX ops and utilize MatMulNBits at the same time, we introduce Q/DQ support for MatMulNBits.

To convert MatMul ops in a model to MatMulNBits:
1. Use matmul_4bits_quantizer.py to convert MatMul to DQ + MatMul using QDQ mode.
2. In an ORT session, DQ + MatMul is fused into MatMulNBits.

#### Note
MatMulNBits assumes the B weight is uint4. When no zp is provided, zp defaults to 8, which is different from DQ: DQ defaults zp to 0 when no zp is provided, and DQ supports int4. Therefore some conversions are introduced during the DQ + MatMul --> MatMulNBits step.

#### Perf
Using the QDQ format will increase model initialization time and memory consumption. With the current implementation, model init time increased from ~4s to ~9s, and memory consumption increased from ~2.8GB to ~4.8GB. The memory increase is due to: 1. in the optimizer, after transposing the B weight, an in-memory tensor proto is created using protobuf's arena; 2. in the finalize step, when saving initializers and prepacking, the ORT arena is used to create buffers for initializers. The memory allocated by arenas cannot be fully deallocated. If ORT arena memory allocation is disabled, the memory consumption of both the QDQ format and the original format is ~2.2GB. The time increase is mainly due to multiple memory copies, but can be further optimized.

### Motivation and Context
Please see the description for details.
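To make the zero-point handling in the Note above concrete, here is a minimal, self-contained sketch (illustrative only, not the MLAS implementation; the helper names are invented) of the int4 -> uint4 conversion with a default zero point of 8 that the DQ + MatMul --> MatMulNBits path has to perform:

```cpp
// Illustrative sketch: how a two's-complement int4 value (DQ convention, zero
// point 0) maps to the uint4 encoding MatMulNBits expects (zero point 8).
// Adding 8 to a 4-bit two's-complement value is the same as flipping bit 3.
#include <cassert>
#include <cstdint>
#include <cstdio>

// Convert one signed 4-bit value, stored in the low nibble, to uint4 with zero point 8.
inline uint8_t Int4ToUint4Zp8(uint8_t nibble) {
  return static_cast<uint8_t>((nibble ^ 0x8) & 0xF);
}

// Pack two 4-bit values into one byte, low nibble first (row-wise packing).
inline uint8_t PackNibbles(uint8_t lo, uint8_t hi) {
  return static_cast<uint8_t>((lo & 0xF) | ((hi & 0xF) << 4));
}

int main() {
  // int4 value -3 is encoded as 0b1101 in its low nibble.
  uint8_t int4_encoded = 0b1101;
  uint8_t uint4 = Int4ToUint4Zp8(int4_encoded);  // -3 + 8 = 5
  assert(uint4 == 5);

  // Dequantizing with zero point 8 gives scale * (5 - 8) = scale * -3,
  // which matches DQ's scale * (-3 - 0) for the original int4 value.
  std::printf("int4 0x%X -> uint4 %u (zp 8)\n",
              static_cast<unsigned>(int4_encoded), static_cast<unsigned>(uint4));
  std::printf("packed pair: 0x%02X\n",
              static_cast<unsigned>(PackNibbles(uint4, Int4ToUint4Zp8(0b0010))));
  return 0;
}
```

The kernels in this patch perform the equivalent nibble conversion while transposing and repacking the quantized blocks (see the Pack/Transpose helpers in q4_dq.cpp below).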
--- onnxruntime/core/mlas/inc/mlas_q4.h | 26 +- onnxruntime/core/mlas/lib/q4_dq.cpp | 268 ++++++++++-------- .../python/onnxruntime_pybind_quant.cc | 4 +- onnxruntime/test/mlas/bench/bench_q4dq.cpp | 27 +- .../test/mlas/unittest/test_blockq4.cpp | 17 +- 5 files changed, 197 insertions(+), 145 deletions(-) diff --git a/onnxruntime/core/mlas/inc/mlas_q4.h b/onnxruntime/core/mlas/inc/mlas_q4.h index 898fb23cf3e4f..aec14070ffd55 100644 --- a/onnxruntime/core/mlas/inc/mlas_q4.h +++ b/onnxruntime/core/mlas/inc/mlas_q4.h @@ -360,12 +360,12 @@ MlasDequantizeBlockwise( ); /** - * @brief Blockwise 2 bits or 4 bits quantization. After quantization, the weights and zero points - * are packed row-wise. In terms of the qbits type, dst and src have the same shape, and - * scales and zero_points have the same shape. - * columns must be multiple of 8 / qbits. + * @brief Blockwise 4 bits quantization. After quantization, the weights and zero points + * are packed row-wise. If zero_points is null, quantized type is int4 with default + * zero point 0, to align with DQ schema. Otherwise, quantized type is uint4. + * In int4/uint4, dst have the same shape as src, and zero_points have the same shape as scales. * @tparam Tin - * @tparam qbits number of bits used for quantization, 2 or 4 + * @tparam qbits number of bits used for quantization, only 4 is supported * @param src points to the floating point matrix, to be quantized, row major shape [rows, columns] * @param scales points to the scales matrix, row major * @param zero_points points to the zero_points matrix, row major @@ -376,9 +376,10 @@ MlasDequantizeBlockwise( * @param columns * @param quant_block_size number of elements in a quantize block * @param thread_pool + * @return the quantized type is signed. */ template -void +bool MlasQDQQuantizeBlockwise( const Tin* src, Tin* scales, @@ -395,8 +396,17 @@ MlasQDQQuantizeBlockwise( * @brief Transpose blockwise quantized tensors. The src tensors are row major. src weights and zero * points are packed row-wise. The dst tensors are column major. dst weights and zero points * are packed column-wise. + * dst_weights and dst_zero_points are in uint4. + * If src_weights is int4 and has src_zero_points, src_weights and src_zero_points are + * converted to uint4 by adding 8. + * If src_weights is int4 and no src_zero_points, src_weights is converted to uint4 by adding 8. + * src_zero_points is 0 and dst_zero_points is 8. + * If src_weights is uint4 and has src_zero_points, just transpose. + * If src_weights is uint4 and no src_zero_points, caller must allocate dst_zero_points with + * 0 values. Otherwise exception is thrown. * @tparam Tin - * @tparam qbits number of bits used for quantization, 2 or 4 + * @tparam qbits number of bits used for quantization, only 4 is supported + * @tparam signed_quant true when quantized type is signed, false when quantized type is unsigned * @param src_weights points to the quantized matrix, row major, shape [rows, columns] in qbits type. * In uint8_t type, shape is [rows, columns * qbits / 8]. 
* @param src_scales points to the scales matrix, row major @@ -410,7 +420,7 @@ MlasQDQQuantizeBlockwise( * @param quant_block_size number of elements in a quantize block * @param thread_pool */ -template +template void MlasQDQTransposeBlockwiseQuantized( const uint8_t* src_weights, diff --git a/onnxruntime/core/mlas/lib/q4_dq.cpp b/onnxruntime/core/mlas/lib/q4_dq.cpp index 62fe58ca333de..015d69de68766 100644 --- a/onnxruntime/core/mlas/lib/q4_dq.cpp +++ b/onnxruntime/core/mlas/lib/q4_dq.cpp @@ -314,14 +314,18 @@ struct Shape2D { }; -template +template struct BitsTraits { static_assert(qbits <= 8, "Only BitsTraits are for small number of bits!"); static constexpr int kBits = qbits; - static constexpr int kMax = (1 << qbits) - 1; - static constexpr int kMid = 1 << (qbits - 1); + static constexpr int kMax = signed_quant ? (1 << (qbits -1)) - 1 : (1 << qbits) - 1; + static constexpr int kMid = signed_quant ? 0 : (1 << (qbits - 1)); + static constexpr int kMin = signed_quant ? -(1 << (qbits - 1)) : 0; static constexpr float kMaxFp = static_cast(kMax); + static constexpr float kMinFp = static_cast(kMin); + static constexpr float fullRange = kMaxFp - kMinFp; + static constexpr float halfRange = static_cast(kMid - kMin); // number of qbit elements to pack into whole bytes static constexpr int kPackSize = (qbits == 8) ? 1 : (qbits == 4) ? 2 : (qbits == 2) ? 4 : 0; @@ -331,53 +335,54 @@ struct BitsTraits { /** * @brief Rectify min/max from a set of weights, and convert to scale and zero point - * for quantization - * @tparam ScaleT type of scale, usually floating point of various bits - * @tparam qbits number of int bits used for zero point value + * for quantization. + * @tparam ScaleT type of scale, usually floating point of various bits + * @tparam qbits number of int bits used for zero point value + * @tparam signed_quant output quantized type is signed * @param[in] min * @param[in] max * @param[out] scale * @param[out] zp */ -template +template MLAS_FORCEINLINE void range2scalezp(float min, float max, ScaleT& scale, uint8_t& zp) { - constexpr int zp_max = BitsTraits::kMax; - constexpr float zp_max_fp = BitsTraits::kMaxFp; - min = std::min(min, 0.0f); max = std::max(max, 0.0f); - float scale_f = (max - min) / zp_max; + float scale_f = (max - min) / BitsTraits::fullRange; float zero_point_fp = min; if (scale_f != 0.0f) { - zero_point_fp = 0.f - min / scale_f; + zero_point_fp = BitsTraits::kMinFp - min / scale_f; } - if (zero_point_fp < 0.0f) { - zp = 0; - } else if (zero_point_fp > zp_max_fp) { - zp = zp_max; + if (zero_point_fp < BitsTraits::kMinFp) { + zp = static_cast(BitsTraits::kMin); + } else if (zero_point_fp > BitsTraits::kMaxFp) { + zp = static_cast(BitsTraits::kMax); } else { zp = (uint8_t)roundf(zero_point_fp); } scale = ScaleT(scale_f); } -template +/** + * @brief Rectify min/max from a set of symmetric weights, and convert + * to scale for quantization. + */ +template MLAS_FORCEINLINE void range2scale(float min, float max, ScaleT& scale) { - constexpr int mid_v = BitsTraits::kMid; - constexpr float mid_fp = static_cast(-mid_v); - max = fabsf(max) > fabsf(min) ? max : min; - - scale = ScaleT(max / mid_fp); + // !!Note: in the quantized space, abs of min -8 > abs of max 7. + // Therefore map the larger half FP space to [-8, 0]. + // Minus sign achieves this purpose. 
+ scale = ScaleT(-max / BitsTraits::halfRange); }; @@ -400,7 +405,7 @@ struct BlockwiseQuantizer { static_assert(qbits == 4, "Only 4b block quantization is supported!"); using QuantBlk = std::conditional_t, Shape2D<1, block_size>>; - using ThreadBlk = Shape2D::kPackSize, QuantBlk::kColumn>; + using ThreadBlk = Shape2D::kPackSize, QuantBlk::kColumn>; static MLAS_FORCEINLINE @@ -474,8 +479,8 @@ struct BlockwiseQuantizer { MlasTryBatchParallel( thread_pool, total_thrd_blks, [&](ptrdiff_t block_idx) { - uint8_t zp_bytes[BitsTraits::kPackSize]; - std::fill_n(zp_bytes, BitsTraits::kPackSize, (uint8_t)8); + uint8_t zp_bytes[BitsTraits::kPackSize]; + std::fill_n(zp_bytes, BitsTraits::kPackSize, (uint8_t)8); const int32_t r_blk_idx = static_cast(block_idx / thrd_col_blks); const int32_t c_blk_idx = static_cast(block_idx % thrd_col_blks); @@ -490,7 +495,7 @@ struct BlockwiseQuantizer { const int meta_col = c / QuantBlk::kColumn; // compute scale and zero point - for (int kpack = 0; kpack < BitsTraits::kPackSize; kpack++) { + for (int kpack = 0; kpack < BitsTraits::kPackSize; kpack++) { // scan a single block to extract range [min, max] float min = std::numeric_limits::max(); @@ -509,9 +514,9 @@ struct BlockwiseQuantizer { if (row_start < row_end) { const int32_t meta_idx = meta_col * row_blks + meta_row + kpack; if (zero_points == nullptr) { - range2scale(min, max, scales[meta_idx]); + range2scale(min, max, scales[meta_idx]); } else { - range2scalezp(min, max, scales[meta_idx], zp_bytes[kpack]); + range2scalezp(min, max, scales[meta_idx], zp_bytes[kpack]); } } } @@ -533,7 +538,7 @@ struct BlockwiseQuantizer { const float v0 = static_cast(src[i * leadingDimension + j]); const uint8_t vi0 = (uint8_t)std::clamp(roundf(v0 * reciprocal_scale + zp), - 0.0f, BitsTraits::kMaxFp); + 0.0f, BitsTraits::kMaxFp); uint8_t vi1 = (uint8_t)zp; if (i + 1 < r_end) { @@ -545,7 +550,7 @@ struct BlockwiseQuantizer { } const float v1 = static_cast(src[(i + 1) * leadingDimension + j]); vi1 = (uint8_t)std::clamp(roundf(v1 * reciprocal_scale1 + zp1), 0.0f, - BitsTraits::kMaxFp); + BitsTraits::kMaxFp); } // !! 4b specific code @@ -644,14 +649,19 @@ struct BlockwiseQuantizer { * in memory are packed together, which means the packing is along the row. Quantized data * are stored in row major, so the output tensor reserves same shape, in terms of qbits type, * as the input tensor. - * @tparam Tin source data type, e.g. fp32/fp16 - * @tparam qbits number of bits in each quantized element + * If has zero points, quantized type is unsigned. Otherwise, quantized type is signed and the + * zero point is 0. + * The transposed outputs are used by MatMulNBits, so quant type becomes uint4 with default + * zp at 8. + * @tparam Tin source data type, e.g. fp32/fp16 + * @tparam qbits number of bits in each quantized element + * @tparam signed_quant quantized type is signed */ -template +template struct BlockwiseQDQQuantizer; -template -struct BlockwiseQDQQuantizer { +template +struct BlockwiseQDQQuantizer { static MLAS_FORCEINLINE uint8_t GetElem(uint8_t val, int32_t idx) { return (val >> (idx << 2)) & 0xF; @@ -663,9 +673,14 @@ struct BlockwiseQDQQuantizer { return ((val & 0xF) << shift) | (dst & (~(0xF << shift))); } + template static MLAS_FORCEINLINE uint8_t Pack(uint8_t v0, uint8_t v1) { - return (v0 & 0xF) | ((v1 & 0xF) << 4); + if constexpr (add8) { + return ((v0 & 0xF) ^ 8) | (((v1 & 0xF) ^ 8) << 4); + } else { + return (v0 & 0xF) | ((v1 & 0xF) << 4); + } } // If src is row major, then dst is column major. 
Transpose: @@ -680,10 +695,16 @@ struct BlockwiseQDQQuantizer { // --> // | dst0: low 4 bit | dst0: high 4 bit | // | dst1: low 4 bit | dst1: high 4 bit | + template static MLAS_FORCEINLINE void Transpose(uint8_t src0, uint8_t src1, uint8_t& dst0, uint8_t& dst1) { - dst0 = (src0 & 0xF) | ((src1 & 0xF) << 4); - dst1 = ((src0 & 0xF0) >> 4) | (src1 & 0xF0); + if constexpr (add8) { + dst0 = ((src0 & 0xF) ^ 8) | (((src1 & 0xF) ^ 8) << 4); + dst1 = (((src0 & 0xF0) ^ 0x80) >> 4) | ((src1 & 0xF0) ^ 0x80); + } else { + dst0 = (src0 & 0xF) | ((src1 & 0xF) << 4); + dst1 = ((src0 & 0xF0) >> 4) | (src1 & 0xF0); + } } static MLAS_FORCEINLINE uint8_t QuantizeV(Tin src, float reciprocal_scale, uint8_t zero_point) @@ -693,54 +714,12 @@ struct BlockwiseQDQQuantizer { static_cast( std::roundf(static_cast(src) * reciprocal_scale) ) + static_cast(zero_point), - 0, - BitsTraits<4>::kMax + BitsTraits<4, signed_quant>::kMin, + BitsTraits<4, signed_quant>::kMax ) ); } - /** - * @brief Quantize a matrix shape [rows, columns] row-wise. Scales and zero points are calculated. - * Quantized data are packed row-wise based on qbits. Quantized data are stored in row - * major, so the output tensor reserves the shape, in terms output type. - * Thread block is [1, quant_block_size * 2]. - * @param src the source matrix, row major: [rows * columns] - * @param scales the scales of quantized blocks, row major layout with shape: - * [rows * ceil(columns / quant_block_size)] - * @param zero_points the zero points of quantized blocks, packed. Same shape as scales - * in terms of output type. In terms of uint8_t, the shape is: - * [ceil(rows * ceil(columns / quant_block_size) * qbits / 8)] - * @param dst the quantized weights, row major: [rows * columns] in terms of - * output type. In terms of uint8_t, the shape is: [ceil(rows * columns * qbits / 8] - * @param rows number of rows in the source matrix - * @param columns number of columns in the source matrix, must satisfy - * ceil(columns / quant_block_size) % 2 == 0, so in each thread block, - * zero points are packed into one byte. - * @param quant_block_size number of elements quantized together. - * @param thread_pool thread pool for parallel processing - */ - static void QuantizeRowWise( - const Tin* src, - Tin* scales, - uint8_t* zero_points, - uint8_t* dst, - int32_t rows, - int32_t columns, - int32_t quant_block_size, - MLAS_THREADPOOL* thread_pool - ) - { - MLAS_UNREFERENCED_PARAMETER(src); - MLAS_UNREFERENCED_PARAMETER(scales); - MLAS_UNREFERENCED_PARAMETER(zero_points); - MLAS_UNREFERENCED_PARAMETER(dst); - MLAS_UNREFERENCED_PARAMETER(rows); - MLAS_UNREFERENCED_PARAMETER(columns); - MLAS_UNREFERENCED_PARAMETER(quant_block_size); - MLAS_UNREFERENCED_PARAMETER(thread_pool); - ORT_THROW("BlockwiseQDQQuantizer::BlockwiseQDQQuantizer is not implemented"); - } - /** * @brief Quantize a matrix shape [rows, columns] column-wise. Scales and zero points are calculated. * Quantized data are packed row-wise based on qbits. Quantized data are stored in row major @@ -769,6 +748,7 @@ struct BlockwiseQDQQuantizer { MLAS_THREADPOOL* thread_pool ) { + ORT_ENFORCE(zero_points || signed_quant, "Unsigned quant with no zero points is not supported."); // Must avoid multiple thread write to a single byte, which means the starting index // of a thread block must be even. To achieve that, we need to customize the thread // block size based on the parity of columns. 
@@ -815,6 +795,10 @@ struct BlockwiseQDQQuantizer { MLAS_THREADPOOL* thread_pool ) { + ORT_ENFORCE( + src_zero_points || signed_quant || dst_zero_points, + "Unsigned quant types without zero points must allocate zero points with value 0." + ); // Must avoid multiple thread write to a single byte, which means the starting index // of a thread block must be even. To achieve that, we need to customize the thread // block size based on the parity of columns. @@ -896,15 +880,15 @@ struct BlockwiseQDQQuantizer { // calculate scale and zero point, and store for (int32_t i = 0; i < col_size; i += 2) { - v0_tt = v1_tt = BitsTraits<4>::kMid; + v0_tt = v1_tt = BitsTraits<4, signed_quant>::kMid; if (zero_points) { - range2scalezp(vmin_t[i], vmax_t[i], scale0_tt, v0_tt); - range2scalezp(vmin_t[i + 1], vmax_t[i + 1], scale1_tt, v1_tt); - zero_points[(scale_idx + i) >> 1] = Pack(v0_tt, v1_tt); + range2scalezp(vmin_t[i], vmax_t[i], scale0_tt, v0_tt); + range2scalezp(vmin_t[i + 1], vmax_t[i + 1], scale1_tt, v1_tt); + zero_points[(scale_idx + i) >> 1] = Pack(v0_tt, v1_tt); } else { - range2scale(vmin_t[i], vmax_t[i], scale0_tt); - range2scale(vmin_t[i + 1], vmax_t[i + 1], scale1_tt); + range2scale(vmin_t[i], vmax_t[i], scale0_tt); + range2scale(vmin_t[i + 1], vmax_t[i + 1], scale1_tt); } scales[scale_idx + i] = scale0_tt; @@ -925,7 +909,7 @@ struct BlockwiseQDQQuantizer { for (int32_t i = 0; i < col_size; i += 2) { v0_tt = QuantizeV(src[input_idx_t + i], reciprocal_scale_t[i], zp_t[i]); v1_tt = QuantizeV(src[input_idx_t + i + 1], reciprocal_scale_t[i + 1], zp_t[i + 1]); - dst[(input_idx_t + i) >> 1] = Pack(v0_tt, v1_tt); + dst[(input_idx_t + i) >> 1] = Pack(v0_tt, v1_tt); } } } @@ -993,14 +977,14 @@ struct BlockwiseQDQQuantizer { int32_t col_idx = 0; // leading unailgned zero points if (scale_buffer_idx & 1) { - v0_tt = BitsTraits<4>::kMid; + v0_tt = BitsTraits<4, signed_quant>::kMid; if (zero_points) { - range2scalezp(vmin_t[0], vmax_t[0], scale0_tt, v0_tt); + range2scalezp(vmin_t[0], vmax_t[0], scale0_tt, v0_tt); zero_points[scale_buffer_idx >> 1] = SetElem( v0_tt, 1, zero_points[scale_buffer_idx >> 1] ); } else { - range2scale(vmin_t[0], vmax_t[0], scale0_tt); + range2scale(vmin_t[0], vmax_t[0], scale0_tt); } scales[scale_buffer_idx] = scale0_tt; @@ -1014,14 +998,16 @@ struct BlockwiseQDQQuantizer { } // aligned zero points for (; scale_buffer_idx < scale_buffer_idx_end - 1; col_idx += 2, scale_buffer_idx += 2) { - v0_tt = v1_tt = BitsTraits<4>::kMid; + v0_tt = v1_tt = BitsTraits<4, signed_quant>::kMid; if (zero_points) { - range2scalezp(vmin_t[col_idx], vmax_t[col_idx], scale0_tt, v0_tt); - range2scalezp(vmin_t[col_idx + 1], vmax_t[col_idx + 1], scale1_tt, v1_tt); - zero_points[scale_buffer_idx >> 1] = Pack(v0_tt, v1_tt); + range2scalezp(vmin_t[col_idx], vmax_t[col_idx], scale0_tt, v0_tt); + range2scalezp( + vmin_t[col_idx + 1], vmax_t[col_idx + 1], scale1_tt, v1_tt + ); + zero_points[scale_buffer_idx >> 1] = Pack(v0_tt, v1_tt); } else { - range2scale(vmin_t[col_idx], vmax_t[col_idx], scale0_tt); - range2scale(vmin_t[col_idx + 1], vmax_t[col_idx + 1], scale1_tt); + range2scale(vmin_t[col_idx], vmax_t[col_idx], scale0_tt); + range2scale(vmin_t[col_idx + 1], vmax_t[col_idx + 1], scale1_tt); } scales[scale_buffer_idx] = scale0_tt; @@ -1037,14 +1023,14 @@ struct BlockwiseQDQQuantizer { } // tailing unaligned elements if (scale_buffer_idx < scale_buffer_idx_end) { - v0_tt = BitsTraits<4>::kMid; + v0_tt = BitsTraits<4, signed_quant>::kMid; if (zero_points) { - range2scalezp(vmin_t[col_idx], vmax_t[col_idx], 
scale0_tt, v0_tt); + range2scalezp(vmin_t[col_idx], vmax_t[col_idx], scale0_tt, v0_tt); zero_points[scale_buffer_idx >> 1] = SetElem( v0_tt, 0, zero_points[scale_buffer_idx >> 1] ); } else { - range2scale(vmin_t[col_idx], vmax_t[col_idx], scale0_tt); + range2scale(vmin_t[col_idx], vmax_t[col_idx], scale0_tt); } scales[scale_buffer_idx] = scale0_tt; @@ -1078,7 +1064,7 @@ struct BlockwiseQDQQuantizer { src[input_idx_t_start + 1], reciprocal_scale_t[col_idx + 1], zp_t[col_idx + 1] ); - dst[input_idx_t_start >> 1] = Pack(v0_tt, v1_tt); + dst[input_idx_t_start >> 1] = Pack(v0_tt, v1_tt); } // tailing unaligned output if (input_idx_t_start < input_idx_t_end) { @@ -1144,7 +1130,7 @@ struct BlockwiseQDQQuantizer { src0_t = src_weights[src_idx]; src1_t = src_weights[src_idx + packed_col_size]; src_idx += packed_col_size + packed_col_size; - Transpose(src0_t, src1_t, dst0_t, dst1_t); + Transpose(src0_t, src1_t, dst0_t, dst1_t); dst_weights[dst_idx] = dst0_t; dst_weights[dst_idx + dstT_num_row] = dst1_t; } @@ -1152,7 +1138,7 @@ struct BlockwiseQDQQuantizer { if (src_idx < src_end_idx) { src0_t = src_weights[src_idx]; src1_t = 0; - Transpose(src0_t, src1_t, dst0_t, dst1_t); + Transpose(src0_t, src1_t, dst0_t, dst1_t); dst_weights[dst_idx] = dst0_t; dst_weights[dst_idx + dstT_num_row] = dst1_t; } @@ -1190,7 +1176,7 @@ struct BlockwiseQDQQuantizer { for (; src_idx < src_end_idx - packed_col_size; ++dst_idx) { src0_t = src_zero_points[src_idx]; src1_t = src_zero_points[src_idx + packed_col_size]; - Transpose(src0_t, src1_t, dst0_t, dst1_t); + Transpose(src0_t, src1_t, dst0_t, dst1_t); dst_zero_points[dst_idx] = dst0_t; dst_zero_points[dst_idx + dst_zp_row_num] = dst1_t; src_idx += packed_col_size + packed_col_size; @@ -1199,7 +1185,7 @@ struct BlockwiseQDQQuantizer { if (src_idx < src_end_idx) { src0_t = src_zero_points[src_idx]; src1_t = 0; - Transpose(src0_t, src1_t, dst0_t, dst1_t); + Transpose(src0_t, src1_t, dst0_t, dst1_t); dst_zero_points[dst_idx] = dst0_t; dst_zero_points[dst_idx + dst_zp_row_num] = dst1_t; } @@ -1247,13 +1233,13 @@ struct BlockwiseQDQQuantizer { for (; src_idx < src_end_idx - columns; ++dst_idx) { src0_t = GetElem(src_weights[src_idx >> 1], src_idx & 1); src1_t = GetElem(src_weights[(src_idx + columns) >> 1], (src_idx + columns) & 1); - dst_weights[dst_idx] = (src0_t & 0xf) | ((src1_t & 0xf) << 4); + dst_weights[dst_idx] = Pack(src0_t, src1_t); src_idx += columns + columns; } if (src_idx < src_end_idx) { src0_t = GetElem(src_weights[src_idx >> 1], src_idx & 1); - dst_weights[dst_idx] = src0_t & 0xf; + dst_weights[dst_idx] = Pack(src0_t, 0); } } ); @@ -1288,13 +1274,13 @@ struct BlockwiseQDQQuantizer { for (; src_idx < src_end_idx - columns; ++dst_idx) { src0_t = GetElem(src_zero_points[src_idx >> 1], src_idx & 1); src1_t = GetElem(src_zero_points[(src_idx + columns) >> 1], (src_idx + columns) & 1); - dst_zero_points[dst_idx] = (src0_t & 0xf) | ((src1_t & 0xf) << 4); + dst_zero_points[dst_idx] = Pack(src0_t, src1_t); src_idx += columns + columns; } if (src_idx < src_end_idx) { src0_t = GetElem(src_zero_points[src_idx >> 1], src_idx & 1); - dst_zero_points[dst_idx] = src0_t & 0xf; + dst_zero_points[dst_idx] = Pack(src0_t, 0); } } ); @@ -1745,7 +1731,7 @@ MlasDequantizeBlockwise( ); template -void +bool MlasQDQQuantizeBlockwise( const Tin* src, Tin* scales, @@ -1759,17 +1745,23 @@ MlasQDQQuantizeBlockwise( ) { if (columnwise) { - BlockwiseQDQQuantizer::QuantizeColumnWise( - src, scales, zero_points, dst, rows, columns, quant_block_size, thread_pool - ); + if (zero_points) { + 
BlockwiseQDQQuantizer::QuantizeColumnWise( + src, scales, zero_points, dst, rows, columns, quant_block_size, thread_pool + ); + return false; + } else { + BlockwiseQDQQuantizer::QuantizeColumnWise( + src, scales, zero_points, dst, rows, columns, quant_block_size, thread_pool + ); + return true; + } } else { - BlockwiseQDQQuantizer::QuantizeRowWise( - src, scales, zero_points, dst, rows, columns, quant_block_size, thread_pool - ); + ORT_THROW("Row-wise MlasQDQQuantizeBlockwise is not implemented"); } } -template void +template bool MlasQDQQuantizeBlockwise( const float* src, float* scales, @@ -1782,7 +1774,7 @@ MlasQDQQuantizeBlockwise( MLAS_THREADPOOL* thread_pool ); -template void +template bool MlasQDQQuantizeBlockwise( const MLAS_FP16* src, MLAS_FP16* scales, @@ -1795,7 +1787,7 @@ MlasQDQQuantizeBlockwise( MLAS_THREADPOOL* thread_pool ); -template +template void MlasQDQTransposeBlockwiseQuantized( const uint8_t* src_weights, @@ -1812,7 +1804,7 @@ MlasQDQTransposeBlockwiseQuantized( ) { if (columnwise) { - BlockwiseQDQQuantizer::TransposeColumnWiseQuantized( + BlockwiseQDQQuantizer::TransposeColumnWiseQuantized( src_weights, src_scales, src_zero_points, dst_weights, dst_scales, dst_zero_points, rows, columns, quant_block_size, thread_pool ); @@ -1822,7 +1814,22 @@ MlasQDQTransposeBlockwiseQuantized( } template void -MlasQDQTransposeBlockwiseQuantized( +MlasQDQTransposeBlockwiseQuantized( + const uint8_t* src_weights, + const float* src_scales, + const uint8_t* src_zero_points, + uint8_t* dst_weights, + float* dst_scales, + uint8_t* dst_zero_points, + bool columnwise, + int rows, + int columns, + int quant_block_size, + MLAS_THREADPOOL* thread_pool +); + +template void +MlasQDQTransposeBlockwiseQuantized( const uint8_t* src_weights, const float* src_scales, const uint8_t* src_zero_points, @@ -1837,7 +1844,22 @@ MlasQDQTransposeBlockwiseQuantized( ); template void -MlasQDQTransposeBlockwiseQuantized( +MlasQDQTransposeBlockwiseQuantized( + const uint8_t* src_weights, + const MLAS_FP16* src_scales, + const uint8_t* src_zero_points, + uint8_t* dst_weights, + MLAS_FP16* dst_scales, + uint8_t* dst_zero_points, + bool columnwise, + int rows, + int columns, + int quant_block_size, + MLAS_THREADPOOL* thread_pool +); + +template void +MlasQDQTransposeBlockwiseQuantized( const uint8_t* src_weights, const MLAS_FP16* src_scales, const uint8_t* src_zero_points, diff --git a/onnxruntime/python/onnxruntime_pybind_quant.cc b/onnxruntime/python/onnxruntime_pybind_quant.cc index 5e8e5c1a2a2fc..51a52af1b151e 100644 --- a/onnxruntime/python/onnxruntime_pybind_quant.cc +++ b/onnxruntime/python/onnxruntime_pybind_quant.cc @@ -67,7 +67,7 @@ void QuantizeMatMul4BitsBlockwise( } template -void QuantizeQDQMatMul4BitsBlockwise( +bool QuantizeQDQMatMul4BitsBlockwise( py::array_t dst, // shape: [K, N / 2] py::array_t src, // shape: [K, N] py::array_t scale, // shape: [block_per_K, N] @@ -85,7 +85,7 @@ void QuantizeQDQMatMul4BitsBlockwise( py::buffer_info scale_buf = scale.request(); py::buffer_info zp_buf = zero_points.request(); - MlasQDQQuantizeBlockwise( + return MlasQDQQuantizeBlockwise( reinterpret_cast(src_buf.ptr), reinterpret_cast(scale_buf.ptr), is_symmetric ? 
nullptr : reinterpret_cast(zp_buf.ptr), diff --git a/onnxruntime/test/mlas/bench/bench_q4dq.cpp b/onnxruntime/test/mlas/bench/bench_q4dq.cpp index 00234ecfd2ce2..9d15c9a6bf994 100644 --- a/onnxruntime/test/mlas/bench/bench_q4dq.cpp +++ b/onnxruntime/test/mlas/bench/bench_q4dq.cpp @@ -69,6 +69,7 @@ static void BM_QDQBlockwiseQuantizer_TransposeColumnwise(benchmark::State& state int N = state.range(1); int quant_block_size = state.range(2); int threads = state.range(3); + bool add8 = state.range(4) != 0; int quant_num_M = (M + quant_block_size - 1) / quant_block_size; int blob_size = (quant_block_size + 1) / 2; size_t scale_size = quant_num_M * N; @@ -87,12 +88,22 @@ static void BM_QDQBlockwiseQuantizer_TransposeColumnwise(benchmark::State& state onnxruntime::concurrency::CreateThreadPool(&onnxruntime::Env::Default(), tpo, onnxruntime::concurrency::ThreadPoolType::INTRA_OP)); - for (auto _ : state) { - benchmark::DoNotOptimize(dst.data()); - MlasQDQTransposeBlockwiseQuantized( - dst.data(), scales.data(), zero_points.data(), dst_T.data(), scales_T.data(), zero_points_T.data(), - true, M, N, quant_block_size, tp.get()); - benchmark::ClobberMemory(); + if (add8) { + for (auto _ : state) { + benchmark::DoNotOptimize(dst.data()); + MlasQDQTransposeBlockwiseQuantized( + dst.data(), scales.data(), zero_points.data(), dst_T.data(), scales_T.data(), zero_points_T.data(), + true, M, N, quant_block_size, tp.get()); + benchmark::ClobberMemory(); + } + } else { + for (auto _ : state) { + benchmark::DoNotOptimize(dst.data()); + MlasQDQTransposeBlockwiseQuantized( + dst.data(), scales.data(), zero_points.data(), dst_T.data(), scales_T.data(), zero_points_T.data(), + true, M, N, quant_block_size, tp.get()); + benchmark::ClobberMemory(); + } } } @@ -113,6 +124,6 @@ BENCHMARK(BM_MlasQuantizeBlockwise) BENCHMARK(BM_QDQBlockwiseQuantizer_TransposeColumnwise) ->UseRealTime() ->Apply([](benchmark::internal::Benchmark* b) { - b->ArgNames({"M", "N", "quant_block_size", "threads"}); - b->ArgsProduct({{1024, 4096}, {4096, 4095}, {64, 128}, {2, 8, 16}}); + b->ArgNames({"M", "N", "quant_block_size", "threads", "add8"}); + b->ArgsProduct({{1024, 4096}, {4096, 4095}, {64, 128}, {2, 8, 16}, {0, 1}}); }); diff --git a/onnxruntime/test/mlas/unittest/test_blockq4.cpp b/onnxruntime/test/mlas/unittest/test_blockq4.cpp index b466e883059f4..f75002f715154 100644 --- a/onnxruntime/test/mlas/unittest/test_blockq4.cpp +++ b/onnxruntime/test/mlas/unittest/test_blockq4.cpp @@ -127,13 +127,22 @@ class MlasBlockwiseQdqTest : public MlasTestBase { columnwise, rows, columns, columns, threadpool_ptr); if (columnwise) { - MlasQDQQuantizeBlockwise( + bool signed_quant = MlasQDQQuantizeBlockwise( transposed, qdq_scales, qdq_zp, qdq_weights, true, rows, columns, block_size, threadpool_ptr); - MlasQDQTransposeBlockwiseQuantized( - qdq_weights, qdq_scales, qdq_zp, qdq_weights_T, qdq_scales_T, qdq_zp_T, - true, rows, columns, block_size, threadpool_ptr); + ASSERT_EQ(symmetric, signed_quant) << "symmetric quantization should be signed"; + + if (symmetric) { + MlasQDQTransposeBlockwiseQuantized( + qdq_weights, qdq_scales, qdq_zp, qdq_weights_T, qdq_scales_T, qdq_zp_T, + true, rows, columns, block_size, threadpool_ptr); + + } else { + MlasQDQTransposeBlockwiseQuantized( + qdq_weights, qdq_scales, qdq_zp, qdq_weights_T, qdq_scales_T, qdq_zp_T, + true, rows, columns, block_size, threadpool_ptr); + } } for (int c = 0; c < columns; c++) { From cf565e955db71f90060ecd2e07bc4e34a8d5600f Mon Sep 17 00:00:00 2001 From: Adrian Lizarraga Date: Mon, 15 Jul 
2024 17:56:08 -0700 Subject: [PATCH 06/35] Revert "Fix ETW Sink Initialize unproperly locking" (#21360) Reverts microsoft/onnxruntime#21226 Causes any onnxruntime app to hang on Windows ARM64. Our pipelines do not have the same ETW environment, so we couldn't catch it. ![image](https://github.com/user-attachments/assets/80edbf7d-be50-4cb0-a016-f390b81dc798) The call to TraceLoggingRegisterEx() recursively calls back into LazyInitialize(): LazyInitialize() -> TraceLoggingRegisterEx() -> ORT_TL_EtwEnableCallback() -> Instance() -> LazyInitialize() The original code got out of the recursive loop by checking the `initialized_` flag. --- .../core/platform/windows/logging/etw_sink.cc | 22 ++++++++++++++++--- .../core/platform/windows/logging/etw_sink.h | 4 ++++ 2 files changed, 23 insertions(+), 3 deletions(-) diff --git a/onnxruntime/core/platform/windows/logging/etw_sink.cc b/onnxruntime/core/platform/windows/logging/etw_sink.cc index 2a74e22850658..b0f9eaf4f62d2 100644 --- a/onnxruntime/core/platform/windows/logging/etw_sink.cc +++ b/onnxruntime/core/platform/windows/logging/etw_sink.cc @@ -98,6 +98,10 @@ ULONGLONG EtwRegistrationManager::Keyword() const { return keyword_; } +HRESULT EtwRegistrationManager::Status() const { + return etw_status_; +} + void EtwRegistrationManager::RegisterInternalCallback(const EtwInternalCallback& callback) { std::lock_guard lock(callbacks_mutex_); callbacks_.push_back(&callback); @@ -140,9 +144,15 @@ EtwRegistrationManager::EtwRegistrationManager() { } void EtwRegistrationManager::LazyInitialize() { - static HRESULT etw_status = ::TraceLoggingRegisterEx(etw_provider_handle, ORT_TL_EtwEnableCallback, nullptr); - if (FAILED(etw_status)) { - ORT_THROW("ETW registration failed. Logging will be broken: " + std::to_string(etw_status)); + if (!initialized_) { + std::lock_guard lock(init_mutex_); + if (!initialized_) { // Double-check locking pattern + initialized_ = true; + etw_status_ = ::TraceLoggingRegisterEx(etw_provider_handle, ORT_TL_EtwEnableCallback, nullptr); + if (FAILED(etw_status_)) { + ORT_THROW("ETW registration failed. Logging will be broken: " + std::to_string(etw_status_)); + } + } } } @@ -161,6 +171,12 @@ void EtwSink::SendImpl(const Timestamp& timestamp, const std::string& logger_id, // register on first usage static EtwRegistrationManager& etw_manager = EtwRegistrationManager::Instance(); + // do something (not that meaningful) with etw_manager so it doesn't get optimized out + // as we want an instance around to do the unregister + if (FAILED(etw_manager.Status())) { + return; + } + // TODO: Validate if this filtering makes sense. 
if (message.DataType() == DataType::USER) { return; diff --git a/onnxruntime/core/platform/windows/logging/etw_sink.h b/onnxruntime/core/platform/windows/logging/etw_sink.h index ff68aec0b7d64..3af45b813a625 100644 --- a/onnxruntime/core/platform/windows/logging/etw_sink.h +++ b/onnxruntime/core/platform/windows/logging/etw_sink.h @@ -66,6 +66,9 @@ class EtwRegistrationManager { // Get the current keyword uint64_t Keyword() const; + // Get the ETW registration status + HRESULT Status() const; + void RegisterInternalCallback(const EtwInternalCallback& callback); void UnregisterInternalCallback(const EtwInternalCallback& callback); @@ -97,6 +100,7 @@ class EtwRegistrationManager { bool is_enabled_; UCHAR level_; ULONGLONG keyword_; + HRESULT etw_status_; }; } // namespace logging From 4005d12ed4c966fe2be30849502607d5470fa651 Mon Sep 17 00:00:00 2001 From: George Wu Date: Mon, 15 Jul 2024 19:34:08 -0700 Subject: [PATCH 07/35] add vitisai ep build stage to Windows CPU Pipeline (#21361) We need to prevent VitisAI EP build breaks, add a stage in Windows CPU CI Pipeline to build Vitis AI EP on Windows. There are no external dependencies for builds. Tests have to be disabled though as the EP has external SW/HW dependencies. This will at least allow us to prevent build breaks which has happened on multiple occasions recently. tested https://dev.azure.com/onnxruntime/onnxruntime/_build/results?buildId=1432346&view=results and it seems to run fine. --- .../azure-pipelines/win-ci-pipeline.yml | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/tools/ci_build/github/azure-pipelines/win-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/win-ci-pipeline.yml index 39e68f5631f01..7d64f78c695fa 100644 --- a/tools/ci_build/github/azure-pipelines/win-ci-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/win-ci-pipeline.yml @@ -137,6 +137,25 @@ stages: WITH_CACHE: false MachinePool: 'onnxruntime-Win-CPU-2022' +# Build only. Does not run any tests. 
+- stage: x64_release_vitisai + dependsOn: [] + jobs: + - template: templates/jobs/win-ci-vs-2022-job.yml + parameters: + BuildConfig: 'RelWithDebInfo' + buildArch: x64 + additionalBuildFlags: --build_wheel --use_vitisai + msbuildPlatform: x64 + isX86: false + job_name_suffix: x64_release + RunOnnxRuntimeTests: false + isTraining: false + ORT_EP_NAME: VITISAI + GenerateDocumentation: false + WITH_CACHE: false + MachinePool: 'onnxruntime-Win-CPU-2022' + - stage: x64_release_winml dependsOn: [] jobs: From 218301403d76c3dbd7db4731071876273dfaa40d Mon Sep 17 00:00:00 2001 From: vraspar Date: Mon, 15 Jul 2024 22:30:20 -0700 Subject: [PATCH 08/35] Add ML Program support for basic activation ops (#21326) ### Description Add support for: - Sigmoid - Relu - Tanh ### Motivation and Context Enable support for Autodesk model --- .../builders/impl/activation_op_builder.cc | 93 +++++++++++++------ .../apple/coreml_supported_mlprogram_ops.md | 2 + 2 files changed, 69 insertions(+), 26 deletions(-) diff --git a/onnxruntime/core/providers/coreml/builders/impl/activation_op_builder.cc b/onnxruntime/core/providers/coreml/builders/impl/activation_op_builder.cc index dee87ce3632a8..0e21715513707 100644 --- a/onnxruntime/core/providers/coreml/builders/impl/activation_op_builder.cc +++ b/onnxruntime/core/providers/coreml/builders/impl/activation_op_builder.cc @@ -26,6 +26,8 @@ class ActivationOpBuilder : public BaseOpBuilder { const logging::Logger& logger) const override; int GetMinSupportedOpSet(const Node& node) const override; + + bool SupportsMLProgram() const override { return true; } }; void ActivationOpBuilder::AddInitializersToSkip(ModelBuilder& model_builder, const Node& node) const { @@ -74,33 +76,61 @@ Status AddPReluWeight(ModelBuilder& model_builder, const Node& node, Status ActivationOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, const Node& node, const logging::Logger& logger) const { - std::unique_ptr layer = model_builder.CreateNNLayer(node); - const auto& op_type(node.OpType()); - if (op_type == "Sigmoid") { - layer->mutable_activation()->mutable_sigmoid(); - } else if (op_type == "Tanh") { - layer->mutable_activation()->mutable_tanh(); - } else if (op_type == "Relu") { - layer->mutable_activation()->mutable_relu(); - } else if (op_type == "PRelu") { - auto* prelu = layer->mutable_activation()->mutable_prelu(); - ORT_RETURN_IF_ERROR(AddPReluWeight(model_builder, node, logger, *prelu)); - } else if (op_type == "LeakyRelu") { - NodeAttrHelper helper(node); - const auto alpha = helper.Get("alpha", 0.01f); - - auto* leaky_relu = layer->mutable_activation()->mutable_leakyrelu(); - leaky_relu->set_alpha(alpha); - } else { - return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, - "ActivationOpBuilder::AddToModelBuilderImpl, unknown op: ", op_type); - } - *layer->mutable_input()->Add() = node.InputDefs()[0]->Name(); - *layer->mutable_output()->Add() = node.OutputDefs()[0]->Name(); +#if defined(COREML_ENABLE_MLPROGRAM) + if (model_builder.CreateMLProgram()) { + using namespace CoreML::Specification::MILSpec; + // https://apple.github.io/coremltools/source/coremltools.converters.mil.mil.ops.defs.html#module-coremltools.converters.mil.mil.ops.defs.iOS15.activation + std::string_view coreml_op_type; + if (op_type == "Sigmoid") { + coreml_op_type = "sigmoid"; + } else if (op_type == "Tanh") { + coreml_op_type = "tanh"; + } else if (op_type == "Relu") { + coreml_op_type = "relu"; + } else { + return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, + "ActivationOpBuilder::AddToModelBuilderImpl, 
unknown op: ", op_type); + } + + std::unique_ptr op = model_builder.CreateOperation(node, coreml_op_type); + AddOperationInput(*op, "x", node.InputDefs()[0]->Name()); + AddOperationOutput(*op, *node.OutputDefs()[0]); + + model_builder.AddOperation(std::move(op)); + + } else +#endif // (COREML_ENABLE_MLPROGRAM) + { + std::unique_ptr layer = model_builder.CreateNNLayer(node); + + if (op_type == "Sigmoid") { + layer->mutable_activation()->mutable_sigmoid(); + } else if (op_type == "Tanh") { + layer->mutable_activation()->mutable_tanh(); + } else if (op_type == "Relu") { + layer->mutable_activation()->mutable_relu(); + } else if (op_type == "PRelu") { + auto* prelu = layer->mutable_activation()->mutable_prelu(); + ORT_RETURN_IF_ERROR(AddPReluWeight(model_builder, node, logger, *prelu)); + } else if (op_type == "LeakyRelu") { + NodeAttrHelper helper(node); + const auto alpha = helper.Get("alpha", 0.01f); + + auto* leaky_relu = layer->mutable_activation()->mutable_leakyrelu(); + leaky_relu->set_alpha(alpha); + } else { + return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, + "ActivationOpBuilder::AddToModelBuilderImpl, unknown op: ", op_type); + } + + *layer->mutable_input()->Add() = node.InputDefs()[0]->Name(); + *layer->mutable_output()->Add() = node.OutputDefs()[0]->Name(); + + model_builder.AddLayer(std::move(layer)); + } - model_builder.AddLayer(std::move(layer)); return Status::OK(); } @@ -165,9 +195,20 @@ bool IsPReluOpSupported(const Node& node, const OpBuilderInputParams& input_para bool ActivationOpBuilder::IsOpSupportedImpl(const Node& node, const OpBuilderInputParams& input_params, const logging::Logger& logger) const { const auto& op_type = node.OpType(); - if (op_type == "PRelu") { - return IsPReluOpSupported(node, input_params, logger); + +#if defined(COREML_ENABLE_MLPROGRAM) + if (input_params.create_mlprogram) { + if (op_type == "PRelu" || op_type == "LeakyRelu") { + return false; + } + } else +#endif // (COREML_ENABLE_MLPROGRAM) + { + if (op_type == "PRelu") { + return IsPReluOpSupported(node, input_params, logger); + } } + return true; } diff --git a/tools/ci_build/github/apple/coreml_supported_mlprogram_ops.md b/tools/ci_build/github/apple/coreml_supported_mlprogram_ops.md index dec05ae066a4a..9961d2668e6f5 100644 --- a/tools/ci_build/github/apple/coreml_supported_mlprogram_ops.md +++ b/tools/ci_build/github/apple/coreml_supported_mlprogram_ops.md @@ -18,3 +18,5 @@ Keep in sync with doco generated from /docs/execution-providers/CoreML-Execution |ai.onnx:Relu|| |ai.onnx:Reshape|| |ai.onnx:Sub|| +|ai.onnx:Sigmoid|| +|ai:onnx:Tanh|| From 8568a676734c97790c9fbf918b5f3fc00e271e80 Mon Sep 17 00:00:00 2001 From: Changming Sun Date: Tue, 16 Jul 2024 10:05:33 -0700 Subject: [PATCH 09/35] Fix a build error when CUDA is enabled and onnxruntime_DISABLE_CONTRIB_OPS is ON (#21285) Resolve #21204 To reproduce the issue, build the code with ``` python3 tools/ci_build/build.py --build_dir /tmp/build13 --config Debug --skip_submodule_sync --build_shared_lib --parallel --use_binskim_compliant_compile_flags --build_csharp --enable_onnx_tests --update --build --build_wheel --use_cuda --cuda_home /usr/local/cuda --cudnn_home /usr/local/cuda --cmake_extra_defines onnxruntime_DISABLE_CONTRIB_OPS=ON onnxruntime_BUILD_UNIT_TESTS=OFF --skip_tests ``` Then run the following python script: ```python #!/usr/bin/python3 import onnxruntime as ort providers = [("CUDAExecutionProvider")] ort_sess = ort.InferenceSession('/data/onnx/opset17/test_gemm_default_no_bias/model.onnx', providers=providers) ``` 
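As an optional sanity check that is not part of the original repro, one can also confirm which execution providers the build and the session actually expose; this sketch assumes only the standard onnxruntime Python API (`get_available_providers()` / `InferenceSession.get_providers()`):
```python
#!/usr/bin/python3
import onnxruntime as ort

# Execution providers compiled into this build; a CUDA-enabled build should list "CUDAExecutionProvider".
print(ort.get_available_providers())

# On a working build the session reports the CUDA EP here; on the broken build, creating the
# session surfaces the "Failed to load library" error shown below instead.
ort_sess = ort.InferenceSession('/data/onnx/opset17/test_gemm_default_no_bias/model.onnx',
                                providers=[("CUDAExecutionProvider")])
print(ort_sess.get_providers())
```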
Without this fix, you will see an error: Failed to load library libonnxruntime_providers_cuda.so with error: /tmp/build18/Debug/onnxruntime/capi/libonnxruntime_providers_cuda.so: undefined symbol: _ZN11onnxruntime4cuda21BuildKernelCreateInfoINS0_57kCudaExecutionProvider_GridSample_kOnnxDomain_ver16_floatEEENS_16KernelCreateInfoEv --- .../cuda => core/providers/cuda/tensor}/grid_sample.cc | 0 .../cuda => core/providers/cuda/tensor}/grid_sample.h | 0 .../cuda => core/providers/cuda/tensor}/grid_sample_impl.cu | 0 .../cuda => core/providers/cuda/tensor}/grid_sample_impl.h | 0 4 files changed, 0 insertions(+), 0 deletions(-) rename onnxruntime/{contrib_ops/cuda => core/providers/cuda/tensor}/grid_sample.cc (100%) rename onnxruntime/{contrib_ops/cuda => core/providers/cuda/tensor}/grid_sample.h (100%) rename onnxruntime/{contrib_ops/cuda => core/providers/cuda/tensor}/grid_sample_impl.cu (100%) rename onnxruntime/{contrib_ops/cuda => core/providers/cuda/tensor}/grid_sample_impl.h (100%) diff --git a/onnxruntime/contrib_ops/cuda/grid_sample.cc b/onnxruntime/core/providers/cuda/tensor/grid_sample.cc similarity index 100% rename from onnxruntime/contrib_ops/cuda/grid_sample.cc rename to onnxruntime/core/providers/cuda/tensor/grid_sample.cc diff --git a/onnxruntime/contrib_ops/cuda/grid_sample.h b/onnxruntime/core/providers/cuda/tensor/grid_sample.h similarity index 100% rename from onnxruntime/contrib_ops/cuda/grid_sample.h rename to onnxruntime/core/providers/cuda/tensor/grid_sample.h diff --git a/onnxruntime/contrib_ops/cuda/grid_sample_impl.cu b/onnxruntime/core/providers/cuda/tensor/grid_sample_impl.cu similarity index 100% rename from onnxruntime/contrib_ops/cuda/grid_sample_impl.cu rename to onnxruntime/core/providers/cuda/tensor/grid_sample_impl.cu diff --git a/onnxruntime/contrib_ops/cuda/grid_sample_impl.h b/onnxruntime/core/providers/cuda/tensor/grid_sample_impl.h similarity index 100% rename from onnxruntime/contrib_ops/cuda/grid_sample_impl.h rename to onnxruntime/core/providers/cuda/tensor/grid_sample_impl.h From 5df4ddd1c3d5686fb8a57e439b7162a447fcfdce Mon Sep 17 00:00:00 2001 From: Jing Fang <126209182+fajin-corp@users.noreply.github.com> Date: Tue, 16 Jul 2024 10:34:19 -0700 Subject: [PATCH 10/35] matmul 4bit tool chain support qdq (#21362) ### Description This is a partial change ported from fajin/qdqmatmulnbitstoolchain. That branch has issues resolving the web CI. MatMulNBits is a heavily optimized matmul operation. Currently, a MatMul can be converted to MatMulNBits to speed up the model inference. However, MatMulNBits is an ORT-only op. To make the graph compatible with ONNX ops and utilize MatMulNBits at the same time, we introduce Q/DQ support for MatMulNBits. To convert MatMul ops in a model to MatMulNBits, use matmul_4bits_quantizer.py to convert MatMul to DQ + MatMul using QDQ mode. In an ORT session, DQ + MatMul is fused into MatMulNBits. #### Note MatMulNBits assumes the B weight is uint4. When no zp is provided, zp defaults to 8, which is different from DQ. DQ defaults zp to 0 when no zp is provided. And DQ supports int4. Therefore, some conversions are introduced during the DQ + MatMul --> MatMulNBits step. #### Perf Using the QDQ format will increase the model initialization time and memory consumption. With the current implementation, model init time increased from ~4s to ~9s, and memory consumption increased from ~2.8GB to ~4.8GB. The memory increase is due to: 1. in the optimizer, after transposing the B weight, an in-memory tensor proto is created using protobuf's arena; 2.
in finalize step, when saving initializer and prepacking, ORT arena is used to create buffers for initializers. The memory allocated by arenas cannot be fully deallocated. If disable ORT arena memory allocation, the memory consumptions of both QDQ format and original format are ~2.2GB. The time increase is mainly due to multiple memory copy, but can be further optimized. ### Motivation and Context Please see description for details. --- .../quantization/matmul_4bits_quantizer.py | 277 +++++++++++++----- .../quantization/test_op_matmul_4bits.py | 54 +++- 2 files changed, 246 insertions(+), 85 deletions(-) diff --git a/onnxruntime/python/tools/quantization/matmul_4bits_quantizer.py b/onnxruntime/python/tools/quantization/matmul_4bits_quantizer.py index 11a830dc6d7f5..40a4a4d26dc1c 100644 --- a/onnxruntime/python/tools/quantization/matmul_4bits_quantizer.py +++ b/onnxruntime/python/tools/quantization/matmul_4bits_quantizer.py @@ -18,31 +18,36 @@ from onnx.onnx_pb import GraphProto, ModelProto, NodeProto, TensorProto from packaging import version -from onnxruntime.capi._pybind_state import quantize_matmul_4bits +from onnxruntime.capi._pybind_state import quantize_matmul_4bits, quantize_qdq_matmul_4bits from .calibrate import CalibrationDataReader from .onnx_model import ONNXModel -from .quant_utils import attribute_to_kwarg +from .quant_utils import QuantFormat, attribute_to_kwarg logging.basicConfig(format="%(asctime)s %(name)s [%(levelname)s] - %(message)s", level=logging.INFO) logger = logging.getLogger(__name__) class WeightOnlyQuantConfig: - def __init__(self, algorithm): + def __init__(self, algorithm, quant_format): """This is the Base class for Weight Only Quant Configuration. Args: algorithm: weight only quantize algorithm name. + quant_format: QuantFormat{QOperator, QDQ}. + QOperator format quantizes the model with quantized operators directly. + QDQ format quantize the model by inserting QuantizeLinear/DeQuantizeLinear on the tensor. """ self.algorithm = algorithm + self.quant_format = quant_format class RTNWeightOnlyQuantConfig(WeightOnlyQuantConfig): def __init__( self, ratios=None, + quant_format=QuantFormat.QOperator, ): """ This is a class for round-to-nearest (RTN) algorithm Weight Only Quant Configuration. @@ -51,11 +56,18 @@ def __init__( Args: ratios: percentile of clip. Defaults to {}. + quant_format (QuantFormat{QOperator, QDQ}, optional): + QOperator format quantizes the model with quantized operators directly. + QDQ format quantize the model by inserting QuantizeLinear/DeQuantizeLinear on the tensor. + Defaults to QuantFormat.QOperator. """ + assert quant_format == QuantFormat.QOperator, "RTN only supports QOperator format" + if ratios is None: ratios = {} super().__init__( algorithm="RTN", + quant_format=quant_format, ) self.ratios = ratios @@ -69,6 +81,7 @@ def __init__( actorder=False, mse=False, perchannel=True, + quant_format=QuantFormat.QOperator, ): """ This is a class for GPTQ algorithm Weight Only Quant Configuration. @@ -87,9 +100,16 @@ def __init__( whether get scale and zero point with mse error. perchannel (bool, optional): whether quantize weight per-channel. + quant_format (QuantFormat{QOperator, QDQ}, optional): + QOperator format quantizes the model with quantized operators directly. + QDQ format quantize the model by inserting QuantizeLinear/DeQuantizeLinear on the tensor. + Defaults to QuantFormat.QOperator. 
""" + assert quant_format == QuantFormat.QOperator, "GPTQ only supports QOperator format" + super().__init__( algorithm="GPTQ", + quant_format=quant_format, ) self.calibration_data_reader = calibration_data_reader self.percdamp = percdamp @@ -105,6 +125,7 @@ def __init__( block_size=128, bits=4, axis=1, + quant_format=QuantFormat.QOperator, ): """ This is a class for HQQ algorithm Weight Only Quant Configuration. @@ -112,14 +133,21 @@ def __init__( Args: block_size (int, optional): - channel number in one block to execute a GPTQ quantization iteration. + channel number in one block to execute a HQQ quantization iteration. bits (int, optional): how many bits to represent weight. axis (int, optional): 0 or 1. which axis to quantize. https://arxiv.org/pdf/2309.15531.pdf + quant_format (QuantFormat{QOperator, QDQ}, optional): + QOperator format quantizes the model with quantized operators directly. + QDQ format quantize the model by inserting QuantizeLinear/DeQuantizeLinear on the tensor. + Defaults to QuantFormat.QOperator. """ + assert quant_format == QuantFormat.QOperator, "HQQ only supports QOperator format" + super().__init__( algorithm="HQQ", + quant_format=quant_format, ) self.block_size = block_size self.bits = bits @@ -132,8 +160,26 @@ def __init__( block_size: int = 128, is_symmetric: bool = False, accuracy_level: int | None = None, + quant_format=QuantFormat.QOperator, ): - super().__init__(algorithm="DEFAULT") + """ + This is a class for weight only affine quantization configuration. + + Args: + block_size (int, optional): + channel number in one block to execute an affine quantization iteration. + is_symmetric (bool, optional): + whether quantize weight symmetrically. + accuracy_level (int, optional): + Accuracy level of the 4-bit quantized MatMul computation. + Refer to the MatMulNBits contrib op's 'accuracy_level' attribute for details. + (https://github.com/microsoft/onnxruntime/blob/main/docs/ContribOperators.md#commicrosoftmatmulnbits) + quant_format (QuantFormat{QOperator, QDQ}, optional): + QOperator format quantizes the model with quantized operators directly. + QDQ format quantize the model by inserting QuantizeLinear/DeQuantizeLinear on the tensor. + Defaults to QuantFormat.QOperator. + """ + super().__init__(algorithm="DEFAULT", quant_format=quant_format) self.block_size = block_size self.is_symmetric = is_symmetric self.bits = 4 @@ -287,23 +333,26 @@ def quantize_internal( return w_q, scale.to(tensor.dtype), zero.to(tensor.dtype) - def quantize(self, node: NodeProto, graph_stack: list[GraphProto]): - """If the node is MatMul with fp32 const weight, quantize the weight with int4, and return the new node""" + def quantize(self, node: NodeProto, graph_stack: list[GraphProto]) -> list[NodeProto]: + """ + If the node is MatMul with fp32 const weight, quantize the weight with int4, and return the new node. + If QOperator format, return MatMulNbits. If QDQ format, return DeQuantizeLinear + MatMul. + """ if node.op_type != "MatMul": - return node # only care about MatMul for now + return [node] # only care about MatMul for now import torch logger.info(f"start to quantize {node.name} ...") - inputB = node.input[1] # noqa: N806 - b_pb, bs_graph = get_initializer(inputB, graph_stack) + input_b = node.input[1] + b_pb, bs_graph = get_initializer(input_b, graph_stack) if b_pb is None: logger.info("MatMul doesn't have const weight. 
Skip to quantize") - return node # only care about constant weight + return [node] # only care about constant weight b_array = onnx.numpy_helper.to_array(b_pb) if len(b_array.shape) != 2: logger.info("MatMul weight is not 2D. Skip to quantize") - return node # can only process 2-D matrix + return [node] # can only process 2-D matrix b_array_torch = torch.from_numpy(b_array) if torch.cuda.is_available(): b_array_torch = b_array_torch.cuda() @@ -334,7 +383,7 @@ def quantize(self, node: NodeProto, graph_stack: list[GraphProto]): b_quant = onnx.numpy_helper.from_array(packed_torch.cpu().numpy()) b_quant.name = b_pb.name + "_Q4" for input in bs_graph.input: - if input.name == inputB: + if input.name == input_b: bs_graph.input.remove(input) break @@ -366,7 +415,7 @@ def quantize(self, node: NodeProto, graph_stack: list[GraphProto]): logger.info(f"complete quantization of {node.name} ...") - return matmul_q4_node + return [matmul_q4_node] def get_initializer(name, graph_path: list[GraphProto]) -> tuple[TensorProto, GraphProto]: @@ -382,7 +431,7 @@ class DefaultWeightOnlyQuantizer: def __init__(self, config: DefaultWeightOnlyQuantConfig): self.config = config - def int4_block_quant(self, fp32weight: npt.ArrayLike) -> np.ndarray: + def int4_block_quant(self, fp32weight: npt.ArrayLike) -> tuple[np.ndarray, np.ndarray, np.ndarray]: """4b quantize fp32 weight to a blob""" if len(fp32weight.shape) != 2: @@ -390,83 +439,136 @@ def int4_block_quant(self, fp32weight: npt.ArrayLike) -> np.ndarray: rows, cols = fp32weight.shape block_size = self.config.block_size - blob_size = block_size // 2 k_blocks = (rows + block_size - 1) // block_size - padded_rows = k_blocks * block_size - pad_len = padded_rows - rows - if pad_len > 0: - fp32weight = np.pad(fp32weight, ((0, pad_len), (0, 0)), "constant") - # block wise quantization, each block comes from a single column - packed = np.zeros((cols, k_blocks, blob_size), dtype="uint8") - scales = np.zeros((cols * k_blocks), dtype=fp32weight.dtype) - zero_point = np.zeros(cols * ((k_blocks + 1) // 2), dtype="uint8") - quantize_matmul_4bits(packed, fp32weight, scales, zero_point, block_size, cols, rows, self.config.is_symmetric) + if self.config.quant_format == QuantFormat.QOperator: + blob_size = block_size // 2 + padded_rows = k_blocks * block_size + pad_len = padded_rows - rows + if pad_len > 0: + fp32weight = np.pad(fp32weight, ((0, pad_len), (0, 0)), "constant") + + # block wise quantization, each block comes from a single column + packed = np.zeros((cols, k_blocks, blob_size), dtype="uint8") + zero_point = np.zeros(cols * ((k_blocks + 1) // 2), dtype="uint8") + scales = np.zeros((cols * k_blocks), dtype=fp32weight.dtype) + quantize_matmul_4bits( + packed, fp32weight, scales, zero_point, block_size, cols, rows, self.config.is_symmetric + ) + else: + packed = np.zeros((rows * cols + 1) // 2, dtype="uint8") + zero_point = np.zeros((cols * k_blocks + 1) // 2, dtype="uint8") + scales = np.zeros((k_blocks, cols), dtype=fp32weight.dtype) + quantize_qdq_matmul_4bits( + packed, fp32weight, scales, zero_point, block_size, cols, rows, self.config.is_symmetric + ) return (packed, scales, zero_point) - def quantize(self, node: NodeProto, graph_stack: list[GraphProto]) -> NodeProto: - """If the node is MatMul with fp32 const weight, quantize the weight with int4, and return the new node""" + def quantize(self, node: NodeProto, graph_stack: list[GraphProto]) -> list[NodeProto]: + """ + If the node is MatMul with fp32 const weight, quantize the weight with int4, and return the new 
node. + If QOperator format, return MatMulNbits. If QDQ format, return DeQuantizeLinear + MatMul. + """ if node.op_type != "MatMul": - return node # only care about MatMul for now + return [node] # only care about MatMul for now logger.info(f"start to quantize {node.name} ...") - inputB = node.input[1] # noqa: N806 - B, Bs_graph = get_initializer(inputB, graph_stack) # noqa: N806 - if B is None: + qtype = TensorProto.INT4 if self.config.is_symmetric else TensorProto.UINT4 + input_b = node.input[1] + b_tensor, b_graph = get_initializer(input_b, graph_stack) + if b_tensor is None: logger.info("MatMul doesn't have const weight. Skip to quantize") - return node # only care about constant weight + return [node] # only care about constant weight - B_array = onnx.numpy_helper.to_array(B) # noqa: N806 - if len(B_array.shape) != 2: + b_ndarray = onnx.numpy_helper.to_array(b_tensor) + if len(b_ndarray.shape) != 2: logger.info("MatMul weight is not 2D. Skip to quantize") - return node # can only process 2-D matrix - - packed, scales, zero_points = self.int4_block_quant(B_array) - B_quant = onnx.numpy_helper.from_array(packed) # noqa: N806 - B_quant.name = B.name + "_Q4" - for input in Bs_graph.input: - if input.name == inputB: - Bs_graph.input.remove(input) - break + return [node] # can only process 2-D matrix - scales_tensor = onnx.numpy_helper.from_array(scales) - scales_tensor.name = B.name + "_scales" - Bs_graph.initializer.extend([B_quant, scales_tensor]) + packed, scales, zero_points = self.int4_block_quant(b_ndarray) - input_names = [node.input[0], B_quant.name, scales_tensor.name] - if not self.config.is_symmetric: - zp_tensor = onnx.numpy_helper.from_array(zero_points) - zp_tensor.name = B.name + "_zero_points" - Bs_graph.initializer.extend([zp_tensor]) - input_names.append(zp_tensor.name) + if self.config.quant_format == QuantFormat.QOperator: + b_quant = onnx.numpy_helper.from_array(packed, b_tensor.name + "_Q4") + scales_tensor = onnx.numpy_helper.from_array(scales, b_tensor.name + "_scales") + else: + b_quant = onnx.helper.make_tensor(b_tensor.name + "_DQ_Q4", qtype, b_ndarray.shape, packed.tobytes(), True) + scales_tensor = onnx.numpy_helper.from_array(scales, b_tensor.name + "_DQ_scales") - kwargs = {} - rows, cols = B_array.shape - kwargs["K"] = rows - kwargs["N"] = cols - kwargs["bits"] = 4 - kwargs["block_size"] = self.config.block_size - if self.config.accuracy_level is not None: - kwargs["accuracy_level"] = self.config.accuracy_level + for input in b_graph.input: + if input.name == input_b: + b_graph.input.remove(input) + break - matmul_q4_node = onnx.helper.make_node( - "MatMulNBits", - inputs=input_names, - outputs=[node.output[0]], - name=node.name + "_Q4" if node.name else "", - domain="com.microsoft", - **kwargs, - ) + b_graph.initializer.extend([b_quant, scales_tensor]) + + output_nodes = [] + + if self.config.quant_format == QuantFormat.QOperator: + input_names = [node.input[0], b_quant.name, scales_tensor.name] + if not self.config.is_symmetric: + zp_tensor = onnx.numpy_helper.from_array(zero_points, b_tensor.name + "_zero_points") + input_names.append(zp_tensor.name) + b_graph.initializer.extend([zp_tensor]) + kwargs = {} + rows, cols = b_ndarray.shape + kwargs["K"] = rows + kwargs["N"] = cols + kwargs["bits"] = 4 + kwargs["block_size"] = self.config.block_size + if self.config.accuracy_level is not None: + kwargs["accuracy_level"] = self.config.accuracy_level + + matmul_q4_node = onnx.helper.make_node( + "MatMulNBits", + inputs=input_names, + outputs=[node.output[0]], + 
name=node.name + "_Q4" if node.name else "", + domain="com.microsoft", + **kwargs, + ) - logger.info(f"complete quantization of {node.name} ...") + output_nodes.append(matmul_q4_node) + else: + dq_input_names = [b_quant.name, scales_tensor.name] + dq_output_names = [b_quant.name + "_output"] + matmul_input_names = [node.input[0], dq_output_names[0]] + matmul_output_names = [node.output[0]] + if not self.config.is_symmetric: + zp_tensor = onnx.helper.make_tensor( + b_tensor.name + "_DQ_zero_points", qtype, scales.shape, zero_points.tobytes(), True + ) + dq_input_names.append(zp_tensor.name) + b_graph.initializer.extend([zp_tensor]) + dq_kwargs = {"axis": 0, "block_size": self.config.block_size} + dq_node = onnx.helper.make_node( + "DequantizeLinear", + inputs=dq_input_names, + outputs=dq_output_names, + name=node.name + "_DQ_Q4" if node.name else "", + **dq_kwargs, + ) + matmul_node = onnx.helper.make_node( + "MatMul", + inputs=matmul_input_names, + outputs=matmul_output_names, + name=node.name + "_matmul_Q4" if node.name else "", + ) + output_nodes.extend([dq_node, matmul_node]) - return matmul_q4_node + logger.info(f"complete quantization of {node.name} ...") + return output_nodes class MatMul4BitsQuantizer: - """Perform 4b quantization of constant MatMul weights""" + """ + Perform 4b quantization of constant MatMul weights. + If algo_config.quant_format is QOperator, the quantized weight is stored in a MatMulNBits node, which relaces the + MatMul node. + If algo_config.quant_format is QDQ, the quantized weight is stored in a DeQuantizeLinear node. The MatMul node is + replaced by the DequantizeLinear + MatMul nodes. + """ def __init__( self, @@ -475,7 +577,8 @@ def __init__( is_symmetric: bool = False, accuracy_level: int | None = None, nodes_to_exclude=None, - algo_config: WeightOnlyQuantConfig = None, + quant_format=QuantFormat.QOperator, + algo_config: WeightOnlyQuantConfig | None = None, ): if nodes_to_exclude is None: nodes_to_exclude = [] @@ -488,7 +591,10 @@ def __init__( self.node_quantizer = None if algo_config is None: algo_config = DefaultWeightOnlyQuantConfig( - block_size=block_size, is_symmetric=is_symmetric, accuracy_level=accuracy_level + block_size=block_size, + is_symmetric=is_symmetric, + accuracy_level=accuracy_level, + quant_format=quant_format, ) self.algo_config = algo_config if algo_config.algorithm == "HQQ": @@ -526,15 +632,15 @@ def _process_subgraph(self, graph_stack: list[GraphProto]): node = onnx.helper.make_node( # noqa: PLW2901 node.op_type, node.input, node.output, name=node.name, **kwargs ) - out_node = None + out_nodes = [] if node.name in self.nodes_to_exclude: logger.info(f"exclude to quantize {node.name} as specified by nodes_to_exclude...") - out_node = node + out_nodes = [node] elif self.algo_config is not None and self.algo_config.algorithm == "HQQ": - out_node = self.node_quantizer.quantize(node, graph_stack) + out_nodes = self.node_quantizer.quantize(node, graph_stack) else: - out_node = self.node_quantizer.quantize(node, graph_stack) - new_nodes.append(out_node) + out_nodes = self.node_quantizer.quantize(node, graph_stack) + new_nodes.extend(out_nodes) graph.ClearField("node") graph.node.extend(new_nodes) @@ -688,6 +794,15 @@ def parse_args(): default=[], help="Specify the nodes to be excluded from quantization with node names", ) + parser.add_argument( + "--quant_format", + default="QOperator", + type=QuantFormat, + choices=list(QuantFormat), + help="QuantFormat {QOperator, QDQ}" + "QOperator format quantizes the model with quantized operators 
directly." + "QDQ format quantize the model by inserting DeQuantizeLinear before the MatMul.", + ) return parser.parse_args() @@ -699,6 +814,7 @@ def parse_args(): input_model_path = args.input_model output_model_path = args.output_model + quant_format = args.quant_format if os.path.exists(output_model_path): logger.error(f"file {output_model_path} already exists") @@ -713,7 +829,10 @@ def parse_args(): quant_config = HQQWeightOnlyQuantConfig(block_size=args.block_size, bits=args.bits) elif args.quant_method == "default": quant_config = DefaultWeightOnlyQuantConfig( - block_size=args.block_size, is_symmetric=args.symmetric, accuracy_level=args.accuracy_level + block_size=args.block_size, + is_symmetric=args.symmetric, + accuracy_level=args.accuracy_level, + quant_format=quant_format, ) elif args.quant_method == "rtn": quant_config = RTNWeightOnlyQuantConfig() diff --git a/onnxruntime/test/python/quantization/test_op_matmul_4bits.py b/onnxruntime/test/python/quantization/test_op_matmul_4bits.py index 88e5052db4e2e..4cc8a0c151d14 100644 --- a/onnxruntime/test/python/quantization/test_op_matmul_4bits.py +++ b/onnxruntime/test/python/quantization/test_op_matmul_4bits.py @@ -14,7 +14,7 @@ import numpy as np import onnx from onnx import TensorProto, helper -from op_test_utils import TestDataFeeds, check_model_correctness, check_op_type_count +from op_test_utils import TestDataFeeds, check_model_correctness, check_op_type_count, check_qtype_by_node_type from onnxruntime.quantization import quant_utils @@ -105,8 +105,9 @@ def make_matmul( [output_tensor], initializer=initializers, ) - model = helper.make_model(graph, opset_imports=[helper.make_opsetid("", 13)]) - model.ir_version = 7 # use stable onnx ir version + # blocked quantization requires DQ op set >= 21 + model = helper.make_model(graph, opset_imports=[helper.make_opsetid("", 21)]) + model.ir_version = 10 # use stable onnx ir version onnx.save(model, output_model_path) @@ -116,9 +117,12 @@ def quant_test( data_reader: TestDataFeeds, block_size: int, is_symmetric: bool, + quant_format: quant_utils.QuantFormat = quant_utils.QuantFormat.QOperator, ): + use_qdq = quant_format == quant_utils.QuantFormat.QDQ + name_prefix = "DQ_MatMul" if use_qdq else "MatMulNBits" model_int4_path = str( - Path(self._tmp_model_dir.name).joinpath(f"MatMulNBits_{block_size}_{is_symmetric}.onnx").absolute() + Path(self._tmp_model_dir.name).joinpath(f"{name_prefix}_{block_size}_{is_symmetric}.onnx").absolute() ) # Quantize fp32 model to int4 model @@ -126,15 +130,33 @@ def quant_test( model = quant_utils.load_model_with_shape_infer(Path(model_fp32_path)) quant_config = matmul_4bits_quantizer.DefaultWeightOnlyQuantConfig( - block_size=block_size, is_symmetric=is_symmetric + block_size=block_size, is_symmetric=is_symmetric, quant_format=quant_format ) quant = matmul_4bits_quantizer.MatMul4BitsQuantizer(model, algo_config=quant_config) quant.process() quant.model.save_model_to_file(model_int4_path, False) - quant_nodes = {"MatMulNBits": 1} + quant_nodes = {"DequantizeLinear": 1, "MatMul": 1} if use_qdq else {"MatMulNBits": 1} check_op_type_count(self, model_int4_path, **quant_nodes) + if use_qdq: + dq_qtype = TensorProto.INT4 if is_symmetric else TensorProto.UINT4 + dqnode_io_qtypes = ( + { + "DequantizeLinear": [ + ["i", 0, dq_qtype], + ] + } + if is_symmetric + else { + "DequantizeLinear": [ + ["i", 0, dq_qtype], + ["i", 2, dq_qtype], + ] + } + ) + check_qtype_by_node_type(self, model_int4_path, dqnode_io_qtypes) + data_reader.rewind() try: @@ -211,6 +233,26 @@ def 
test_quantize_matmul_int4_offsets(self): data_reader = self.input_feeds(1, {"input": [100, 52]}) self.quant_test(model_fp32_path, data_reader, 32, False) + @unittest.skipIf( + find_spec("onnxruntime.training"), "Skip because training package doesn't has quantize_matmul_4bits" + ) + def test_quantize_matmul_int4_symmetric_qdq(self): + np.random.seed(13) + + model_fp32_path = str(Path(self._tmp_model_dir.name).joinpath("matmul_fp32_symmetric.onnx").absolute()) + self.construct_model_matmul(model_fp32_path, symmetric=True) + data_reader = self.input_feeds(1, {"input": [100, 52]}) + self.quant_test(model_fp32_path, data_reader, 32, True, quant_utils.QuantFormat.QDQ) + + @unittest.skipIf( + find_spec("onnxruntime.training"), "Skip because training package doesn't has quantize_matmul_4bits" + ) + def test_quantize_matmul_int4_offsets_qdq(self): + model_fp32_path = str(Path(self._tmp_model_dir.name).joinpath("matmul_fp32_offset.onnx").absolute()) + self.construct_model_matmul(model_fp32_path, symmetric=False) + data_reader = self.input_feeds(1, {"input": [100, 52]}) + self.quant_test(model_fp32_path, data_reader, 32, False, quant_utils.QuantFormat.QDQ) + @unittest.skipIf( find_spec("onnxruntime.training"), "Skip because training package doesn't has quantize_matmul_4bits" ) From dcc04367b77136f67d4eb8b9927f4cbd742f0204 Mon Sep 17 00:00:00 2001 From: Yueqing Zhang Date: Wed, 17 Jul 2024 04:48:29 +0800 Subject: [PATCH 11/35] [VitisAI] fix graph save (#21293) ### Description Revert the wrong change in https://github.com/microsoft/onnxruntime/pull/20920 ### Motivation and Context It would save the data at a wrong position --- onnxruntime/core/providers/vitisai/imp/global_api.cc | 6 +++--- onnxruntime/core/providers/vitisai/imp/graph.cc | 5 ++--- 2 files changed, 5 insertions(+), 6 deletions(-) diff --git a/onnxruntime/core/providers/vitisai/imp/global_api.cc b/onnxruntime/core/providers/vitisai/imp/global_api.cc index 1a3cc5979ff5a..8c1dce0d3dc1a 100644 --- a/onnxruntime/core/providers/vitisai/imp/global_api.cc +++ b/onnxruntime/core/providers/vitisai/imp/global_api.cc @@ -126,7 +126,7 @@ static std::string config_to_json_str(const onnxruntime::ProviderOptions& config vaip_core::DllSafe>> compile_onnx_model( const onnxruntime::GraphViewer& graph_viewer, const logging::Logger& logger, const ProviderOptions& options) { - auto model_path = PathToUTF8String(ToPathString(graph_viewer.ModelPath().string())); + auto model_path = graph_viewer.ModelPath().string(); if (s_library_vitisaiep.compile_onnx_model_with_options) { return vaip_core::DllSafe(s_library_vitisaiep.compile_onnx_model_with_options(model_path, graph_viewer.GetGraph(), options)); } else { @@ -227,9 +227,9 @@ vaip_core::OrtApiForVaip* create_org_api_hook() { auto& logger = logging::LoggingManager::DefaultLogger(); auto& model = const_cast(const_model); auto model_proto = model.ToProto(); - auto file_path = ToPathString(model.MainGraph().ModelPath().string()); + auto file_path = model.MainGraph().ModelPath(); auto local_registries = IOnnxRuntimeOpSchemaRegistryList{model.MainGraph().GetSchemaRegistry()}; - auto ret = Model::Create(std::move(*model_proto), file_path, &local_registries, logger); + auto ret = Model::Create(std::move(*model_proto), ToPathString(file_path), &local_registries, logger); auto status = ret->MainGraph().Resolve(); vai_assert(status.IsOK(), status.ErrorMessage()); return ret.release(); diff --git a/onnxruntime/core/providers/vitisai/imp/graph.cc b/onnxruntime/core/providers/vitisai/imp/graph.cc index 
40b396fda6135..3f46fbde8c714 100644 --- a/onnxruntime/core/providers/vitisai/imp/graph.cc +++ b/onnxruntime/core/providers/vitisai/imp/graph.cc @@ -107,12 +107,11 @@ void graph_save(const Graph& graph, const std::string& filename, const std::stri auto graph_proto_subgraph = graph.ToGraphProto(); *model_proto->mutable_graph() = *graph_proto_subgraph; auto& logger = logging::LoggingManager::DefaultLogger(); - auto filename_data_relative_path = std::filesystem::path(); auto model = Model::Create(std::move(*model_proto), ToPathString(filename), nullptr, logger); if (initializer_size_threshold == std::numeric_limits::max()) { model_proto = model->ToProto(); } else { - model_proto = model->ToGraphProtoWithExternalInitializers(filename_dat, graph.ModelPath(), initializer_size_threshold); + model_proto = model->ToGraphProtoWithExternalInitializers(ToPathString(filename_dat), ToPathString(filename), initializer_size_threshold); } auto& metadata = model->MetaData(); if (!metadata.empty()) { @@ -124,7 +123,7 @@ void graph_save(const Graph& graph, const std::string& filename, const std::stri *prop->mutable_value() = m.second; } } - std::fstream output(filename, std::ios::out | std::ios::trunc | std::ios::binary); + std::fstream output(ToPathString(filename), std::ios::out | std::ios::trunc | std::ios::binary); bool result = model_proto->SerializeToOstream(output); output << std::flush; vai_assert(result, "model serialize to ostream error"); From 760a31c8481922cbc12fa6e0b2d13f67c175aaa9 Mon Sep 17 00:00:00 2001 From: Tianlei Wu Date: Tue, 16 Jul 2024 15:58:11 -0700 Subject: [PATCH 12/35] Exclude blkq4_fp16_gemm_sm80_test in cuda 12.5 build (#21373) There are build errors when building with CUDA 12.5 and `--cmake_extra_defines onnxruntime_BUILD_UNIT_TESTS=ON onnxruntime_ENABLE_CUDA_EP_INTERNAL_TESTS=ON`. Temporarily exclude blkq4_fp16_gemm_sm80_test to unblock the CUDA 12.5 build. --- .../test/providers/cuda/test_cases/blkq4_fp16_gemm_sm80.h | 3 ++- .../providers/cuda/test_cases/blkq4_fp16_gemm_sm80_test.cc | 3 ++- .../providers/cuda/test_cases/blkq4_fp16_gemm_sm80_testcu.cu | 5 +++++ 3 files changed, 9 insertions(+), 2 deletions(-) diff --git a/onnxruntime/test/providers/cuda/test_cases/blkq4_fp16_gemm_sm80.h b/onnxruntime/test/providers/cuda/test_cases/blkq4_fp16_gemm_sm80.h index fa1c739c04e3a..f96c8ce9ce729 100644 --- a/onnxruntime/test/providers/cuda/test_cases/blkq4_fp16_gemm_sm80.h +++ b/onnxruntime/test/providers/cuda/test_cases/blkq4_fp16_gemm_sm80.h @@ -13,7 +13,7 @@ */ #pragma once - +#if defined(CUDA_VERSION) && CUDA_VERSION <= 12030 #include "test/cuda_host/blkq4_fp16_quant_sm80.h" #include @@ -197,3 +197,4 @@ void run_blkq4_small_gemm(int m, int n, int k); } // namespace test } // namespace cuda } // namespace onnxruntime +#endif diff --git a/onnxruntime/test/providers/cuda/test_cases/blkq4_fp16_gemm_sm80_test.cc b/onnxruntime/test/providers/cuda/test_cases/blkq4_fp16_gemm_sm80_test.cc index b95e093e41eab..3fcb9045ee7e6 100644 --- a/onnxruntime/test/providers/cuda/test_cases/blkq4_fp16_gemm_sm80_test.cc +++ b/onnxruntime/test/providers/cuda/test_cases/blkq4_fp16_gemm_sm80_test.cc @@ -10,7 +10,7 @@ * This part requires gtest header files, which do not play * well with CUTLASS headers.
*/ - +#if defined(CUDA_VERSION) && CUDA_VERSION <= 12030 #include "blkq4_fp16_gemm_sm80.h" #include "gtest/gtest.h" @@ -341,3 +341,4 @@ TEST(BlkQ4_GEMM, Sm80SmallTileKernelTest) { } // namespace test } // namespace onnxruntime +#endif diff --git a/onnxruntime/test/providers/cuda/test_cases/blkq4_fp16_gemm_sm80_testcu.cu b/onnxruntime/test/providers/cuda/test_cases/blkq4_fp16_gemm_sm80_testcu.cu index f5600ca9885a3..8b27c3d8c3aed 100644 --- a/onnxruntime/test/providers/cuda/test_cases/blkq4_fp16_gemm_sm80_testcu.cu +++ b/onnxruntime/test/providers/cuda/test_cases/blkq4_fp16_gemm_sm80_testcu.cu @@ -11,6 +11,9 @@ * well with gtest headers. */ +// This test has build error with cuda 12.5 +#if defined(CUDA_VERSION) && CUDA_VERSION <= 12030 + #include "blkq4_fp16_gemm_sm80.h" #include @@ -532,3 +535,5 @@ template void run_blkq4_small_gemm<128, false, false>(int m, int n, int k); } // namespace test } // namespace cuda } // namespace onnxruntime + +#endif From fa287042ca05debb35b07f65147a9c3a39330231 Mon Sep 17 00:00:00 2001 From: vraspar Date: Tue, 16 Jul 2024 16:34:58 -0700 Subject: [PATCH 13/35] Add ML Program support for transpose op (#21364) ### Description Add support for transpose op ### Motivation and Context Enable support for Autodesk model --- .../builders/impl/transpose_op_builder.cc | 28 +++++++++++++++---- .../apple/coreml_supported_mlprogram_ops.md | 1 + 2 files changed, 23 insertions(+), 6 deletions(-) diff --git a/onnxruntime/core/providers/coreml/builders/impl/transpose_op_builder.cc b/onnxruntime/core/providers/coreml/builders/impl/transpose_op_builder.cc index f6a61d55a3d63..831c4cf4d08ba 100644 --- a/onnxruntime/core/providers/coreml/builders/impl/transpose_op_builder.cc +++ b/onnxruntime/core/providers/coreml/builders/impl/transpose_op_builder.cc @@ -3,6 +3,7 @@ #include "core/providers/coreml/builders/helper.h" #include "core/providers/coreml/builders/impl/base_op_builder.h" +#include "core/providers/coreml/builders/impl/builder_utils.h" #include "core/providers/coreml/builders/model_builder.h" #include "core/providers/coreml/builders/op_builder_factory.h" #include "core/providers/coreml/shape_utils.h" @@ -14,13 +15,13 @@ namespace coreml { class TransposeOpBuilder : public BaseOpBuilder { Status AddToModelBuilderImpl(ModelBuilder& model_builder, const Node& node, const logging::Logger& logger) const override; + + bool SupportsMLProgram() const override { return true; } }; Status TransposeOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, const Node& node, const logging::Logger& logger) const { - std::unique_ptr layer = model_builder.CreateNNLayer(node); - NodeAttrHelper helper(node); std::vector perm = helper.Get("perm", std::vector()); std::vector input_shape; @@ -33,12 +34,27 @@ Status TransposeOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, ORT_RETURN_IF_NOT(perm.size() == input_dims, "Perm and input should have same dimension"); } - *layer->mutable_transpose()->mutable_axes() = {perm.cbegin(), perm.cend()}; +#if defined(COREML_ENABLE_MLPROGRAM) + if (model_builder.CreateMLProgram()) { + using namespace CoreML::Specification::MILSpec; + + std::unique_ptr op = model_builder.CreateOperation(node, "transpose"); + AddOperationInput(*op, "x", node.InputDefs()[0]->Name()); + AddOperationInput(*op, "perm", model_builder.AddConstant(op->type(), "perm", perm)); + AddOperationOutput(*op, *node.OutputDefs()[0]); + model_builder.AddOperation(std::move(op)); - *layer->mutable_input()->Add() = node.InputDefs()[0]->Name(); - *layer->mutable_output()->Add() = 
node.OutputDefs()[0]->Name(); + } else +#endif // defined(COREML_ENABLE_MLPROGRAM) + { + std::unique_ptr layer = model_builder.CreateNNLayer(node); + *layer->mutable_transpose()->mutable_axes() = {perm.cbegin(), perm.cend()}; - model_builder.AddLayer(std::move(layer)); + *layer->mutable_input()->Add() = node.InputDefs()[0]->Name(); + *layer->mutable_output()->Add() = node.OutputDefs()[0]->Name(); + + model_builder.AddLayer(std::move(layer)); + } return Status::OK(); } diff --git a/tools/ci_build/github/apple/coreml_supported_mlprogram_ops.md b/tools/ci_build/github/apple/coreml_supported_mlprogram_ops.md index 9961d2668e6f5..1bbb933f66ba4 100644 --- a/tools/ci_build/github/apple/coreml_supported_mlprogram_ops.md +++ b/tools/ci_build/github/apple/coreml_supported_mlprogram_ops.md @@ -20,3 +20,4 @@ Keep in sync with doco generated from /docs/execution-providers/CoreML-Execution |ai.onnx:Sub|| |ai.onnx:Sigmoid|| |ai:onnx:Tanh|| +|ai:onnx:Transpose|| From 0f4c39ec47d7152520d5389aa3ac821db82fab0a Mon Sep 17 00:00:00 2001 From: Tianlei Wu Date: Wed, 17 Jul 2024 07:35:12 -0700 Subject: [PATCH 14/35] [ROCM] adjust test_flash_attn_rocm test tolerance (#21379) The test_flash_attn_rocm.py from https://github.com/microsoft/onnxruntime/pull/21032 failed frequently. For example, I saw two failed jobs today: E Max absolute difference: 0.002167 E Max absolute difference: 0.002686 Adjust the abs threshold from 0.002 to 0.005, and use default relative tolerance rtol=0.001. --- .../python/transformers/test_flash_attn_rocm.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/onnxruntime/test/python/transformers/test_flash_attn_rocm.py b/onnxruntime/test/python/transformers/test_flash_attn_rocm.py index fe7e39722237f..880f4175e00b7 100644 --- a/onnxruntime/test/python/transformers/test_flash_attn_rocm.py +++ b/onnxruntime/test/python/transformers/test_flash_attn_rocm.py @@ -35,8 +35,8 @@ def test_gqa_no_past_flash_attention(self, _, config, local, rotary, rotary_inte rotary=rotary, rotary_interleaved=rotary_interleaved, packed=packed, - rtol=0.002, - atol=0.002, + rtol=0.001, + atol=0.005, ) parity_check_gqa_prompt_no_buff( config, @@ -45,8 +45,8 @@ def test_gqa_no_past_flash_attention(self, _, config, local, rotary, rotary_inte rotary=rotary, rotary_interleaved=rotary_interleaved, packed=packed, - rtol=0.002, - atol=0.002, + rtol=0.001, + atol=0.005, ) @parameterized.expand(gqa_past_flash_attention_test_cases()) @@ -67,8 +67,8 @@ def test_gqa_past_flash_attention(self, _, config, local, rotary, rotary_interle rotary=rotary, rotary_interleaved=rotary_interleaved, packed=packed, - rtol=0.002, - atol=0.002, + rtol=0.001, + atol=0.005, ) parity_check_gqa_past_no_buff( config, @@ -77,8 +77,8 @@ def test_gqa_past_flash_attention(self, _, config, local, rotary, rotary_interle rotary=rotary, rotary_interleaved=rotary_interleaved, packed=packed, - rtol=0.002, - atol=0.002, + rtol=0.001, + atol=0.005, ) From 6c7562b09731723d015aa9bc27abc069467d03ff Mon Sep 17 00:00:00 2001 From: Ranjit Ranjan <165394499+ranjitshs@users.noreply.github.com> Date: Thu, 18 Jul 2024 01:07:06 +0530 Subject: [PATCH 15/35] Enablement of onnxruntime for AIX and fixing issues related to big-endian platform. (#21133) ### Description Enablement of onnxruntime for AIX and fixing issues related to big-endian platform. ### Motivation and Context changes in this PR contains: 1. Enablement code for building onnxruntime on AIX operating system. 2. 
while testing the build on AIX, we found issues related to the big-endian platform. More details about a few of those issues can be found in [Big endian issue: Graph Transformation Attention Fusion tests are failing #12921](https://github.com/microsoft/onnxruntime/issues/12921) Below is a list of the files and a description of each change. 1. cmake/CMakeLists.txt [BUILDING on AIX issue] check for "IBMClang" is added for handling -Wno-unused-parameter 2. cmake/external/onnxruntime_external_deps.cmake [BUILDING on AIX issue] Enabling gtest_disable_pthreads for AIX 3. cmake/onnxruntime.cmake [BUILDING on AIX issue] o Blocking code for AIX which generates generated_source.c and further requires some symbol files. o Putting NO AIX check for non-supported linker flags like --Xlinker o iconv linking 4. cmake/onnxruntime_framework.cmake [BUILDING on AIX issue] Putting NO AIX check for -Wl,-rpath='$ORIGIN' 5. cmake/onnxruntime_mlas.cmake [BUILDING on AIX issue] POWER10 related macro/function definition. 6. cmake/onnxruntime_providers_cpu.cmake [BUILDING on AIX issue] Putting NO AIX check for non-supported linker flags like --Xlinker 7. cmake/onnxruntime_unittests.cmake [BUILDING on AIX issue] o Putting NO AIX check for non-supported linker flags like --Xlinker o Adding required libraries for the AIX linker under applications like onnxruntime_shared_lib_test, onnxruntime_logging_apis, etc. 8. cmake/patches/flatbuffers/flatbuffers.patch [BUILDING on AIX issue] Handling of TypeCode in include/flatbuffers/flatbuffers.h under AIX + clang 9. onnxruntime/contrib_ops/cpu/murmur_hash3.cc [Big endian issue] Byte-conversion handling in compute() and getblock() routines 10. onnxruntime/contrib_ops/cpu/quantization/matmul_nbits_impl.cc [Big endian issue] Handling of test failures. Byte swapping for quant_value. 11. onnxruntime/core/framework/tensorprotoutils.cc [Big endian issue] Implementation of SetRawDataInTensorProto, ConvertRawDataInTensorProto. o SetRawDataInTensorProto: wrapper for set_raw_data(); calls ConvertRawDataInTensorProto() on big-endian systems o ConvertRawDataInTensorProto: function used mainly on big-endian systems for byte-swapping of tensor raw_data 12. onnxruntime/core/framework/tensorprotoutils.h [Big endian issue] Declaration of SetRawDataInTensorProto, ConvertRawDataInTensorProto 13. onnxruntime/core/graph/graph.cc [Big endian issue] o Call ConvertRawDataInTensorProto for SPARSE_TENSOR type o Call ConvertRawDataInTensorProto for SaveToOrtFormat 14. onnxruntime/core/mlas/lib/platform.cpp [BUILDING on AIX issue] POWER10 related enablement for AIX 15. onnxruntime/core/mlas/lib/power/qgemm_kernel_power10.cpp [BUILDING on AIX issue] Handling of __vector under AIX+clang 16. onnxruntime/core/mlas/lib/qgemm.h [BUILDING on AIX issue] Adding _AIX flag 17. onnxruntime/core/mlas/lib/qlmul.cpp [BUILDING on AIX issue] Handling of __vector under AIX+clang 18. onnxruntime/core/optimizer/attention_fusion.cc [Big endian issue] Use util function SetRawDataInTensorProto, instead of set_raw_data 19. onnxruntime/core/optimizer/compute_optimizer/shared_utils.cc [Big endian issue] Use util function SetRawDataInTensorProto, instead of set_raw_data 20. onnxruntime/core/optimizer/constant_folding.cc [Big endian issue] Use util function SetRawDataInTensorProto, instead of set_raw_data 21. onnxruntime/core/optimizer/embed_layer_norm_fusion.cc [Big endian issue] Use util function SetRawDataInTensorProto, instead of set_raw_data 22.
22. onnxruntime/core/optimizer/nchwc_transformer.cc [Big endian issue] Use util function SetRawDataInTensorProto instead of set_raw_data.
23. onnxruntime/core/optimizer/qdq_transformer/avx2_weight_s8_to_u8.cc [Big endian issue] Use util function SetRawDataInTensorProto instead of set_raw_data.
24. onnxruntime/core/optimizer/qdq_transformer/qdq_s8_to_u8.cc [Big endian issue] Use util function SetRawDataInTensorProto instead of set_raw_data.
25. onnxruntime/core/optimizer/qdq_transformer/s8_to_u8.h [Big endian issue] Use util function SetRawDataInTensorProto instead of set_raw_data.
26. onnxruntime/core/optimizer/qdq_transformer/selectors_actions/qdq_actions.cc [Big endian issue] Use util function SetRawDataInTensorProto instead of set_raw_data.
27. onnxruntime/core/optimizer/reshape_fusion.cc [Big endian issue] Use util function SetRawDataInTensorProto instead of set_raw_data.
28. onnxruntime/core/optimizer/stft_decomposition.cc [Big endian issue] Use util function SetRawDataInTensorProto instead of set_raw_data.
29. onnxruntime/core/optimizer/transpose_optimization/ort_optimizer_api_impl.cc [Big endian issue] Use util function SetRawDataInTensorProto instead of set_raw_data.
30. onnxruntime/core/platform/path_lib.h [BUILDING on AIX issue] Moving to a normal function call instead of a template.
31. onnxruntime/core/platform/posix/env.cc [BUILDING on AIX issue] Blocking syscall.h on AIX.
32. onnxruntime/core/session/inference_session.cc [Big endian issue] Removing the FLATBUFFERS_LITTLEENDIAN checks (ORT_RETURN_IF_NOT, static_assert).
33. onnxruntime/test/flatbuffers/flatbuffer_utils_test.cc [Big endian issue] Call ConvertRawDataInTensorProto in CreateInitializer and ExternalWriteReadWithLoadInitializers.
34. onnxruntime/test/framework/sparse_kernels_test.cc [Big endian issue] Use util function SetRawDataInTensorProto instead of set_raw_data.
35. onnxruntime/test/framework/tensorutils_test.cc [Big endian issue] Helper method ConvertEndianessForVector, called from the required places.
36. onnxruntime/test/framework/test_tensor_loader.cc o [BUILDING on AIX issue] Handling of getcwd for AIX. o [Big endian issue] Byte swapping in run_external_data_test.
37. onnxruntime/test/onnx/main.cc [Big endian issue] Adding a required include for AIX.
38. onnxruntime/test/onnx/tensorprotoutils.cc [Big endian issue] Byte swapping in UnpackTensorWithRawData.
39. onnxruntime/test/optimizer/graph_transform_test.cc [Big endian issue] Use util function SetRawDataInTensorProto instead of set_raw_data.
40. onnxruntime/test/optimizer/graph_transform_test_builder.cc [Big endian issue] Use util function SetRawDataInTensorProto instead of set_raw_data.
41. onnxruntime/test/optimizer/graph_transform_test_builder.h [Big endian issue] Use util function SetRawDataInTensorProto instead of set_raw_data.
42. onnxruntime/test/optimizer/initializer_test.cc [Big endian issue] Use util function SetRawDataInTensorProto instead of set_raw_data.
43. onnxruntime/test/optimizer/nchwc_optimizer_test.cc [Big endian issue] Use util function SetRawDataInTensorProto instead of set_raw_data.
44. onnxruntime/test/providers/base_tester.cc [Big endian issue] Use util function SetRawDataInTensorProto instead of set_raw_data.
45.
onnxruntime/test/providers/cpu/generator/random_test.cc [BUILDING on AIX issue] Adding AIX check in MultinomialGoodCase --------- Co-authored-by: Vamshikrishna Thatikonda --- cmake/CMakeLists.txt | 2 +- .../external/onnxruntime_external_deps.cmake | 3 + cmake/onnxruntime.cmake | 43 +++++-- cmake/onnxruntime_framework.cmake | 2 +- cmake/onnxruntime_mlas.cmake | 14 ++- cmake/onnxruntime_providers_cpu.cmake | 4 +- cmake/onnxruntime_unittests.cmake | 25 +++- cmake/patches/flatbuffers/flatbuffers.patch | 18 +++ onnxruntime/contrib_ops/cpu/murmur_hash3.cc | 62 ++++++++-- .../cpu/quantization/matmul_nbits_impl.cc | 7 ++ .../core/framework/tensorprotoutils.cc | 107 ++++++++++++++++-- onnxruntime/core/framework/tensorprotoutils.h | 41 +++++++ onnxruntime/core/graph/graph.cc | 15 +++ onnxruntime/core/mlas/lib/platform.cpp | 18 ++- .../mlas/lib/power/qgemm_kernel_power10.cpp | 8 ++ onnxruntime/core/mlas/lib/qgemm.h | 2 +- onnxruntime/core/mlas/lib/qlmul.cpp | 12 +- .../core/optimizer/attention_fusion.cc | 4 +- .../compute_optimizer/shared_utils.cc | 2 +- .../core/optimizer/constant_folding.cc | 3 +- .../core/optimizer/embed_layer_norm_fusion.cc | 6 +- .../core/optimizer/nchwc_transformer.cc | 12 +- .../qdq_transformer/avx2_weight_s8_to_u8.cc | 4 +- .../optimizer/qdq_transformer/qdq_s8_to_u8.cc | 2 +- .../core/optimizer/qdq_transformer/s8_to_u8.h | 4 +- .../selectors_actions/qdq_actions.cc | 6 +- onnxruntime/core/optimizer/reshape_fusion.cc | 2 +- .../core/optimizer/stft_decomposition.cc | 2 +- .../ort_optimizer_api_impl.cc | 2 +- onnxruntime/core/platform/path_lib.h | 2 +- onnxruntime/core/platform/posix/env.cc | 2 + onnxruntime/core/session/inference_session.cc | 4 - .../test/flatbuffers/flatbuffer_utils_test.cc | 8 +- .../test/framework/sparse_kernels_test.cc | 5 +- .../test/framework/tensorutils_test.cc | 25 +++- .../test/framework/test_tensor_loader.cc | 16 +++ onnxruntime/test/onnx/main.cc | 2 + onnxruntime/test/onnx/tensorprotoutils.cc | 18 ++- .../test/optimizer/graph_transform_test.cc | 2 +- .../optimizer/graph_transform_test_builder.cc | 2 +- .../optimizer/graph_transform_test_builder.h | 3 +- .../test/optimizer/initializer_test.cc | 2 +- .../test/optimizer/nchwc_optimizer_test.cc | 3 +- onnxruntime/test/providers/base_tester.cc | 2 +- .../providers/cpu/generator/random_test.cc | 4 +- 45 files changed, 444 insertions(+), 88 deletions(-) diff --git a/cmake/CMakeLists.txt b/cmake/CMakeLists.txt index 5db0466497ad1..5555fa692eae8 100644 --- a/cmake/CMakeLists.txt +++ b/cmake/CMakeLists.txt @@ -1040,7 +1040,7 @@ function(onnxruntime_set_compile_flags target_name) # Enable warning target_compile_options(${target_name} PRIVATE "$<$:SHELL:--compiler-options -Wall>" "$<$>:-Wall>") target_compile_options(${target_name} PRIVATE "$<$>:-Wextra>") - if (CMAKE_CXX_COMPILER_ID STREQUAL "Clang" OR CMAKE_CXX_COMPILER_ID STREQUAL "AppleClang") + if (CMAKE_CXX_COMPILER_ID STREQUAL "Clang" OR CMAKE_CXX_COMPILER_ID STREQUAL "AppleClang" OR CMAKE_CXX_COMPILER_ID STREQUAL "IBMClang") #external/protobuf/src/google/protobuf/arena.h:445:18: error: unused parameter 'p' target_compile_options(${target_name} PRIVATE "-Wno-unused-parameter") endif() diff --git a/cmake/external/onnxruntime_external_deps.cmake b/cmake/external/onnxruntime_external_deps.cmake index 775576a771529..14e6ed515fd6e 100644 --- a/cmake/external/onnxruntime_external_deps.cmake +++ b/cmake/external/onnxruntime_external_deps.cmake @@ -46,6 +46,9 @@ if (onnxruntime_BUILD_UNIT_TESTS) if (CMAKE_SYSTEM_NAME STREQUAL "Emscripten") 
set(gtest_disable_pthreads ON) endif() + if (${CMAKE_SYSTEM_NAME} MATCHES "AIX") + set(gtest_disable_pthreads ON CACHE BOOL "gtest_disable_pthreads" FORCE) + endif() set(INSTALL_GTEST OFF CACHE BOOL "" FORCE) if (IOS OR ANDROID) # on mobile platforms the absl flags class dumps the flag names (assumably for binary size), which breaks passing diff --git a/cmake/onnxruntime.cmake b/cmake/onnxruntime.cmake index aebf9b53f8f05..21ae0947f3788 100644 --- a/cmake/onnxruntime.cmake +++ b/cmake/onnxruntime.cmake @@ -57,6 +57,7 @@ foreach(f ${ONNXRUNTIME_PROVIDER_NAMES}) list(APPEND SYMBOL_FILES "${ONNXRUNTIME_ROOT}/core/providers/${f}/symbols.txt") endforeach() +if(NOT ${CMAKE_SYSTEM_NAME} MATCHES "AIX") add_custom_command(OUTPUT ${SYMBOL_FILE} ${CMAKE_CURRENT_BINARY_DIR}/generated_source.c COMMAND ${Python_EXECUTABLE} "${REPO_ROOT}/tools/ci_build/gen_def.py" --version_file "${ONNXRUNTIME_ROOT}/../VERSION_NUMBER" --src_root "${ONNXRUNTIME_ROOT}" @@ -66,6 +67,7 @@ add_custom_command(OUTPUT ${SYMBOL_FILE} ${CMAKE_CURRENT_BINARY_DIR}/generated_s WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}) add_custom_target(onnxruntime_generate_def ALL DEPENDS ${SYMBOL_FILE} ${CMAKE_CURRENT_BINARY_DIR}/generated_source.c) +endif() if(WIN32) onnxruntime_add_shared_library(onnxruntime ${SYMBOL_FILE} @@ -98,13 +100,21 @@ elseif(onnxruntime_BUILD_APPLE_FRAMEWORK) # Note: The PUBLIC_HEADER and VERSION properties for the 'onnxruntime' target will be set later in this file. ) else() - onnxruntime_add_shared_library(onnxruntime ${CMAKE_CURRENT_BINARY_DIR}/generated_source.c) + if(${CMAKE_SYSTEM_NAME} MATCHES "AIX") + onnxruntime_add_shared_library(onnxruntime ${ONNXRUNTIME_ROOT}/core/session/onnxruntime_c_api.cc) + else() + onnxruntime_add_shared_library(onnxruntime ${CMAKE_CURRENT_BINARY_DIR}/generated_source.c ) + endif() if (onnxruntime_USE_CUDA) set_property(TARGET onnxruntime APPEND_STRING PROPERTY LINK_FLAGS " -Xlinker -rpath=\\$ORIGIN") endif() endif() -add_dependencies(onnxruntime onnxruntime_generate_def ${onnxruntime_EXTERNAL_DEPENDENCIES}) +if(${CMAKE_SYSTEM_NAME} MATCHES "AIX") + add_dependencies(onnxruntime ${onnxruntime_EXTERNAL_DEPENDENCIES}) +else() + add_dependencies(onnxruntime onnxruntime_generate_def ${onnxruntime_EXTERNAL_DEPENDENCIES}) +endif() target_include_directories(onnxruntime PRIVATE ${ONNXRUNTIME_ROOT} PUBLIC "$") @@ -113,7 +123,7 @@ target_compile_definitions(onnxruntime PRIVATE FILE_NAME=\"onnxruntime.dll\") if(UNIX) if (APPLE) set(ONNXRUNTIME_SO_LINK_FLAG " -Xlinker -dead_strip") - else() + elseif(NOT ${CMAKE_SYSTEM_NAME} MATCHES "AIX") set(ONNXRUNTIME_SO_LINK_FLAG " -Xlinker --version-script=${SYMBOL_FILE} -Xlinker --no-undefined -Xlinker --gc-sections -z noexecstack") endif() else() @@ -132,7 +142,7 @@ if (NOT WIN32) else() set_target_properties(onnxruntime PROPERTIES INSTALL_RPATH "@loader_path") endif() - elseif (NOT CMAKE_SYSTEM_NAME STREQUAL "Emscripten") + elseif (NOT CMAKE_SYSTEM_NAME STREQUAL "Emscripten" AND NOT ${CMAKE_SYSTEM_NAME} MATCHES "AIX") set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} -Wl,-rpath='$ORIGIN'") endif() endif() @@ -200,6 +210,10 @@ set(onnxruntime_INTERNAL_LIBRARIES onnxruntime_flatbuffers ) +if (${CMAKE_SYSTEM_NAME} MATCHES "AIX") + list(APPEND onnxruntime_INTERNAL_LIBRARIES iconv) +endif() + if (onnxruntime_USE_EXTENSIONS) list(APPEND onnxruntime_INTERNAL_LIBRARIES onnxruntime_extensions @@ -216,15 +230,22 @@ target_link_libraries(onnxruntime PRIVATE ) set_property(TARGET onnxruntime APPEND_STRING PROPERTY LINK_FLAGS ${ONNXRUNTIME_SO_LINK_FLAG} 
${onnxruntime_DELAYLOAD_FLAGS}) - #See: https://cmake.org/cmake/help/latest/prop_tgt/SOVERSION.html if(NOT APPLE AND NOT WIN32) - set_target_properties(onnxruntime PROPERTIES - PUBLIC_HEADER "${ONNXRUNTIME_PUBLIC_HEADERS}" - LINK_DEPENDS ${SYMBOL_FILE} - VERSION ${ORT_VERSION} - SOVERSION 1 - FOLDER "ONNXRuntime") + if(${CMAKE_SYSTEM_NAME} MATCHES "AIX") + set_target_properties(onnxruntime PROPERTIES + PUBLIC_HEADER "${ONNXRUNTIME_PUBLIC_HEADERS}" + VERSION ${ORT_VERSION} + SOVERSION 1 + FOLDER "ONNXRuntime") + else() + set_target_properties(onnxruntime PROPERTIES + PUBLIC_HEADER "${ONNXRUNTIME_PUBLIC_HEADERS}" + LINK_DEPENDS ${SYMBOL_FILE} + VERSION ${ORT_VERSION} + SOVERSION 1 + FOLDER "ONNXRuntime") + endif() else() # Omit the SOVERSION setting in Windows/macOS/iOS/.. build set_target_properties(onnxruntime PROPERTIES diff --git a/cmake/onnxruntime_framework.cmake b/cmake/onnxruntime_framework.cmake index c9bf2ac5c3dc6..43d16abd8fbae 100644 --- a/cmake/onnxruntime_framework.cmake +++ b/cmake/onnxruntime_framework.cmake @@ -108,7 +108,7 @@ add_dependencies(onnxruntime_framework ${onnxruntime_EXTERNAL_DEPENDENCIES}) # For the shared onnxruntime library, this is set in onnxruntime.cmake through CMAKE_SHARED_LINKER_FLAGS # But our test files don't use the shared library so this must be set for them. # For Win32 it generates an absolute path for shared providers based on the location of the executable/onnxruntime.dll -if (UNIX AND NOT APPLE AND NOT onnxruntime_MINIMAL_BUILD AND NOT CMAKE_SYSTEM_NAME STREQUAL "Emscripten") +if (UNIX AND NOT APPLE AND NOT onnxruntime_MINIMAL_BUILD AND NOT CMAKE_SYSTEM_NAME STREQUAL "Emscripten" AND NOT ${CMAKE_SYSTEM_NAME} MATCHES "AIX") set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -Wl,-rpath='$ORIGIN'") endif() diff --git a/cmake/onnxruntime_mlas.cmake b/cmake/onnxruntime_mlas.cmake index df6553e383620..66f4aea606ef5 100644 --- a/cmake/onnxruntime_mlas.cmake +++ b/cmake/onnxruntime_mlas.cmake @@ -427,12 +427,24 @@ else() ) if(COMPILES_P10) check_cxx_source_compiles(" + #ifdef _AIX + #define POWER_10 0x40000 + #define POWER_10_ANDUP (POWER_10) + #include + #define __power_10_andup() (_system_configuration.implementation & POWER_10_ANDUP) + int main() { + bool HasP10 = (__power_10_andup() && __power_mma_version() == MMA_V31); + return 0; + } + #else #include int main() { unsigned long hwcap2 = getauxval(AT_HWCAP2); bool HasP10 = ((hwcap2 & PPC_FEATURE2_MMA) && (hwcap2 & PPC_FEATURE2_ARCH_3_1)); return 0; - }" + } + } + #endif" HAS_P10_RUNTIME ) if (HAS_P10_RUNTIME) diff --git a/cmake/onnxruntime_providers_cpu.cmake b/cmake/onnxruntime_providers_cpu.cmake index d9fba6f564037..d2afe19f36691 100644 --- a/cmake/onnxruntime_providers_cpu.cmake +++ b/cmake/onnxruntime_providers_cpu.cmake @@ -247,7 +247,9 @@ if (NOT onnxruntime_MINIMAL_BUILD AND NOT onnxruntime_EXTENDED_MINIMAL_BUILD if(APPLE) set_property(TARGET onnxruntime_providers_shared APPEND_STRING PROPERTY LINK_FLAGS "-Xlinker -exported_symbols_list ${ONNXRUNTIME_ROOT}/core/providers/shared/exported_symbols.lst") elseif(UNIX) - set_property(TARGET onnxruntime_providers_shared APPEND_STRING PROPERTY LINK_FLAGS "-Xlinker --version-script=${ONNXRUNTIME_ROOT}/core/providers/shared/version_script.lds -Xlinker --gc-sections") + if(NOT ${CMAKE_SYSTEM_NAME} MATCHES "AIX") + set_property(TARGET onnxruntime_providers_shared APPEND_STRING PROPERTY LINK_FLAGS "-Xlinker --version-script=${ONNXRUNTIME_ROOT}/core/providers/shared/version_script.lds -Xlinker --gc-sections") + endif() elseif(WIN32) set_property(TARGET 
onnxruntime_providers_shared APPEND_STRING PROPERTY LINK_FLAGS "-DEF:${ONNXRUNTIME_ROOT}/core/providers/shared/symbols.def") set(ONNXRUNTIME_PROVIDERS_SHARED onnxruntime_providers_shared) diff --git a/cmake/onnxruntime_unittests.cmake b/cmake/onnxruntime_unittests.cmake index 711a9f77f9094..0159c35d1941b 100644 --- a/cmake/onnxruntime_unittests.cmake +++ b/cmake/onnxruntime_unittests.cmake @@ -1225,6 +1225,9 @@ if (NOT onnxruntime_ENABLE_TRAINING_TORCH_INTEROP) if (CMAKE_SYSTEM_NAME STREQUAL "Android") list(APPEND onnxruntime_perf_test_libs ${android_shared_libs}) endif() + if (${CMAKE_SYSTEM_NAME} MATCHES "AIX") + list(APPEND onnxruntime_perf_test_libs onnxruntime_graph onnxruntime_session onnxruntime_providers onnxruntime_framework onnxruntime_util onnxruntime_mlas onnxruntime_optimizer onnxruntime_flatbuffers iconv re2 gtest absl_failure_signal_handler absl_examine_stack absl_flags_parse absl_flags_usage absl_flags_usage_internal) + endif() target_link_libraries(onnxruntime_perf_test PRIVATE ${onnxruntime_perf_test_libs} Threads::Threads) if(WIN32) target_link_libraries(onnxruntime_perf_test PRIVATE debug dbghelp advapi32) @@ -1275,6 +1278,10 @@ if (NOT onnxruntime_ENABLE_TRAINING_TORCH_INTEROP) list(APPEND onnxruntime_shared_lib_test_LIBS ${android_shared_libs}) endif() + if (${CMAKE_SYSTEM_NAME} MATCHES "AIX") + list(APPEND onnxruntime_shared_lib_test_LIBS onnxruntime_graph onnxruntime_session onnxruntime_providers onnxruntime_framework onnxruntime_util onnxruntime_mlas onnxruntime_optimizer onnxruntime_flatbuffers iconv re2) + endif() + AddTest(DYN TARGET onnxruntime_shared_lib_test SOURCES ${onnxruntime_shared_lib_test_SRC} ${onnxruntime_unittest_main_src} @@ -1510,7 +1517,7 @@ if (NOT CMAKE_SYSTEM_NAME STREQUAL "Emscripten") if(UNIX) if (APPLE) set(ONNXRUNTIME_CUSTOM_OP_LIB_LINK_FLAG "-Xlinker -dead_strip") - else() + elseif(NOT ${CMAKE_SYSTEM_NAME} MATCHES "AIX") set(ONNXRUNTIME_CUSTOM_OP_LIB_LINK_FLAG "-Xlinker --version-script=${TEST_SRC_DIR}/testdata/custom_op_library/custom_op_library.lds -Xlinker --no-undefined -Xlinker --gc-sections -z noexecstack") endif() else() @@ -1574,6 +1581,9 @@ if (NOT CMAKE_SYSTEM_NAME STREQUAL "Emscripten") if (onnxruntime_USE_TENSORRT) list(APPEND onnxruntime_customopregistration_test_LIBS ${TENSORRT_LIBRARY_INFER}) endif() + if (${CMAKE_SYSTEM_NAME} MATCHES "AIX") + list(APPEND onnxruntime_customopregistration_test_LIBS onnxruntime_graph onnxruntime_session onnxruntime_providers onnxruntime_framework onnxruntime_util onnxruntime_mlas onnxruntime_optimizer onnxruntime_flatbuffers iconv re2 libprotobuf-lite onnx_proto nsync_cpp) + endif() AddTest(DYN TARGET onnxruntime_customopregistration_test SOURCES ${onnxruntime_customopregistration_test_SRC} ${onnxruntime_unittest_main_src} @@ -1608,7 +1618,7 @@ if (NOT CMAKE_SYSTEM_NAME STREQUAL "Emscripten" AND (NOT onnxruntime_MINIMAL_BUI if(UNIX) if (APPLE) set(ONNXRUNTIME_CUSTOM_OP_INVALID_LIB_LINK_FLAG "-Xlinker -dead_strip") - else() + elseif (NOT ${CMAKE_SYSTEM_NAME} MATCHES "AIX") string(CONCAT ONNXRUNTIME_CUSTOM_OP_INVALID_LIB_LINK_FLAG "-Xlinker --version-script=${TEST_SRC_DIR}/testdata/custom_op_invalid_library/custom_op_library.lds " "-Xlinker --no-undefined -Xlinker --gc-sections -z noexecstack") @@ -1639,7 +1649,7 @@ if (NOT CMAKE_SYSTEM_NAME STREQUAL "Emscripten" AND (NOT onnxruntime_MINIMAL_BUI if(UNIX) if (APPLE) set(ONNXRUNTIME_CUSTOM_OP_GET_CONST_INPUT_TEST_LIB_LINK_FLAG "-Xlinker -dead_strip") - else() + elseif(NOT ${CMAKE_SYSTEM_NAME} MATCHES "AIX") string(CONCAT 
ONNXRUNTIME_CUSTOM_OP_GET_CONST_INPUT_TEST_LIB_LINK_FLAG "-Xlinker --version-script=${TEST_SRC_DIR}/testdata/custom_op_get_const_input_test_library/custom_op_lib.lds " "-Xlinker --no-undefined -Xlinker --gc-sections -z noexecstack") @@ -1671,7 +1681,7 @@ if (NOT CMAKE_SYSTEM_NAME STREQUAL "Emscripten" AND (NOT onnxruntime_MINIMAL_BUI if(UNIX) if (APPLE) set(ONNXRUNTIME_CUSTOM_OP_lOCAL_FUNCTION_TEST_LIB_LINK_FLAG "-Xlinker -dead_strip") - else() + elseif(NOT ${CMAKE_SYSTEM_NAME} MATCHES "AIX") string(CONCAT ONNXRUNTIME_CUSTOM_OP_lOCAL_FUNCTION_TEST_LIB_LINK_FLAG "-Xlinker --version-script=${TEST_SRC_DIR}/testdata/custom_op_local_function/custom_op_local_function.lds " "-Xlinker --no-undefined -Xlinker --gc-sections -z noexecstack") @@ -1690,6 +1700,9 @@ if (onnxruntime_BUILD_SHARED_LIB AND NOT CMAKE_SYSTEM_NAME STREQUAL "Emscripten" ${ONNXRUNTIME_LOGGING_APIS_TEST_SRC_DIR}/test_logging_apis.cc) set(onnxruntime_logging_apis_test_LIBS onnxruntime_common onnxruntime_test_utils) + if (${CMAKE_SYSTEM_NAME} MATCHES "AIX") + list(APPEND onnxruntime_logging_apis_test_LIBS onnxruntime_session onnxruntime_util onnxruntime_framework onnxruntime_common onnxruntime_graph onnxruntime_providers onnxruntime_mlas onnxruntime_optimizer onnxruntime_flatbuffers iconv re2 libprotobuf-lite onnx_proto nsync_cpp) + endif() if(NOT WIN32) list(APPEND onnxruntime_logging_apis_test_LIBS nsync::nsync_cpp ${CMAKE_DL_LIBS}) @@ -1753,7 +1766,9 @@ if (NOT onnxruntime_MINIMAL_BUILD AND NOT onnxruntime_EXTENDED_MINIMAL_BUILD if(APPLE) set_property(TARGET test_execution_provider APPEND_STRING PROPERTY LINK_FLAGS "-Xlinker -exported_symbols_list ${REPO_ROOT}/onnxruntime/test/testdata/custom_execution_provider_library/exported_symbols.lst") elseif(UNIX) - set_property(TARGET test_execution_provider APPEND_STRING PROPERTY LINK_FLAGS "-Xlinker --version-script=${REPO_ROOT}/onnxruntime/test/testdata/custom_execution_provider_library/version_script.lds -Xlinker --gc-sections -Xlinker -rpath=\\$ORIGIN") + if (NOT ${CMAKE_SYSTEM_NAME} MATCHES "AIX") + set_property(TARGET test_execution_provider APPEND_STRING PROPERTY LINK_FLAGS "-Xlinker --version-script=${REPO_ROOT}/onnxruntime/test/testdata/custom_execution_provider_library/version_script.lds -Xlinker --gc-sections -Xlinker -rpath=\\$ORIGIN") + endif() elseif(WIN32) set_property(TARGET test_execution_provider APPEND_STRING PROPERTY LINK_FLAGS "-DEF:${REPO_ROOT}/onnxruntime/test/testdata/custom_execution_provider_library/symbols.def") else() diff --git a/cmake/patches/flatbuffers/flatbuffers.patch b/cmake/patches/flatbuffers/flatbuffers.patch index fbe8db37ecb0e..9fb58e301bba8 100644 --- a/cmake/patches/flatbuffers/flatbuffers.patch +++ b/cmake/patches/flatbuffers/flatbuffers.patch @@ -10,3 +10,21 @@ index 3987eac9..5e5462f1 100644 + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${FLATBUFFERS_CXX_FLAGS} -Wno-error=stringop-overflow") endif() message(STATUS "CMAKE_CXX_FLAGS: ${CMAKE_CXX_FLAGS}") +diff --git a/include/flatbuffers/flatbuffers.h b/include/flatbuffers/flatbuffers.h +index bc828a31..3d3effe8 100644 +--- a/include/flatbuffers/flatbuffers.h ++++ b/include/flatbuffers/flatbuffers.h +@@ -213,7 +213,12 @@ inline const char * const *ElementaryTypeNames() { + // We're explicitly defining the signedness since the signedness of integer + // bitfields is otherwise implementation-defined and causes warnings on older + // GCC compilers. 
+-struct TypeCode { ++ ++struct ++#if defined(_AIX) && defined(__clang__) ++__attribute__((packed)) ++#endif ++TypeCode { + // ElementaryType + unsigned short base_type : 4; + // Either vector (in table) or array (in struct) diff --git a/onnxruntime/contrib_ops/cpu/murmur_hash3.cc b/onnxruntime/contrib_ops/cpu/murmur_hash3.cc index ec504d215920f..000c590f32616 100644 --- a/onnxruntime/contrib_ops/cpu/murmur_hash3.cc +++ b/onnxruntime/contrib_ops/cpu/murmur_hash3.cc @@ -8,6 +8,8 @@ /* Modifications Copyright (c) Microsoft. */ #include "contrib_ops/cpu/murmur_hash3.h" +#include +#include // Platform-specific functions and macros @@ -60,11 +62,31 @@ inline uint64_t rotl64(uint64_t x, int8_t r) { // handle aligned reads, do the conversion here FORCE_INLINE uint32_t getblock(const uint32_t* p, int i) { - return p[i]; + if constexpr (onnxruntime::endian::native == onnxruntime::endian::little) { + return p[i]; + } else { + const uint8_t* c = (const uint8_t*)&p[i]; + return (uint32_t)c[0] | + (uint32_t)c[1] << 8 | + (uint32_t)c[2] << 16 | + (uint32_t)c[3] << 24; + } } FORCE_INLINE uint64_t getblock(const uint64_t* p, int i) { - return p[i]; + if constexpr (onnxruntime::endian::native == onnxruntime::endian::little) { + return p[i]; + } else { + const uint8_t* c = (const uint8_t*)&p[i]; + return (uint64_t)c[0] | + (uint64_t)c[1] << 8 | + (uint64_t)c[2] << 16 | + (uint64_t)c[3] << 24 | + (uint64_t)c[4] << 32 | + (uint64_t)c[5] << 40 | + (uint64_t)c[6] << 48 | + (uint64_t)c[7] << 56; + } } //----------------------------------------------------------------------------- @@ -204,13 +226,35 @@ Status MurmurHash3::Compute(OpKernelContext* ctx) const { int input_num_bytes = static_cast(input_element_bytes); ORT_ENFORCE(input_num_bytes % 4 == 0); const auto input_end = input + input_count * input_num_bytes; - while (input != input_end) { - MurmurHash3_x86_32(input, - input_num_bytes, - seed_, - output); - input += input_num_bytes; - ++output; + + if constexpr (onnxruntime::endian::native == onnxruntime::endian::little) { + while (input != input_end) { + MurmurHash3_x86_32(input, + input_num_bytes, + seed_, + output); + input += input_num_bytes; + ++output; + } + } else { + // Big endian platform require byte swapping. 
+ auto raw_data = std::make_unique(input_num_bytes); + char* raw_data_ptr = raw_data.get(); + while (input != input_end) { + memcpy(raw_data_ptr, input, input_num_bytes); + char* start_byte = raw_data_ptr; + char* end_byte = start_byte + input_num_bytes - 1; + for (size_t count = 0; count < static_cast(input_num_bytes / 2); ++count) { + std::swap(*start_byte++, *end_byte--); + } + + MurmurHash3_x86_32(raw_data_ptr, + input_num_bytes, + seed_, + output); + input += input_num_bytes; + ++output; + } } } return Status::OK(); diff --git a/onnxruntime/contrib_ops/cpu/quantization/matmul_nbits_impl.cc b/onnxruntime/contrib_ops/cpu/quantization/matmul_nbits_impl.cc index 7e343d85f4048..b28f3758f89b5 100644 --- a/onnxruntime/contrib_ops/cpu/quantization/matmul_nbits_impl.cc +++ b/onnxruntime/contrib_ops/cpu/quantization/matmul_nbits_impl.cc @@ -40,6 +40,13 @@ void Dequantize4BitsKernelReOrder( } T* output_i = output + out_y * out_cols + out_x; uint32_t quant_value = *(reinterpret_cast(quant_data + element_offset / 2)); + if constexpr (onnxruntime::endian::native == onnxruntime::endian::big) { + const uint8_t* c = (const uint8_t*)(&quant_value); + quant_value = (uint32_t)c[0] | + (uint32_t)c[1] << 8 | + (uint32_t)c[2] << 16 | + (uint32_t)c[3] << 24; + } const int remain_x = std::min(8, out_cols - out_x); const int32_t* reorder_idx_with_off = reorder_idx + kb_idx * block_size + ((threadIdx_x * 8) & (block_size - 1)); for (int i = 0; i < remain_x; i++) { diff --git a/onnxruntime/core/framework/tensorprotoutils.cc b/onnxruntime/core/framework/tensorprotoutils.cc index e8086877a9159..4ecd61962d797 100644 --- a/onnxruntime/core/framework/tensorprotoutils.cc +++ b/onnxruntime/core/framework/tensorprotoutils.cc @@ -6,6 +6,7 @@ #include #include #include +#include #include #if defined(__wasm__) #include @@ -260,7 +261,89 @@ Status TensorProtoToOrtValueImpl(const Env& env, const std::filesystem::path& mo namespace utils { +void SetRawDataInTensorProto(ONNX_NAMESPACE::TensorProto& tensor_proto, std::string&& param) { + tensor_proto.set_raw_data(std::move(param)); +} + +void ConvertRawDataInTensorProto(TensorProto* tensor) { + size_t element_size = 1; + char* bytes = NULL; + size_t num_elements = 0; + switch (tensor->data_type()) { + case TensorProto_DataType_FLOAT: + bytes = reinterpret_cast(tensor->mutable_float_data()->mutable_data()); + num_elements = tensor->float_data_size(); + element_size = sizeof(float); + break; + + case TensorProto_DataType_INT32: + bytes = reinterpret_cast(tensor->mutable_int32_data()->mutable_data()); + num_elements = tensor->int32_data_size(); + element_size = sizeof(int32_t); + break; + + case TensorProto_DataType_UINT32: + bytes = reinterpret_cast(tensor->mutable_int32_data()->mutable_data()); + num_elements = tensor->int32_data_size(); + element_size = sizeof(uint32_t); + break; + + case TensorProto_DataType_UINT8: + case TensorProto_DataType_INT8: + bytes = reinterpret_cast(tensor->mutable_int32_data()->mutable_data()); + num_elements = tensor->int32_data_size(); + element_size = sizeof(uint8_t); + break; + + case TensorProto_DataType_UINT16: + case TensorProto_DataType_INT16: + case TensorProto_DataType_FLOAT16: + case TensorProto_DataType_BFLOAT16: + bytes = reinterpret_cast(tensor->mutable_int32_data()->mutable_data()); + num_elements = tensor->int32_data_size(); + element_size = sizeof(uint16_t); + break; + + case TensorProto_DataType_UINT64: + bytes = reinterpret_cast(tensor->mutable_uint64_data()->mutable_data()); + num_elements = tensor->uint64_data_size(); + 
element_size = sizeof(uint64_t); + break; + + case TensorProto_DataType_DOUBLE: + bytes = reinterpret_cast(tensor->mutable_double_data()->mutable_data()); + num_elements = tensor->double_data_size(); + element_size = sizeof(double); + break; + + case TensorProto_DataType_INT64: + bytes = reinterpret_cast(tensor->mutable_int64_data()->mutable_data()); + num_elements = tensor->int64_data_size(); + element_size = sizeof(int64_t); + break; + + case TensorProto_DataType_COMPLEX64: + bytes = reinterpret_cast(tensor->mutable_float_data()->mutable_data()); + num_elements = tensor->float_data_size(); + element_size = sizeof(float); + break; + } + if (tensor->has_raw_data()) { + num_elements = (tensor->raw_data().size()) / element_size; + bytes = const_cast(tensor->mutable_raw_data()->c_str()); + } + for (size_t i = 0; i < num_elements; ++i) { + char* start_byte = bytes + i * element_size; + char* end_byte = start_byte + element_size - 1; + for (size_t count = 0; count < element_size / 2; ++count) { + std::swap(*start_byte++, *end_byte--); + } + } + return; +} + #if !defined(ORT_MINIMAL_BUILD) + static Status UnpackTensorWithExternalDataImpl(const ONNX_NAMESPACE::TensorProto& tensor, const std::filesystem::path& tensor_proto_dir, size_t expected_num_elements, size_t element_size, @@ -1159,11 +1242,6 @@ ONNXTensorElementDataType GetTensorElementType(const ONNX_NAMESPACE::TensorProto } ONNX_NAMESPACE::TensorProto TensorToTensorProto(const Tensor& tensor, const std::string& tensor_proto_name) { - // Given we are using the raw_data field in the protobuf, this will work only for little-endian format. - if constexpr (endian::native != endian::little) { - ORT_THROW("Big endian not supported"); - } - // Set name, dimensions, type, and data of the TensorProto. ONNX_NAMESPACE::TensorProto tensor_proto; @@ -1182,7 +1260,7 @@ ONNX_NAMESPACE::TensorProto TensorToTensorProto(const Tensor& tensor, const std: *mutable_string_data->Add() = *f; } } else { - tensor_proto.set_raw_data(tensor.DataRaw(), tensor.SizeInBytes()); + utils::SetRawDataInTensorProto(tensor_proto, tensor.DataRaw(), tensor.SizeInBytes()); } return tensor_proto; @@ -1464,8 +1542,7 @@ common::Status SparseTensorProtoToDenseTensorProto(const ONNX_NAMESPACE::SparseT ORT_RETURN_IF_ERROR(status); } - dense.set_raw_data(std::move(dense_data_storage)); - + utils::SetRawDataInTensorProto(dense, std::move(dense_data_storage)); } else { // No request for std::string status = ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "Unsupported sparse tensor data type of ", @@ -1510,7 +1587,17 @@ static void SetIndices(gsl::span gathered_indices, std::string& raw_ind } else { auto* dst = ind_dest + dest_index; T v = static_cast(src_index); - memcpy(dst, &v, sizeof(T)); + if constexpr (endian::native != endian::little) { + auto src = gsl::make_span(static_cast( + reinterpret_cast(&v)), + sizeof(T)); + auto dest = gsl::make_span(static_cast( + reinterpret_cast(dst)), + sizeof(T)); + onnxruntime::utils::SwapByteOrderCopy(sizeof(T), src, dest); + } else { + memcpy(dst, &v, sizeof(T)); + } } ++dest_index; } @@ -1561,7 +1648,7 @@ static void SparsifyGeneric(const void* dense_raw_data, size_t n_dense_elements, } } else { indices.set_data_type(ONNX_NAMESPACE::TensorProto_DataType_INT8); - indices.set_raw_data(std::string()); + utils::SetRawDataInTensorProto(indices, std::string()); } nnz = gathered_indices.size(); } diff --git a/onnxruntime/core/framework/tensorprotoutils.h b/onnxruntime/core/framework/tensorprotoutils.h index a66caf1ace33b..aabfc0487f3e0 100644 --- 
a/onnxruntime/core/framework/tensorprotoutils.h +++ b/onnxruntime/core/framework/tensorprotoutils.h @@ -5,6 +5,7 @@ #include #include +#include #include #ifndef SHARED_PROVIDER @@ -19,6 +20,46 @@ #include "core/graph/onnx_protobuf.h" #include "core/platform/env.h" +namespace onnxruntime { +namespace utils { +/** + * This function is used to convert the endianess of Tensor data. + * Mostly, will be used in big endian system to support the model file + * generated on little endian system. + * @param initializer given initializer tensor + * @returns None + */ +void ConvertRawDataInTensorProto(ONNX_NAMESPACE::TensorProto* initializer); + +/** + * Wrapper function for set_raw_data. + * First calls the set_raw_data and then calls ConvertRawDataInTensorProto + * under big endian system. + * @param tensor_proto given initializer tensor + * @param raw_data source raw_data pointer + * @param raw_data_len length of raw_data + * @returns None + */ +template +void SetRawDataInTensorProto(ONNX_NAMESPACE::TensorProto& tensor_proto, T1* raw_data, T2 raw_data_len) { + using namespace ONNX_NAMESPACE; + tensor_proto.set_raw_data(raw_data, raw_data_len); + if constexpr (endian::native != endian::little) { + utils::ConvertRawDataInTensorProto((ONNX_NAMESPACE::TensorProto*)&tensor_proto); + } +} + +/** + * Overload Wrapper function for set_raw_data handling string object. + * Forward the string object to set_raw_data. + * @param tensor_proto given initializer tensor + * @param param string object reference + * @returns None + */ +void SetRawDataInTensorProto(ONNX_NAMESPACE::TensorProto& tensor_proto, std::string&& param); +} // namespace utils +} // namespace onnxruntime + namespace ONNX_NAMESPACE { class TensorProto; class TensorShapeProto; diff --git a/onnxruntime/core/graph/graph.cc b/onnxruntime/core/graph/graph.cc index f73a50db7aaa4..442a0db933d65 100644 --- a/onnxruntime/core/graph/graph.cc +++ b/onnxruntime/core/graph/graph.cc @@ -1199,6 +1199,15 @@ Graph::Graph(const Model& owning_model, const gsl::not_null tensor{graph_proto_->add_initializer()}; auto status = utils::ConstantNodeProtoToTensorProto(node, model_path, *tensor); + if constexpr (endian::native != endian::little) { + const AttributeProto& attrib = node.attribute(0); + if (attrib.type() == AttributeProto_AttributeType_SPARSE_TENSOR) { + const TensorProto& sparse_values = node.attribute(0).sparse_tensor().values(); + if ((!(sparse_values.has_raw_data())) && tensor->has_raw_data()) { + onnxruntime::utils::ConvertRawDataInTensorProto(tensor); + } + } + } ORT_ENFORCE(status.IsOK(), status.ToString()); // Ensure initializers are also graph inputs. 
if (ir_version_ < 4) { @@ -3716,6 +3725,12 @@ SaveInputsOutputsToOrtFormat(flatbuffers::FlatBufferBuilder& builder, const std: common::Status Graph::SaveToOrtFormat(flatbuffers::FlatBufferBuilder& builder, flatbuffers::Offset& fbs_graph) const { + if constexpr (endian::native != endian::little) { + auto& tens = GetAllInitializedTensors(); + for (auto& [name, tensor_p] : tens) { + utils::ConvertRawDataInTensorProto(const_cast(tensor_p)); + } + } auto inputs = SaveInputsOutputsToOrtFormat(builder, graph_inputs_including_initializers_); auto outputs = SaveInputsOutputsToOrtFormat(builder, graph_outputs_); diff --git a/onnxruntime/core/mlas/lib/platform.cpp b/onnxruntime/core/mlas/lib/platform.cpp index 72eb35c894094..859b7c2f560a4 100644 --- a/onnxruntime/core/mlas/lib/platform.cpp +++ b/onnxruntime/core/mlas/lib/platform.cpp @@ -20,8 +20,15 @@ Module Name: #include #include -#if defined(MLAS_TARGET_POWER) && defined(__linux__) +#if defined(MLAS_TARGET_POWER) +#if defined(__linux__) #include +#elif defined(_AIX) +#define POWER_10 0x40000 +#define POWER_10_ANDUP (POWER_10) +#include +#define __power_10_andup() (_system_configuration.implementation & POWER_10_ANDUP) +#endif #endif #if defined(MLAS_TARGET_ARM64) @@ -554,6 +561,9 @@ Return Value: unsigned long hwcap2 = getauxval(AT_HWCAP2); bool HasP9Instructions = hwcap2 & PPC_FEATURE2_ARCH_3_00; +#elif defined(_AIX) + bool HasP9Instructions = __power_9_andup(); +#endif // __linux__ if (HasP9Instructions) { this->QuantizeLinearS8Kernel = MlasQuantizeLinearS8KernelVSX; this->QuantizeLinearU8Kernel = MlasQuantizeLinearU8KernelVSX; @@ -562,7 +572,11 @@ Return Value: #if defined(POWER10) #if (defined(__GNUC__) && ((__GNUC__ > 10) || (__GNUC__== 10 && __GNUC_MINOR__ >= 2))) || \ (defined(__clang__) && (__clang_major__ >= 12)) +#if defined(__linux__) bool HasP10Instructions = ((hwcap2 & PPC_FEATURE2_MMA) && (hwcap2 & PPC_FEATURE2_ARCH_3_1)); +#elif defined(_AIX) + bool HasP10Instructions = (__power_10_andup() && __power_mma_version() == MMA_V31); +#endif // __linux__ if (HasP10Instructions) { this->GemmFloatKernel = MlasSgemmKernelPOWER10; this->GemmDoubleKernel = MlasDgemmKernelPOWER10; @@ -571,7 +585,6 @@ Return Value: #endif #endif -#endif // __linux__ #endif // MLAS_TARGET_POWER #if defined(MLAS_TARGET_LARCH64) @@ -676,7 +689,6 @@ MlasPlatformU8S8Overflow( } #endif - thread_local size_t ThreadedBufSize = 0; #ifdef _MSC_VER thread_local std::unique_ptr ThreadedBufHolder(nullptr, &_aligned_free); diff --git a/onnxruntime/core/mlas/lib/power/qgemm_kernel_power10.cpp b/onnxruntime/core/mlas/lib/power/qgemm_kernel_power10.cpp index a67be1dbfa710..0f3bc1d579711 100644 --- a/onnxruntime/core/mlas/lib/power/qgemm_kernel_power10.cpp +++ b/onnxruntime/core/mlas/lib/power/qgemm_kernel_power10.cpp @@ -874,10 +874,18 @@ MlasQgemmStoreVectorMMA { size_t RowCount; __vector signed int vsum0, vsum1, vsum2, vsum3; +#if defined(_AIX) && defined(__clang__) + __vector signed int columnsum = *reinterpret_cast(&ColumnSumBuffer[pos]); +#else __vector signed int columnsum = *reinterpret_cast(&ColumnSumBuffer[pos]); +#endif C += VectorCount; if (ZeroPointB != nullptr) { +#if defined(_AIX) && defined(__clang__) + __vector signed int zeropoint = *reinterpret_cast(&ZeroPointB[pos]); +#else __vector signed int zeropoint = *reinterpret_cast(&ZeroPointB[pos]); +#endif if (ZeroMode) { for (RowCount = 0; RowCount + 4 <= row; RowCount += 4, C += ldc*4) { vsum0 = vec_splats(RowSumBuffer[RowCount + 0]) * zeropoint + columnsum; diff --git a/onnxruntime/core/mlas/lib/qgemm.h 
b/onnxruntime/core/mlas/lib/qgemm.h index 75c17a6b5a177..127aea9029b65 100644 --- a/onnxruntime/core/mlas/lib/qgemm.h +++ b/onnxruntime/core/mlas/lib/qgemm.h @@ -894,7 +894,7 @@ MlasGemmQuantGetDispatch( if (!AIsSigned) { GemmQuantDispatch = &MlasGemmU8X8DispatchWasmSimd; } -#elif defined(MLAS_TARGET_POWER) && defined(__linux__) && defined(POWER10) && \ +#elif defined(MLAS_TARGET_POWER) && (defined(__linux__) || defined(_AIX)) && defined(POWER10) && \ ((defined(__GNUC__) && ((__GNUC__ > 10) || (__GNUC__== 10 && __GNUC_MINOR__ >= 2))) || \ (defined(__clang__) && (__clang_major__ >= 12))) if (GetMlasPlatform().GemmU8X8Dispatch == &MlasGemm8X8DispatchPOWER10) { diff --git a/onnxruntime/core/mlas/lib/qlmul.cpp b/onnxruntime/core/mlas/lib/qlmul.cpp index 38818e1190d21..4a6d57db0d211 100644 --- a/onnxruntime/core/mlas/lib/qlmul.cpp +++ b/onnxruntime/core/mlas/lib/qlmul.cpp @@ -325,12 +325,20 @@ MlasQLinearMulKernel( } while (N >= 4) { - __vector int32_t IntegerAVector {InputA[0], InputA[1], InputA[2], InputA[3]}; +#if defined(_AIX) && defined(__clang__) + __vector int IntegerAVector {InputA[0], InputA[1], InputA[2], InputA[3]}; +#else + __vector int32_t IntegerAVector {InputA[0], InputA[1], InputA[2], InputA[3]}; +#endif auto IntegerVector = vec_sub(IntegerAVector, ZeroPointAVector); auto ValueAVector = vec_mul(ScaleAVector, vec_ctf(IntegerVector, 0)); if (!IsScalarB) { - __vector int32_t IntegerBVector {InputB[0], InputB[1], InputB[2], InputB[3]}; +#if defined(_AIX) && defined(__clang__) + __vector int IntegerBVector {InputB[0], InputB[1], InputB[2], InputB[3]}; +#else + __vector int32_t IntegerBVector {InputB[0], InputB[1], InputB[2], InputB[3]}; +#endif IntegerVector = vec_sub(IntegerBVector, ZeroPointBVector); ValueBVector = vec_mul(ScaleBVector, vec_ctf(IntegerVector, 0)); } diff --git a/onnxruntime/core/optimizer/attention_fusion.cc b/onnxruntime/core/optimizer/attention_fusion.cc index b88f2d6a4637e..08066f030a381 100644 --- a/onnxruntime/core/optimizer/attention_fusion.cc +++ b/onnxruntime/core/optimizer/attention_fusion.cc @@ -126,7 +126,7 @@ static NodeArg& MergeQkvWeights(Graph& graph, int64_t hidden_size, } else { MergeWeights(q_weight, k_weight, v_weight, result, hidden_size); } - initializer.set_raw_data(result.data(), gsl::narrow(element_count) * sizeof(float)); + utils::SetRawDataInTensorProto(initializer, result.data(), gsl::narrow(element_count) * sizeof(float)); } else { // data_type == ONNX_NAMESPACE::TensorProto_DataType_FLOAT16 const MLFloat16* q_weight = q_initializer.data(); const MLFloat16* k_weight = k_initializer.data(); @@ -138,7 +138,7 @@ static NodeArg& MergeQkvWeights(Graph& graph, int64_t hidden_size, } else { MergeWeights(q_weight, k_weight, v_weight, result, hidden_size); } - initializer.set_raw_data(result.data(), gsl::narrow(element_count) * sizeof(MLFloat16)); + utils::SetRawDataInTensorProto(initializer, result.data(), gsl::narrow(element_count) * sizeof(MLFloat16)); } return graph_utils::AddInitializer(graph, initializer); diff --git a/onnxruntime/core/optimizer/compute_optimizer/shared_utils.cc b/onnxruntime/core/optimizer/compute_optimizer/shared_utils.cc index 913f3b6811183..86a7a4d6afbf8 100644 --- a/onnxruntime/core/optimizer/compute_optimizer/shared_utils.cc +++ b/onnxruntime/core/optimizer/compute_optimizer/shared_utils.cc @@ -188,7 +188,7 @@ NodeArg* CreateInitializerFromVector(Graph& graph, "The total count of dims does not match the size of values. 
", "total_count: ", total_count, " values.size(): ", values.size()); - const_tensor.set_raw_data(values.data(), values.size() * sizeof(int64_t)); + utils::SetRawDataInTensorProto(const_tensor, values.data(), values.size() * sizeof(int64_t)); return &graph_utils::AddInitializer(graph, const_tensor); } diff --git a/onnxruntime/core/optimizer/constant_folding.cc b/onnxruntime/core/optimizer/constant_folding.cc index 9df300d6f4f88..1466de51d0b99 100644 --- a/onnxruntime/core/optimizer/constant_folding.cc +++ b/onnxruntime/core/optimizer/constant_folding.cc @@ -82,8 +82,7 @@ static bool ConstantFoldShapeNode(Graph& graph, Node& node) { shape_constant.set_name(constant_arg_out->Name()); shape_constant.set_data_type(ONNX_NAMESPACE::TensorProto_DataType_INT64); shape_constant.add_dims(clamped_slice_length); - shape_constant.set_raw_data(dim_values.data() + start, - clamped_slice_length * sizeof(int64_t)); + utils::SetRawDataInTensorProto(shape_constant, dim_values.data() + start, clamped_slice_length * sizeof(int64_t)); ONNX_NAMESPACE::TensorShapeProto result_shape; result_shape.add_dim()->set_dim_value(clamped_slice_length); constant_arg_out->SetShape(result_shape); diff --git a/onnxruntime/core/optimizer/embed_layer_norm_fusion.cc b/onnxruntime/core/optimizer/embed_layer_norm_fusion.cc index 7b6f829b7a0a4..e8e395678436e 100644 --- a/onnxruntime/core/optimizer/embed_layer_norm_fusion.cc +++ b/onnxruntime/core/optimizer/embed_layer_norm_fusion.cc @@ -465,15 +465,13 @@ static NodeArg* ExtractEmbedding(Graph& graph, if (!CheckEmbeddingData(data, batch_size, element_count)) { return nullptr; } - - initializer.set_raw_data(data, gsl::narrow(element_count) * sizeof(float)); + utils::SetRawDataInTensorProto(initializer, data, gsl::narrow(element_count) * sizeof(float)); } else { // data_type == ONNX_NAMESPACE::TensorProto_DataType_FLOAT16 const MLFloat16* data = old_initializer.data(); if (!CheckEmbeddingData(data, batch_size, element_count)) { return nullptr; } - - initializer.set_raw_data(data, gsl::narrow(element_count) * sizeof(MLFloat16)); + utils::SetRawDataInTensorProto(initializer, data, gsl::narrow(element_count) * sizeof(MLFloat16)); } NodeArg& node_arg = graph_utils::AddInitializer(graph, initializer); diff --git a/onnxruntime/core/optimizer/nchwc_transformer.cc b/onnxruntime/core/optimizer/nchwc_transformer.cc index 2b29473f876c3..46f306b92bed5 100644 --- a/onnxruntime/core/optimizer/nchwc_transformer.cc +++ b/onnxruntime/core/optimizer/nchwc_transformer.cc @@ -428,7 +428,8 @@ void NchwcTransformerImpl::TransformConv(Node& node) { nchwc_conv_W_tensor_proto.set_data_type(ONNX_NAMESPACE::TensorProto_DataType_FLOAT); nchwc_conv_W_tensor_proto.set_name(graph_.GenerateNodeArgName("reorder")); - nchwc_conv_W_tensor_proto.set_raw_data(reordered_filter.data(), reordered_filter.size() * sizeof(float)); + utils::SetRawDataInTensorProto(nchwc_conv_W_tensor_proto, reordered_filter.data(), + reordered_filter.size() * sizeof(float)); nchwc_conv_W_tensor_proto.add_dims(nchwc_output_channels); nchwc_conv_W_tensor_proto.add_dims(filter_input_channels); @@ -458,7 +459,8 @@ void NchwcTransformerImpl::TransformConv(Node& node) { nchwc_conv_B_tensor_proto.set_data_type(ONNX_NAMESPACE::TensorProto_DataType_FLOAT); nchwc_conv_B_tensor_proto.set_name(graph_.GenerateNodeArgName("reorder")); - nchwc_conv_B_tensor_proto.set_raw_data(aligned_bias.data(), gsl::narrow(nchwc_output_channels) * sizeof(float)); + utils::SetRawDataInTensorProto(nchwc_conv_B_tensor_proto, aligned_bias.data(), + 
gsl::narrow(nchwc_output_channels) * sizeof(float)); nchwc_conv_B_tensor_proto.add_dims(nchwc_output_channels); @@ -883,7 +885,8 @@ void NchwcTransformerImpl::TransformBatchNormalization(Node& node) { ONNX_NAMESPACE::TensorProto nchwc_conv_W_tensor_proto; nchwc_conv_W_tensor_proto.set_data_type(ONNX_NAMESPACE::TensorProto_DataType_FLOAT); nchwc_conv_W_tensor_proto.set_name(graph_.GenerateNodeArgName("bn_scale")); - nchwc_conv_W_tensor_proto.set_raw_data(padded_buffer.data(), gsl::narrow(nchwc_channels) * sizeof(float)); + utils::SetRawDataInTensorProto(nchwc_conv_W_tensor_proto, padded_buffer.data(), + gsl::narrow(nchwc_channels) * sizeof(float)); nchwc_conv_W_tensor_proto.add_dims(nchwc_channels); nchwc_conv_W_tensor_proto.add_dims(1); nchwc_conv_W_tensor_proto.add_dims(1); @@ -896,7 +899,8 @@ void NchwcTransformerImpl::TransformBatchNormalization(Node& node) { ONNX_NAMESPACE::TensorProto nchwc_conv_B_tensor_proto; nchwc_conv_B_tensor_proto.set_data_type(ONNX_NAMESPACE::TensorProto_DataType_FLOAT); nchwc_conv_B_tensor_proto.set_name(graph_.GenerateNodeArgName("bn_B")); - nchwc_conv_B_tensor_proto.set_raw_data(padded_buffer.data(), gsl::narrow(nchwc_channels) * sizeof(float)); + utils::SetRawDataInTensorProto(nchwc_conv_B_tensor_proto, padded_buffer.data(), + gsl::narrow(nchwc_channels) * sizeof(float)); nchwc_conv_B_tensor_proto.add_dims(nchwc_channels); auto* nchwc_conv_B_arg = &graph_utils::AddInitializer(graph_, nchwc_conv_B_tensor_proto); diff --git a/onnxruntime/core/optimizer/qdq_transformer/avx2_weight_s8_to_u8.cc b/onnxruntime/core/optimizer/qdq_transformer/avx2_weight_s8_to_u8.cc index 6f0f38b1de56e..18e462c04dff3 100644 --- a/onnxruntime/core/optimizer/qdq_transformer/avx2_weight_s8_to_u8.cc +++ b/onnxruntime/core/optimizer/qdq_transformer/avx2_weight_s8_to_u8.cc @@ -129,7 +129,7 @@ static bool TryConvertDynamicQuantizeLSTM(Node& op_node, Graph& graph) { weights_proto_u8.set_data_type(ONNX_NAMESPACE::TensorProto_DataType_UINT8); weights_proto_u8.set_name(weight_tensor_proto->name() + "_s8_2_u8"); weights_proto_u8.mutable_dims()->CopyFrom(weight_tensor_proto->dims()); - weights_proto_u8.set_raw_data(w_temp.data(), static_cast(w_temp.size())); + utils::SetRawDataInTensorProto(weights_proto_u8, w_temp.data(), static_cast(w_temp.size())); input_defs[w_idx] = &graph_utils::AddInitializer(graph, weights_proto_u8); ONNX_NAMESPACE::TensorProto weight_zp_proto_u8; @@ -140,7 +140,7 @@ static bool TryConvertDynamicQuantizeLSTM(Node& op_node, Graph& graph) { r_proto_u8.set_data_type(ONNX_NAMESPACE::TensorProto_DataType_UINT8); r_proto_u8.set_name(r_tensor_proto->name() + "_s8_2_u8"); r_proto_u8.mutable_dims()->CopyFrom(r_tensor_proto->dims()); - r_proto_u8.set_raw_data(r_temp.data(), static_cast(r_temp.size())); + utils::SetRawDataInTensorProto(r_proto_u8, r_temp.data(), static_cast(r_temp.size())); input_defs[r_idx] = &graph_utils::AddInitializer(graph, r_proto_u8); ONNX_NAMESPACE::TensorProto r_zp_proto_u8; diff --git a/onnxruntime/core/optimizer/qdq_transformer/qdq_s8_to_u8.cc b/onnxruntime/core/optimizer/qdq_transformer/qdq_s8_to_u8.cc index 199fbffc9f723..f2033dcbc1b03 100644 --- a/onnxruntime/core/optimizer/qdq_transformer/qdq_s8_to_u8.cc +++ b/onnxruntime/core/optimizer/qdq_transformer/qdq_s8_to_u8.cc @@ -60,7 +60,7 @@ static bool QDQ_S8_to_U8(Graph& graph, Node& q_node, Node& dq_node) { ONNX_NAMESPACE::TensorProto zp_tensor_proto_u8; zp_tensor_proto_u8.set_data_type(ONNX_NAMESPACE::TensorProto_DataType_UINT8); 
zp_tensor_proto_u8.set_name(graph.GenerateNodeArgName("qdq_s8_to_u8_zp_conversion")); - zp_tensor_proto_u8.set_raw_data(&q_zp_value, sizeof(uint8_t)); + utils::SetRawDataInTensorProto(zp_tensor_proto_u8, &q_zp_value, sizeof(uint8_t)); NodeArg* zp_u8_arg = &graph_utils::AddInitializer(graph, zp_tensor_proto_u8); auto q_output_node_arg_name = graph.GenerateNodeArgName("qdq_s8_to_u8_quant"); diff --git a/onnxruntime/core/optimizer/qdq_transformer/s8_to_u8.h b/onnxruntime/core/optimizer/qdq_transformer/s8_to_u8.h index 6caa35ea61ed7..1c1341fe5a127 100644 --- a/onnxruntime/core/optimizer/qdq_transformer/s8_to_u8.h +++ b/onnxruntime/core/optimizer/qdq_transformer/s8_to_u8.h @@ -27,7 +27,7 @@ inline bool Int8TensorProto2Uint8( if (nullptr == src) { uint8_t zero_val = 128; dst.set_name(graph.GenerateNodeArgName("weight_zp_s8_2_u8")); - dst.set_raw_data(&zero_val, sizeof(uint8_t)); + utils::SetRawDataInTensorProto(dst, &zero_val, sizeof(uint8_t)); return true; } @@ -58,7 +58,7 @@ inline bool Int8TensorProto2Uint8( p++; } if (force || should_convert) { - dst.set_raw_data(temp.data(), size_t(temp.size())); + utils::SetRawDataInTensorProto(dst, temp.data(), size_t(temp.size())); return true; } return false; diff --git a/onnxruntime/core/optimizer/qdq_transformer/selectors_actions/qdq_actions.cc b/onnxruntime/core/optimizer/qdq_transformer/selectors_actions/qdq_actions.cc index 3d2a81ce7f8cd..3497ea4c85523 100644 --- a/onnxruntime/core/optimizer/qdq_transformer/selectors_actions/qdq_actions.cc +++ b/onnxruntime/core/optimizer/qdq_transformer/selectors_actions/qdq_actions.cc @@ -5,6 +5,7 @@ #include "core/optimizer/qdq_transformer/qdq_util.h" #include "core/graph/node_attr_utils.h" +#include "core/framework/tensorprotoutils.h" namespace onnxruntime { namespace QDQ { @@ -132,7 +133,7 @@ struct SetOptionalZeroPoint { ONNX_NAMESPACE::TensorProto tensor_proto; tensor_proto.set_name(name); tensor_proto.set_data_type(ONNX_NAMESPACE::TensorProto_DataType_INT8); - tensor_proto.set_raw_data(a.data(), sizeof(int8_t)); + onnxruntime::utils::SetRawDataInTensorProto(tensor_proto, a.data(), sizeof(int8_t)); return tensor_proto; }; @@ -145,8 +146,7 @@ struct SetOptionalZeroPoint { ONNX_NAMESPACE::TensorProto tensor_proto; tensor_proto.set_name(name); tensor_proto.set_data_type(ONNX_NAMESPACE::TensorProto_DataType_UINT8); - tensor_proto.set_raw_data(a.data(), sizeof(uint8_t)); - + onnxruntime::utils::SetRawDataInTensorProto(tensor_proto, a.data(), sizeof(uint8_t)); return tensor_proto; }; static ONNX_NAMESPACE::TensorProto GetOptionalZeroPointInt8() { diff --git a/onnxruntime/core/optimizer/reshape_fusion.cc b/onnxruntime/core/optimizer/reshape_fusion.cc index 7768a835d5042..7f94e18458be2 100644 --- a/onnxruntime/core/optimizer/reshape_fusion.cc +++ b/onnxruntime/core/optimizer/reshape_fusion.cc @@ -435,7 +435,7 @@ bool ReshapeFusion::Fuse_Subgraph(Node& reshape, Graph& graph, const logging::Lo shape_initializer_proto.set_name(shape_def->Name()); shape_initializer_proto.add_dims(static_cast(shape_value.size())); shape_initializer_proto.set_data_type(ONNX_NAMESPACE::TensorProto_DataType_INT64); - shape_initializer_proto.set_raw_data(shape_value.data(), shape_value.size() * sizeof(int64_t)); + utils::SetRawDataInTensorProto(shape_initializer_proto, shape_value.data(), shape_value.size() * sizeof(int64_t)); auto& new_node_arg = graph_utils::AddInitializer(graph, shape_initializer_proto); // Safely remove concat parent nodes which have only one output diff --git a/onnxruntime/core/optimizer/stft_decomposition.cc 
b/onnxruntime/core/optimizer/stft_decomposition.cc index a54904ff15e1e..5c09e5225ab9c 100644 --- a/onnxruntime/core/optimizer/stft_decomposition.cc +++ b/onnxruntime/core/optimizer/stft_decomposition.cc @@ -45,7 +45,7 @@ NodeArg* AddInitializer(Graph& graph, const char* name, const int64_t (&shape)[T element_count *= shape[i]; proto.add_dims(shape[i]); } - proto.set_raw_data(begin, element_count * sizeof(TDataType)); + utils::SetRawDataInTensorProto(proto, begin, element_count * sizeof(TDataType)); return &graph_utils::AddInitializer(graph, proto); } diff --git a/onnxruntime/core/optimizer/transpose_optimization/ort_optimizer_api_impl.cc b/onnxruntime/core/optimizer/transpose_optimization/ort_optimizer_api_impl.cc index 1f7e54cb807ea..f756d01413eae 100644 --- a/onnxruntime/core/optimizer/transpose_optimization/ort_optimizer_api_impl.cc +++ b/onnxruntime/core/optimizer/transpose_optimization/ort_optimizer_api_impl.cc @@ -766,10 +766,10 @@ std::string_view ApiGraph::AddInitializer(api::DataType dtype, const std::vector ONNX_NAMESPACE::TensorProto tensor_proto; tensor_proto.set_data_type(gsl::narrow_cast(dtype)); tensor_proto.set_name(name); - tensor_proto.set_raw_data(data.data(), data.size()); for (int64_t dim : shape) { tensor_proto.add_dims(dim); } + utils::SetRawDataInTensorProto(tensor_proto, data.data(), data.size()); const auto& node_arg = graph_utils::AddInitializer(graph_, tensor_proto); return node_arg.Name(); diff --git a/onnxruntime/core/platform/path_lib.h b/onnxruntime/core/platform/path_lib.h index a9d89f32e91d3..fca8990f14821 100644 --- a/onnxruntime/core/platform/path_lib.h +++ b/onnxruntime/core/platform/path_lib.h @@ -281,7 +281,7 @@ void LoopDir(const std::string& dir_name, T func) { ORT_TRY { struct dirent* dp; while ((dp = readdir(dir)) != nullptr) { - std::basic_string filename = ConcatPathComponent(dir_name, dp->d_name); + std::basic_string filename = ConcatPathComponent(dir_name, dp->d_name); if (stat(filename.c_str(), &stats) != 0) { continue; } diff --git a/onnxruntime/core/platform/posix/env.cc b/onnxruntime/core/platform/posix/env.cc index ec06320438977..04cf5ff6a3329 100644 --- a/onnxruntime/core/platform/posix/env.cc +++ b/onnxruntime/core/platform/posix/env.cc @@ -26,7 +26,9 @@ limitations under the License. #include #include #include +#if !defined(_AIX) #include +#endif #include #include diff --git a/onnxruntime/core/session/inference_session.cc b/onnxruntime/core/session/inference_session.cc index 3ef6490a56ded..f0eed91d70440 100644 --- a/onnxruntime/core/session/inference_session.cc +++ b/onnxruntime/core/session/inference_session.cc @@ -881,8 +881,6 @@ common::Status InferenceSession::RegisterGraphTransformer( } common::Status InferenceSession::SaveToOrtFormat(const std::filesystem::path& filepath) const { - ORT_RETURN_IF_NOT(FLATBUFFERS_LITTLEENDIAN, "ort format only supports little-endian machines"); - // Get the byte size of the ModelProto and round it to the next MB and use it as flatbuffers' init_size // TODO: Investigate whether we should set a max size, and clarify the cost of having a buffer smaller than // what the total flatbuffers serialized size will be. 
@@ -1390,8 +1388,6 @@ Status InferenceSession::LoadOrtModel(const void* model_data, int model_data_len } Status InferenceSession::LoadOrtModelWithLoader(std::function load_ort_format_model_bytes) { - static_assert(FLATBUFFERS_LITTLEENDIAN, "ORT format only supports little-endian machines"); - std::lock_guard l(session_mutex_); if (is_model_loaded_) { // already loaded diff --git a/onnxruntime/test/flatbuffers/flatbuffer_utils_test.cc b/onnxruntime/test/flatbuffers/flatbuffer_utils_test.cc index 32f2da806be3b..467c5e773589a 100644 --- a/onnxruntime/test/flatbuffers/flatbuffer_utils_test.cc +++ b/onnxruntime/test/flatbuffers/flatbuffer_utils_test.cc @@ -12,7 +12,6 @@ #include "core/graph/graph_flatbuffers_utils.h" #include "core/framework/tensorprotoutils.h" #include "core/providers/cpu/cpu_execution_provider.h" - #include "test/flatbuffers/flatbuffers_utils_test.fbs.h" #include "test/util/include/asserts.h" @@ -116,6 +115,10 @@ ONNX_NAMESPACE::TensorProto CreateInitializer(const std::string& name, ORT_THROW("Unsupported data type: ", data_type); } + if constexpr (endian::native != endian::little) { + utils::ConvertRawDataInTensorProto(&tp); + } + return tp; } @@ -258,6 +261,9 @@ TEST(FlatbufferUtilsTest, ExternalWriteReadWithLoadInitializers) { for (const auto* fbs_tensor : *fbs_tensors2) { ONNX_NAMESPACE::TensorProto initializer; ASSERT_STATUS_OK(LoadInitializerOrtFormat(*fbs_tensor, initializer, options, reader)); + if constexpr (endian::native != endian::little) { + utils::ConvertRawDataInTensorProto(&initializer); + } loaded_initializers.emplace_back(std::move(initializer)); // also check that the loaded flatbuffer tensors have accurately written to the external_data_offset field if (fbs_tensor->data_type() != fbs::TensorDataType::STRING && fbs_tensor->name()->str() != "tensor_32_small") { diff --git a/onnxruntime/test/framework/sparse_kernels_test.cc b/onnxruntime/test/framework/sparse_kernels_test.cc index fa42bb6e96cd5..7bd6b47f52b7d 100644 --- a/onnxruntime/test/framework/sparse_kernels_test.cc +++ b/onnxruntime/test/framework/sparse_kernels_test.cc @@ -705,6 +705,9 @@ struct InsertIndices { // Conversion on the fly to the target data type std::vector indices(indices_data.cbegin(), indices_data.cend()); indices_tp.mutable_raw_data()->assign(reinterpret_cast(indices.data()), indices.size() * sizeof(T)); + if constexpr (endian::native != endian::little) { + utils::ConvertRawDataInTensorProto((ONNX_NAMESPACE::TensorProto*)&indices_tp); + } } } }; @@ -837,7 +840,7 @@ static void TestConversion( template static void RawDataWriter(const std::vector& values, TensorProto& tp, TensorProto_DataType datatype) { tp.set_data_type(datatype); - tp.set_raw_data(values.data(), values.size() * sizeof(T)); + utils::SetRawDataInTensorProto(tp, values.data(), values.size() * sizeof(T)); } int64_t ActualSize(const TensorProto& actual) { diff --git a/onnxruntime/test/framework/tensorutils_test.cc b/onnxruntime/test/framework/tensorutils_test.cc index 05bdb3a9a033d..6821f582ce2de 100644 --- a/onnxruntime/test/framework/tensorutils_test.cc +++ b/onnxruntime/test/framework/tensorutils_test.cc @@ -30,7 +30,7 @@ void TestUnpackFloatTensor(TensorProto_DataType type, const std::filesystem::pat for (int i = 0; i < 4; ++i) { memcpy(rawdata + i * sizeof(T), &(f[i]), sizeof(T)); } - float_tensor_proto.set_raw_data(rawdata, len); + utils::SetRawDataInTensorProto(float_tensor_proto, rawdata, len); T float_data2[4]; auto status = UnpackTensor(float_tensor_proto, model_path, float_data2, 4); EXPECT_TRUE(status.IsOK()) 
<< status.ErrorMessage(); @@ -102,8 +102,25 @@ std::vector CreateValues() { return {BFloat16(0.f), BFloat16(1.f), BFloat16(2.f), BFloat16(3.f)}; } +template +void ConvertEndianessForVector(const std::vector& test_data) { + const size_t element_size = sizeof(T); + const size_t num_elements = test_data.size(); + char* bytes = reinterpret_cast(const_cast(test_data.data())); + for (size_t i = 0; i < num_elements; ++i) { + char* start_byte = bytes + i * element_size; + char* end_byte = start_byte + element_size - 1; + for (size_t count = 0; count < element_size / 2; ++count) { + std::swap(*start_byte++, *end_byte--); + } + } +} + template void WriteDataToFile(FILE* fp, const std::vector& test_data) { + if constexpr (endian::native != endian::little) { + ConvertEndianessForVector(test_data); + } size_t size_in_bytes = test_data.size() * sizeof(T); ASSERT_EQ(size_in_bytes, fwrite(test_data.data(), 1, size_in_bytes, fp)); } @@ -147,6 +164,9 @@ void UnpackAndValidate(const TensorProto& tensor_proto, const std::filesystem::p std::vector val(test_data.size()); auto st = utils::UnpackTensor(tensor_proto, model_path, val.data(), test_data.size()); ASSERT_TRUE(st.IsOK()) << st.ErrorMessage(); + if constexpr (endian::native != endian::little) { + ConvertEndianessForVector(val); + } // Validate data for (size_t i = 0; i < test_data.size(); i++) { @@ -325,6 +345,9 @@ static void TestConstantNodeConversionWithExternalData(TensorProto_DataType type std::vector val(test_data.size()); auto st = utils::UnpackTensor(tp, model_path, val.data(), test_data.size()); ASSERT_TRUE(st.IsOK()) << st.ErrorMessage(); + if constexpr (endian::native != endian::little) { + ConvertEndianessForVector(val); + } for (size_t i = 0; i < test_data.size(); i++) { ASSERT_EQ(val[i], test_data[i]); } diff --git a/onnxruntime/test/framework/test_tensor_loader.cc b/onnxruntime/test/framework/test_tensor_loader.cc index 17edad73085c9..73bf351b6c556 100644 --- a/onnxruntime/test/framework/test_tensor_loader.cc +++ b/onnxruntime/test/framework/test_tensor_loader.cc @@ -104,6 +104,18 @@ static void run_external_data_test() { std::unique_ptr file_deleter(const_cast(filename.c_str()), DeleteFileFromDisk); float test_data[] = {1.0f, 2.2f, 3.5f}; + if constexpr (endian::native != endian::little) { + const int element_size = sizeof(float); + char* bytes = reinterpret_cast(test_data); + const size_t num_elements = std::size(test_data); + for (size_t i = 0; i < num_elements; ++i) { + char* start_byte = bytes + i * element_size; + char* end_byte = start_byte + element_size - 1; + for (size_t count = 0; count < element_size / 2; ++count) { + std::swap(*start_byte++, *end_byte--); + } + } + } ASSERT_EQ(sizeof(test_data), fwrite(test_data, 1, sizeof(test_data), fp)); ASSERT_EQ(0, fclose(fp)); // construct a tensor proto @@ -128,8 +140,12 @@ static void run_external_data_test() { len = GetCurrentDirectoryW(len, (ORTCHAR_T*)cwd.data()); ASSERT_NE(len, (DWORD)0); cwd.append(ORT_TSTR("\\fake.onnx")); +#else +#if defined(_AIX) + char* p = getcwd(nullptr, PATH_MAX); #else char* p = getcwd(nullptr, 0); +#endif ASSERT_NE(p, nullptr); cwd = p; free(p); diff --git a/onnxruntime/test/onnx/main.cc b/onnxruntime/test/onnx/main.cc index fc29756a1ff98..9886d98dcc6d6 100644 --- a/onnxruntime/test/onnx/main.cc +++ b/onnxruntime/test/onnx/main.cc @@ -8,6 +8,8 @@ #include #ifdef _WIN32 #include "getopt.h" +#elif defined(_AIX) +#include #else #include #include diff --git a/onnxruntime/test/onnx/tensorprotoutils.cc b/onnxruntime/test/onnx/tensorprotoutils.cc index 
5df055f862a86..50ab2290c6456 100644 --- a/onnxruntime/test/onnx/tensorprotoutils.cc +++ b/onnxruntime/test/onnx/tensorprotoutils.cc @@ -6,6 +6,7 @@ #include #include #include +#include #include "mem_buffer.h" #include "core/common/safeint.h" @@ -68,11 +69,22 @@ static void UnpackTensorWithRawData(const void* raw_data, size_t raw_data_length ORT_CXX_API_THROW(MakeString("UnpackTensor: the pre-allocated size does not match the raw data size, expected ", expected_size_in_bytes, ", got ", raw_data_length), OrtErrorCode::ORT_FAIL); + memcpy(p_data, raw_data, raw_data_length); if constexpr (endian::native != endian::little) { - ORT_CXX_API_THROW("UnpackTensorWithRawData only handles little-endian native byte order for now.", - OrtErrorCode::ORT_NOT_IMPLEMENTED); + /* Convert Endianness */ + char* bytes = reinterpret_cast(p_data); + size_t element_size = sizeof(T); + size_t num_elements = raw_data_length / element_size; + + for (size_t i = 0; i < num_elements; ++i) { + char* start_byte = bytes + i * element_size; + char* end_byte = start_byte + element_size - 1; + /* keep swapping */ + for (size_t count = 0; count < element_size / 2; ++count) { + std::swap(*start_byte++, *end_byte--); + } + } } - memcpy(p_data, raw_data, raw_data_length); } template <> diff --git a/onnxruntime/test/optimizer/graph_transform_test.cc b/onnxruntime/test/optimizer/graph_transform_test.cc index 2bfa57a2ceb9e..3e4e845440117 100755 --- a/onnxruntime/test/optimizer/graph_transform_test.cc +++ b/onnxruntime/test/optimizer/graph_transform_test.cc @@ -4972,8 +4972,8 @@ TEST_F(GraphTransformationTests, CseWithConstantOfShape) { TensorProto value_tensor; value_tensor.add_dims(1); float value = 2.333f; - value_tensor.set_raw_data(reinterpret_cast(&value), sizeof(float)); value_tensor.set_data_type(ONNX_NAMESPACE::TensorProto_DataType_FLOAT); + utils::SetRawDataInTensorProto(value_tensor, reinterpret_cast(&value), sizeof(float)); builder.AddNode("ConstantOfShape", {shape_out_1}, {constant_of_shape_out_1}).AddAttribute("value", value_tensor); builder.AddNode("ConstantOfShape", {shape_out_2}, {constant_of_shape_out_2}).AddAttribute("value", value_tensor); builder.AddNode("Mul", {input_arg, constant_of_shape_out_1}, {mul_out_1}); diff --git a/onnxruntime/test/optimizer/graph_transform_test_builder.cc b/onnxruntime/test/optimizer/graph_transform_test_builder.cc index 73c8b3f119103..2cbfbbb317642 100644 --- a/onnxruntime/test/optimizer/graph_transform_test_builder.cc +++ b/onnxruntime/test/optimizer/graph_transform_test_builder.cc @@ -61,7 +61,7 @@ NodeArg* ModelTestBuilder::MakeInitializer(gsl::span shape, ONNX_NAMESPACE::TensorProto tensor_proto; tensor_proto.set_name(name); tensor_proto.set_data_type(elem_type); - tensor_proto.set_raw_data(raw_data.data(), raw_data.size()); + utils::SetRawDataInTensorProto(tensor_proto, raw_data.data(), raw_data.size()); for (auto& dim : shape) { tensor_proto.add_dims(dim); diff --git a/onnxruntime/test/optimizer/graph_transform_test_builder.h b/onnxruntime/test/optimizer/graph_transform_test_builder.h index 0282d09f340b2..6214094a26c4f 100644 --- a/onnxruntime/test/optimizer/graph_transform_test_builder.h +++ b/onnxruntime/test/optimizer/graph_transform_test_builder.h @@ -13,6 +13,7 @@ #include "core/framework/int4.h" #include "core/optimizer/graph_transformer_level.h" #include "core/graph/onnx_protobuf.h" +#include "core/framework/tensorprotoutils.h" #include "test/framework/test_utils.h" #include "test/common/tensor_op_test_utils.h" #include "test/framework/test_utils.h" @@ -249,7 +250,7 @@ 
class ModelTestBuilder { tensor_proto.set_data_type(utils::ToTensorProtoElementType()); std::unique_ptr data_buffer = std::make_unique(data.size()); for (size_t i = 0; i < data.size(); ++i) data_buffer[i] = data[i]; - tensor_proto.set_raw_data(data_buffer.get(), data.size()); + utils::SetRawDataInTensorProto(tensor_proto, data_buffer.get(), data.size()); for (auto& dim : shape) { tensor_proto.add_dims(dim); diff --git a/onnxruntime/test/optimizer/initializer_test.cc b/onnxruntime/test/optimizer/initializer_test.cc index 522e96e762d5a..391942acfca35 100644 --- a/onnxruntime/test/optimizer/initializer_test.cc +++ b/onnxruntime/test/optimizer/initializer_test.cc @@ -163,8 +163,8 @@ void TestInitializerRawData() { tensor_proto.set_name("OptimizerInitializerTest_RawData"); tensor_proto.add_dims(3); tensor_proto.add_dims(4); - tensor_proto.set_raw_data(data.data(), data.size() * sizeof(T)); + utils::SetRawDataInTensorProto(tensor_proto, data.data(), data.size() * sizeof(T)); const Initializer init(tensor_proto, std::filesystem::path()); for (size_t idx = 0; idx < data.size(); idx++) { diff --git a/onnxruntime/test/optimizer/nchwc_optimizer_test.cc b/onnxruntime/test/optimizer/nchwc_optimizer_test.cc index 8e4edc9e0abbb..538f60040418c 100644 --- a/onnxruntime/test/optimizer/nchwc_optimizer_test.cc +++ b/onnxruntime/test/optimizer/nchwc_optimizer_test.cc @@ -6,6 +6,7 @@ #include "core/mlas/inc/mlas.h" #include "core/session/environment.h" #include "core/session/inference_session.h" +#include "core/framework/tensorprotoutils.h" #include "test/compare_ortvalue.h" #include "test/test_environment.h" #include "test/framework/test_utils.h" @@ -62,7 +63,7 @@ struct NchwcTestHelper { ONNX_NAMESPACE::TensorProto tensor_proto; tensor_proto.set_name(name); tensor_proto.set_data_type(utils::ToTensorProtoElementType()); - tensor_proto.set_raw_data(data.data(), data.size() * sizeof(T)); + utils::SetRawDataInTensorProto(tensor_proto, data.data(), data.size() * sizeof(T)); for (auto& dim : shape) { tensor_proto.add_dims(dim); diff --git a/onnxruntime/test/providers/base_tester.cc b/onnxruntime/test/providers/base_tester.cc index 1db8616c85daa..01de15e6f8ec8 100644 --- a/onnxruntime/test/providers/base_tester.cc +++ b/onnxruntime/test/providers/base_tester.cc @@ -73,7 +73,7 @@ void BaseTester::AddInitializers(onnxruntime::Graph& graph) { } } else { auto buffer_size = tensor.DataType()->Size() * shape.Size(); - tensor_proto.set_raw_data(tensor.DataRaw(), buffer_size); + utils::SetRawDataInTensorProto(tensor_proto, tensor.DataRaw(), buffer_size); } // 4. 
name diff --git a/onnxruntime/test/providers/cpu/generator/random_test.cc b/onnxruntime/test/providers/cpu/generator/random_test.cc index be049d1cf0ce3..ec9b1614488a7 100644 --- a/onnxruntime/test/providers/cpu/generator/random_test.cc +++ b/onnxruntime/test/providers/cpu/generator/random_test.cc @@ -256,7 +256,7 @@ TEST(Random, MultinomialGoodCase) { const std::vector output_dims{batch_size, num_samples}; #ifdef _WIN32 const std::vector expected_output{2, 0, 0, 2, 2, 2, 0, 2, 2, 1, 1, 2, 1, 1, 1, 1, 2, 1, 2, 0}; -#elif defined(__MACH__) || defined(__ANDROID__) || defined(__FreeBSD__) || defined(__wasm__) +#elif defined(__MACH__) || defined(__ANDROID__) || defined(__FreeBSD__) || defined(__wasm__) || defined(_AIX) const std::vector expected_output{1, 1, 2, 2, 0, 2, 2, 2, 0, 2, 1, 1, 2, 0, 2, 2, 0, 2, 1, 1}; #else const std::vector expected_output{2, 0, 0, 1, 0, 1, 2, 0, 1, 0, 0, 1, 1, 0, 1, 0, 2, 0, 2, 0}; @@ -294,7 +294,7 @@ TEST(Random, MultinomialDefaultDType) { #ifdef _WIN32 const std::vector expected_output_1{2, 0, 0, 2, 2, 2, 0, 2, 2, 1, 1, 2, 1, 1, 1, 1, 2, 1, 2, 0}; const std::vector expected_output_2{0, 0, 1, 0, 2, 2, 2, 0, 2, 1, 2, 1, 0, 2, 0, 2, 2, 1, 2, 1}; -#elif defined(__MACH__) || defined(__ANDROID__) || defined(__FreeBSD__) || defined(__wasm__) +#elif defined(__MACH__) || defined(__ANDROID__) || defined(__FreeBSD__) || defined(__wasm__) || defined(_AIX) const std::vector expected_output_1{1, 1, 2, 2, 0, 2, 2, 2, 0, 2, 1, 1, 2, 0, 2, 2, 0, 2, 1, 1}; const std::vector expected_output_2{1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 2, 0, 1, 1, 0, 2, 2, 2, 1}; #else From 1b38c05544ed29bd7de4de79b93ffb4fe741ec56 Mon Sep 17 00:00:00 2001 From: kailums <109063327+kailums@users.noreply.github.com> Date: Thu, 18 Jul 2024 14:50:01 +0800 Subject: [PATCH 16/35] change ci docker image to rocm6.1 (#21296) ### Description There is a kernel bug when running on ROCm 6.0, so change the CI docker image to ROCm 6.1. For the torch package installed in the docker image, install it from the ROCm repo when the ROCm version is not 6.0.
### Motivation and Context --- ...-mi200.huggingface.bert-large-rocm6.1.json | 57 +++++++++++++++++++ .../linux-migraphx-ci-pipeline.yml | 2 +- .../orttraining-pai-ci-pipeline.yml | 2 +- .../pai/rocm-ci-pipeline-env.Dockerfile | 8 +-- 4 files changed, 61 insertions(+), 8 deletions(-) create mode 100644 orttraining/tools/ci_test/results/ci-mi200.huggingface.bert-large-rocm6.1.json diff --git a/orttraining/tools/ci_test/results/ci-mi200.huggingface.bert-large-rocm6.1.json b/orttraining/tools/ci_test/results/ci-mi200.huggingface.bert-large-rocm6.1.json new file mode 100644 index 0000000000000..05fcf08cd3232 --- /dev/null +++ b/orttraining/tools/ci_test/results/ci-mi200.huggingface.bert-large-rocm6.1.json @@ -0,0 +1,57 @@ +{ + "steps": [ + { + "step": 20, + "loss": 2.0136 + }, + { + "step": 40, + "loss": 1.8466 + }, + { + "step": 60, + "loss": 1.7525 + }, + { + "step": 80, + "loss": 1.6682 + }, + { + "step": 100, + "loss": 1.658 + }, + { + "step": 120, + "loss": 1.6749 + }, + { + "step": 140, + "loss": 1.6263 + }, + { + "step": 160, + "loss": 1.6828 + }, + { + "step": 180, + "loss": 1.6145 + }, + { + "step": 200, + "loss": 1.6197 + }, + { + "step": 220, + "loss": 1.6353 + }, + { + "step": 240, + "loss": 1.5266 + }, + { + "step": 260, + "loss": 1.5441 + } + ], + "samples_per_second": 34.561 +} diff --git a/tools/ci_build/github/azure-pipelines/linux-migraphx-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/linux-migraphx-ci-pipeline.yml index f36cd9cfbfca1..6bf6324252fb9 100644 --- a/tools/ci_build/github/azure-pipelines/linux-migraphx-ci-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/linux-migraphx-ci-pipeline.yml @@ -36,7 +36,7 @@ variables: - name: render value: 109 - name: RocmVersion - value: 6.0 + value: 6.1 - name: RocmVersionPatchSuffix value: ".3" diff --git a/tools/ci_build/github/azure-pipelines/orttraining-pai-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/orttraining-pai-ci-pipeline.yml index 001062452644e..0e1afdcc5b8ca 100644 --- a/tools/ci_build/github/azure-pipelines/orttraining-pai-ci-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/orttraining-pai-ci-pipeline.yml @@ -25,7 +25,7 @@ variables: - name: render value: 109 - name: RocmVersion - value: 6.0 + value: 6.1 - name: RocmVersionPatchSuffix value: ".3" - name: BuildConfig diff --git a/tools/ci_build/github/pai/rocm-ci-pipeline-env.Dockerfile b/tools/ci_build/github/pai/rocm-ci-pipeline-env.Dockerfile index b94826ae0e4bc..bf21a65314985 100644 --- a/tools/ci_build/github/pai/rocm-ci-pipeline-env.Dockerfile +++ b/tools/ci_build/github/pai/rocm-ci-pipeline-env.Dockerfile @@ -1,7 +1,7 @@ # Refer to https://github.com/RadeonOpenCompute/ROCm-docker/blob/master/dev/Dockerfile-ubuntu-22.04-complete FROM ubuntu:22.04 -ARG ROCM_VERSION=6.0 +ARG ROCM_VERSION=6.1 ARG AMDGPU_VERSION=${ROCM_VERSION} ARG APT_PREF='Package: *\nPin: release o=repo.radeon.com\nPin-Priority: 600' @@ -77,11 +77,7 @@ RUN ln -sf /usr/lib/x86_64-linux-gnu/libstdc++.so.6 ${CONDA_ENVIRONMENT_PATH}/bi RUN export MAJOR=$(cut -d '.' -f 1 <<< "$ROCM_VERSION") && \ export MINOR=$(cut -d '.' -f 2 <<< "$ROCM_VERSION") && \ export PATCH=$(cut -d '.' 
-f 3 <<< "$ROCM_VERSION") && \ - if (( MAJOR >= 6 )); then \ - pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/rocm${MAJOR}.${MINOR} ; \ - else \ - pip install torch==2.0.1 torchvision==0.15.2 -f https://repo.radeon.com/rocm/manylinux/rocm-rel-${MAJOR}.${MINOR}/ ; \ - fi && \ + pip install torch==2.1.2 torchvision==0.16.1 -f https://repo.radeon.com/rocm/manylinux/rocm-rel-${MAJOR}.${MINOR}/ && \ pip install torch-ort --no-dependencies ##### Install Cupy to decrease CPU utilization From bb76ead96c92b789ce801519b731616baa42b656 Mon Sep 17 00:00:00 2001 From: Yifan Li <109183385+yf711@users.noreply.github.com> Date: Thu, 18 Jul 2024 12:11:52 -0700 Subject: [PATCH 17/35] [TensorRT EP] support TensorRT 10.2-GA (#21395) ### Description * promote trt version to 10.2.0.19 * EP_Perf CI: clean config of legacy TRT<8.6, promote test env to trt10.2-cu118/cu125 * skip two tests as Float8/BF16 are supported by TRT>10.0 but TRT CIs are not hardware-compatible on these: ``` 1: [ FAILED ] 2 tests, listed below: 1: [ FAILED ] IsInfTest.test_isinf_bfloat16 1: [ FAILED ] IsInfTest.test_Float8E4M3FN ``` ### Motivation and Context --- .../tools/tensorrt/perf/build/build_image.py | 6 +- .../test/providers/cpu/tensor/isinf_test.cc | 12 ++- .../azure-pipelines/bigmodels-ci-pipeline.yml | 2 +- .../c-api-noopenmp-packaging-pipelines.yml | 2 +- .../cuda-packaging-pipeline.yml | 4 +- .../linux-gpu-tensorrt-ci-pipeline.yml | 4 +- .../nuget/templates/test_linux.yml | 2 +- .../azure-pipelines/post-merge-jobs.yml | 2 +- .../py-package-test-pipeline.yml | 2 +- .../jobs/py-linux-cuda-package-test-job.yml | 4 +- .../nuget-linux-cuda-packaging-stage.yml | 8 +- .../stages/py-cuda-packaging-stage.yml | 8 +- .../jobs/download_win_gpu_library.yml | 8 +- .../templates/jobs/set-winenv.yml | 14 +-- .../templates/py-linux-gpu.yml | 6 +- .../py-packaging-linux-test-cuda.yml | 6 +- .../py-packaging-selectable-stage.yml | 2 +- .../templates/py-packaging-stage.yml | 12 +-- .../win-gpu-tensorrt-ci-pipeline.yml | 4 +- .../Dockerfile.package_ubi8_cuda_tensorrt10_0 | 2 +- ...rfile.package_ubi8_cuda_tensorrt10_0_torch | 2 +- .../docker/Dockerfile.package_ubuntu_2004_gpu | 2 +- .../Dockerfile.package_ubuntu_2004_gpu_ffmpeg | 2 +- .../Dockerfile.ubuntu_cuda11_6_tensorrt8_4 | 63 ------------- .../Dockerfile.ubuntu_cuda11_8_tensorrt8_5 | 92 ------------------- ..._0 => Dockerfile.ubuntu_cuda11_tensorrt10} | 2 +- ..._0 => Dockerfile.ubuntu_cuda12_tensorrt10} | 24 +++-- .../docker/Dockerfile.ubuntu_tensorrt_bin | 5 +- .../inference/x86_64/python/cuda/Dockerfile | 2 +- .../ci_build/github/windows/setup_env_gpu.bat | 4 +- .../ci_build/github/windows/setup_env_trt.bat | 2 +- 31 files changed, 80 insertions(+), 230 deletions(-) delete mode 100644 tools/ci_build/github/linux/docker/Dockerfile.ubuntu_cuda11_6_tensorrt8_4 delete mode 100644 tools/ci_build/github/linux/docker/Dockerfile.ubuntu_cuda11_8_tensorrt8_5 rename tools/ci_build/github/linux/docker/{Dockerfile.ubuntu_cuda11_8_tensorrt10_0 => Dockerfile.ubuntu_cuda11_tensorrt10} (99%) rename tools/ci_build/github/linux/docker/{Dockerfile.ubuntu_cuda12_4_tensorrt10_0 => Dockerfile.ubuntu_cuda12_tensorrt10} (83%) diff --git a/onnxruntime/python/tools/tensorrt/perf/build/build_image.py b/onnxruntime/python/tools/tensorrt/perf/build/build_image.py index 9ee8f27df5c99..2f335009b59c6 100644 --- a/onnxruntime/python/tools/tensorrt/perf/build/build_image.py +++ b/onnxruntime/python/tools/tensorrt/perf/build/build_image.py @@ -15,12 +15,10 @@ from typing import List, 
Optional TRT_DOCKER_FILES = { - "8.4.cuda_11_6_cudnn_8": "tools/ci_build/github/linux/docker/Dockerfile.ubuntu_cuda11_6_tensorrt8_4", - "8.5.cuda_11_8_cudnn_8": "tools/ci_build/github/linux/docker/Dockerfile.ubuntu_cuda11_8_tensorrt8_5", "8.6.cuda_11_8_cudnn_8": "tools/ci_build/github/linux/docker/Dockerfile.ubuntu_cuda11_8_tensorrt8_6", "8.6.cuda_12_3_cudnn_9": "tools/ci_build/github/linux/docker/Dockerfile.ubuntu_cuda12_3_tensorrt8_6", - "10.0.cuda_11_8_cudnn_8": "tools/ci_build/github/linux/docker/Dockerfile.ubuntu_cuda11_8_tensorrt10_0", - "10.0.cuda_12_4_cudnn_9": "tools/ci_build/github/linux/docker/Dockerfile.ubuntu_cuda12_4_tensorrt10_0", + "10.2.cuda_11_8_cudnn_8": "tools/ci_build/github/linux/docker/Dockerfile.ubuntu_cuda11_tensorrt10", + "10.2.cuda_12_5_cudnn_9": "tools/ci_build/github/linux/docker/Dockerfile.ubuntu_cuda12_tensorrt10", "BIN": "tools/ci_build/github/linux/docker/Dockerfile.ubuntu_tensorrt_bin", } diff --git a/onnxruntime/test/providers/cpu/tensor/isinf_test.cc b/onnxruntime/test/providers/cpu/tensor/isinf_test.cc index bd97306142f18..4fc2e6c7c909b 100644 --- a/onnxruntime/test/providers/cpu/tensor/isinf_test.cc +++ b/onnxruntime/test/providers/cpu/tensor/isinf_test.cc @@ -18,13 +18,17 @@ constexpr double DOUBLE_NINF = -std::numeric_limits::infinity(); constexpr double DOUBLE_NAN = std::numeric_limits::quiet_NaN(); template -void run_is_inf_test(int opset, int64_t detect_positive, int64_t detect_negative, const std::initializer_list& input, const std::initializer_list& output) { +void run_is_inf_test(int opset, int64_t detect_positive, int64_t detect_negative, const std::initializer_list& input, const std::initializer_list& output, bool skip_trt = false) { OpTester test("IsInf", opset); test.AddAttribute("detect_positive", detect_positive); test.AddAttribute("detect_negative", detect_negative); test.AddInput("X", {onnxruntime::narrow(input.size())}, input); test.AddOutput("Y", {onnxruntime::narrow(output.size())}, output); - test.Run(); + if (skip_trt) { + test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider}); + } else { + test.Run(); + } } TEST(IsInfTest, test_isinf_float10) { @@ -124,7 +128,7 @@ TEST(IsInfTest, test_isinf_bfloat16) { std::initializer_list input = {BFloat16{-1.7f}, BFloat16::NaN, BFloat16::Infinity, 3.6_bfp16, BFloat16::NegativeInfinity, BFloat16::Infinity}; std::initializer_list output = {false, false, true, false, true, true}; - run_is_inf_test(20, 1, 1, input, output); + run_is_inf_test(20, 1, 1, input, output, true); // Skip as TRT10 supports BF16 but T4 GPU run on TRT CIs doesn't } TEST(IsInfTest, test_isinf_positive_bfloat16) { @@ -146,7 +150,7 @@ TEST(IsInfTest, test_Float8E4M3FN) { std::initializer_list input = { Float8E4M3FN(-1.0f), Float8E4M3FN(FLOAT_NAN, false), Float8E4M3FN(1.0f), Float8E4M3FN(FLOAT_NINF, false), Float8E4M3FN(FLOAT_NINF, false), Float8E4M3FN(FLOAT_INF, false)}; std::initializer_list output = {false, false, false, false, false, false}; - run_is_inf_test(20, 1, 1, input, output); + run_is_inf_test(20, 1, 1, input, output, true); // Skip as TRT10.1 supports Float8 but T4 GPU run on TRT CIs doesn't } TEST(IsInfTest, test_Float8E4M3FNUZ) { diff --git a/tools/ci_build/github/azure-pipelines/bigmodels-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/bigmodels-ci-pipeline.yml index 41b3c47ba0396..a66828ee5e188 100644 --- a/tools/ci_build/github/azure-pipelines/bigmodels-ci-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/bigmodels-ci-pipeline.yml @@ -43,7 +43,7 @@ variables: - name: 
docker_base_image value: onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda11_x64_almalinux8_gcc11:20240531.1 - name: linux_trt_version - value: 10.0.1.6-1.cuda11.8 + value: 10.2.0.19-1.cuda11.8 - name: Repository value: 'onnxruntimecuda11manylinuxbuild' diff --git a/tools/ci_build/github/azure-pipelines/c-api-noopenmp-packaging-pipelines.yml b/tools/ci_build/github/azure-pipelines/c-api-noopenmp-packaging-pipelines.yml index 8b386dde7d3a7..700326fe9173c 100644 --- a/tools/ci_build/github/azure-pipelines/c-api-noopenmp-packaging-pipelines.yml +++ b/tools/ci_build/github/azure-pipelines/c-api-noopenmp-packaging-pipelines.yml @@ -83,7 +83,7 @@ variables: value: 11.8 - name: win_trt_home - value: $(Agent.TempDirectory)\TensorRT-10.0.1.6.Windows10.x86_64.cuda-11.8 + value: $(Agent.TempDirectory)\TensorRT-10.2.0.19.Windows10.x86_64.cuda-11.8 - name: win_cuda_home value: $(Agent.TempDirectory)\v11.8 diff --git a/tools/ci_build/github/azure-pipelines/cuda-packaging-pipeline.yml b/tools/ci_build/github/azure-pipelines/cuda-packaging-pipeline.yml index daf95af438d2b..9fd13b513e5fd 100644 --- a/tools/ci_build/github/azure-pipelines/cuda-packaging-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/cuda-packaging-pipeline.yml @@ -68,9 +68,9 @@ variables: value: nvidia/cuda:12.2.2-cudnn8-devel-ubi8 - name: win_trt_home ${{ if eq(parameters.CudaVersion, '11.8') }}: - value: $(Agent.TempDirectory)\TensorRT-10.0.1.6.Windows10.x86_64.cuda-11.8 + value: $(Agent.TempDirectory)\TensorRT-10.2.0.19.Windows10.x86_64.cuda-11.8 ${{ if eq(parameters.CudaVersion, '12.2') }}: - value: $(Agent.TempDirectory)\TensorRT-10.0.1.6.Windows10.x86_64.cuda-12.4 + value: $(Agent.TempDirectory)\TensorRT-10.2.0.19.Windows10.x86_64.cuda-12.5 - name: win_cuda_home ${{ if eq(parameters.CudaVersion, '11.8') }}: value: $(Agent.TempDirectory)\v11.8 diff --git a/tools/ci_build/github/azure-pipelines/linux-gpu-tensorrt-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/linux-gpu-tensorrt-ci-pipeline.yml index 5f63339fb0d00..3f9707ff50519 100644 --- a/tools/ci_build/github/azure-pipelines/linux-gpu-tensorrt-ci-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/linux-gpu-tensorrt-ci-pipeline.yml @@ -43,9 +43,9 @@ variables: value: onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda12_x64_ubi8_gcc12:20240610.1 - name: linux_trt_version ${{ if eq(parameters.CudaVersion, '11.8') }}: - value: 10.0.1.6-1.cuda11.8 + value: 10.2.0.19-1.cuda11.8 ${{ if eq(parameters.CudaVersion, '12.2') }}: - value: 10.0.1.6-1.cuda12.4 + value: 10.2.0.19-1.cuda12.5 jobs: - job: Linux_Build diff --git a/tools/ci_build/github/azure-pipelines/nuget/templates/test_linux.yml b/tools/ci_build/github/azure-pipelines/nuget/templates/test_linux.yml index b9a5383836447..56e9c73a10a82 100644 --- a/tools/ci_build/github/azure-pipelines/nuget/templates/test_linux.yml +++ b/tools/ci_build/github/azure-pipelines/nuget/templates/test_linux.yml @@ -61,7 +61,7 @@ stages: ${{ if eq(parameters.CudaVersion, '12.2') }}: DockerBuildArgs: " --build-arg BASEIMAGE=nvidia/cuda:12.2.2-devel-ubuntu20.04 - --build-arg TRT_VERSION=10.0.1.6-1+cuda12.4 + --build-arg TRT_VERSION=10.2.0.19-1+cuda12.5 --build-arg BUILD_UID=$( id -u ) " ${{ else }}: diff --git a/tools/ci_build/github/azure-pipelines/post-merge-jobs.yml b/tools/ci_build/github/azure-pipelines/post-merge-jobs.yml index f3604dba1ac9d..593d45361324e 100644 --- a/tools/ci_build/github/azure-pipelines/post-merge-jobs.yml +++ 
b/tools/ci_build/github/azure-pipelines/post-merge-jobs.yml @@ -226,7 +226,7 @@ stages: BuildConfig: 'RelWithDebInfo' EnvSetupScript: setup_env_trt.bat buildArch: x64 - additionalBuildFlags: --enable_pybind --build_java --build_nodejs --use_cuda --cuda_home="$(Agent.TempDirectory)\v11.8" --enable_cuda_profiling --use_tensorrt --tensorrt_home="$(Agent.TempDirectory)\TensorRT-10.0.1.6.Windows10.x86_64.cuda-11.8" --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=86 + additionalBuildFlags: --enable_pybind --build_java --build_nodejs --use_cuda --cuda_home="$(Agent.TempDirectory)\v11.8" --enable_cuda_profiling --use_tensorrt --tensorrt_home="$(Agent.TempDirectory)\TensorRT-10.2.0.19.Windows10.x86_64.cuda-11.8" --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=86 msbuildPlatform: x64 isX86: false job_name_suffix: x64_RelWithDebInfo diff --git a/tools/ci_build/github/azure-pipelines/py-package-test-pipeline.yml b/tools/ci_build/github/azure-pipelines/py-package-test-pipeline.yml index 63e70fa8e6488..d57a7585f3cff 100644 --- a/tools/ci_build/github/azure-pipelines/py-package-test-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/py-package-test-pipeline.yml @@ -55,7 +55,7 @@ stages: python_wheel_suffix: '_gpu' timeout: 480 docker_base_image: onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda11_x64_almalinux8_gcc11:20240531.1 - trt_version: '10.0.1.6-1.cuda11.8' + trt_version: '10.2.0.19-1.cuda11.8' cuda_version: '11.8' diff --git a/tools/ci_build/github/azure-pipelines/stages/jobs/py-linux-cuda-package-test-job.yml b/tools/ci_build/github/azure-pipelines/stages/jobs/py-linux-cuda-package-test-job.yml index b6943f9e1b77b..7dfafeb67acf8 100644 --- a/tools/ci_build/github/azure-pipelines/stages/jobs/py-linux-cuda-package-test-job.yml +++ b/tools/ci_build/github/azure-pipelines/stages/jobs/py-linux-cuda-package-test-job.yml @@ -49,9 +49,9 @@ jobs: value: onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda12_x64_ubi8_gcc12:20240610.1 - name: linux_trt_version ${{ if eq(parameters.CudaVersion, '11.8') }}: - value: 10.0.1.6-1.cuda11.8 + value: 10.2.0.19-1.cuda11.8 ${{ if eq(parameters.CudaVersion, '12.2') }}: - value: 10.0.1.6-1.cuda12.4 + value: 10.2.0.19-1.cuda12.5 pool: ${{ parameters.machine_pool }} steps: - checkout: self diff --git a/tools/ci_build/github/azure-pipelines/stages/nuget-linux-cuda-packaging-stage.yml b/tools/ci_build/github/azure-pipelines/stages/nuget-linux-cuda-packaging-stage.yml index cca53e36ebab9..2ca5129ac6e5d 100644 --- a/tools/ci_build/github/azure-pipelines/stages/nuget-linux-cuda-packaging-stage.yml +++ b/tools/ci_build/github/azure-pipelines/stages/nuget-linux-cuda-packaging-stage.yml @@ -80,9 +80,9 @@ stages: - name: linux_trt_version ${{ if eq(parameters.CudaVersion, '11.8') }}: - value: 10.0.1.6-1.cuda11.8 + value: 10.2.0.19-1.cuda11.8 ${{ if eq(parameters.CudaVersion, '12.2') }}: - value: 10.0.1.6-1.cuda12.4 + value: 10.2.0.19-1.cuda12.5 steps: - checkout: self clean: true @@ -149,9 +149,9 @@ stages: value: '12' - name: linux_trt_version ${{ if eq(parameters.CudaVersion, '11.8') }}: - value: 10.0.1.6-1.cuda11.8 + value: 10.2.0.19-1.cuda11.8 ${{ if eq(parameters.CudaVersion, '12.2') }}: - value: 10.0.1.6-1.cuda12.4 + value: 10.2.0.19-1.cuda12.5 steps: - checkout: self # due to checkout multiple repos, the root directory is $(Build.SourcesDirectory)/onnxruntime submodules: false diff --git a/tools/ci_build/github/azure-pipelines/stages/py-cuda-packaging-stage.yml 
b/tools/ci_build/github/azure-pipelines/stages/py-cuda-packaging-stage.yml index 01f0337be7714..dcd681bd4b915 100644 --- a/tools/ci_build/github/azure-pipelines/stages/py-cuda-packaging-stage.yml +++ b/tools/ci_build/github/azure-pipelines/stages/py-cuda-packaging-stage.yml @@ -65,9 +65,9 @@ stages: SpecificArtifact: ${{ parameters.SpecificArtifact }} BuildId: ${{ parameters.BuildId }} ${{ if eq(parameters.cuda_version, '11.8') }}: - EP_BUILD_FLAGS: --enable_lto --use_tensorrt --tensorrt_home=$(Agent.TempDirectory)\TensorRT-10.0.1.6.Windows10.x86_64.cuda-11.8 --cuda_home=$(Agent.TempDirectory)\v11.8 --cmake_extra_defines "CMAKE_CUDA_ARCHITECTURES=52;60;61;70;75;80" + EP_BUILD_FLAGS: --enable_lto --use_tensorrt --tensorrt_home=$(Agent.TempDirectory)\TensorRT-10.2.0.19.Windows10.x86_64.cuda-11.8 --cuda_home=$(Agent.TempDirectory)\v11.8 --cmake_extra_defines "CMAKE_CUDA_ARCHITECTURES=52;60;61;70;75;80" ${{ if eq(parameters.cuda_version, '12.2') }}: - EP_BUILD_FLAGS: --enable_lto --use_tensorrt --tensorrt_home=$(Agent.TempDirectory)\TensorRT-10.0.1.6.Windows10.x86_64.cuda-12.4 --cuda_home=$(Agent.TempDirectory)\v12.2 --cmake_extra_defines "CMAKE_CUDA_ARCHITECTURES=52;60;61;70;75;80" + EP_BUILD_FLAGS: --enable_lto --use_tensorrt --tensorrt_home=$(Agent.TempDirectory)\TensorRT-10.2.0.19.Windows10.x86_64.cuda-12.5 --cuda_home=$(Agent.TempDirectory)\v12.2 --cmake_extra_defines "CMAKE_CUDA_ARCHITECTURES=52;60;61;70;75;80" - ${{ if eq(parameters.enable_linux_gpu, true) }}: - template: ../templates/py-linux-gpu.yml @@ -79,7 +79,7 @@ stages: cuda_version: ${{ parameters.cuda_version }} ${{ if eq(parameters.cuda_version, '11.8') }}: docker_base_image: onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda11_x64_almalinux8_gcc11:20240531.1 - trt_version: 10.0.1.6-1.cuda11.8 + trt_version: 10.2.0.19-1.cuda11.8 ${{ if eq(parameters.cuda_version, '12.2') }}: docker_base_image: onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda12_x64_ubi8_gcc12:20240610.1 - trt_version: 10.0.1.6-1.cuda12.4 + trt_version: 10.2.0.19-1.cuda12.5 diff --git a/tools/ci_build/github/azure-pipelines/templates/jobs/download_win_gpu_library.yml b/tools/ci_build/github/azure-pipelines/templates/jobs/download_win_gpu_library.yml index 0dd9ffd5282e7..de29a3de9fded 100644 --- a/tools/ci_build/github/azure-pipelines/templates/jobs/download_win_gpu_library.yml +++ b/tools/ci_build/github/azure-pipelines/templates/jobs/download_win_gpu_library.yml @@ -13,10 +13,10 @@ parameters: - 12.2 - name: TrtVersion type: string - default: '10.0.1.6' + default: '10.2.0.19' values: - 8.6.1.6 - - 10.0.1.6 + - 10.2.0.19 steps: - ${{ if eq(parameters.DownloadCUDA, true) }}: @@ -42,9 +42,9 @@ steps: - powershell: | Write-Host "##vso[task.setvariable variable=trtCudaVersion;]12.0" displayName: Set trtCudaVersion - - ${{ if and(eq(parameters.CudaVersion, '12.2'), eq(parameters.TrtVersion, '10.0.1.6')) }}: + - ${{ if and(eq(parameters.CudaVersion, '12.2'), eq(parameters.TrtVersion, '10.2.0.19')) }}: - powershell: | - Write-Host "##vso[task.setvariable variable=trtCudaVersion;]12.4" + Write-Host "##vso[task.setvariable variable=trtCudaVersion;]12.5" displayName: Set trtCudaVersion - script: | diff --git a/tools/ci_build/github/azure-pipelines/templates/jobs/set-winenv.yml b/tools/ci_build/github/azure-pipelines/templates/jobs/set-winenv.yml index 6c82958fc0b78..63d521f1e7d9a 100644 --- a/tools/ci_build/github/azure-pipelines/templates/jobs/set-winenv.yml +++ 
b/tools/ci_build/github/azure-pipelines/templates/jobs/set-winenv.yml @@ -24,17 +24,11 @@ steps: displayName: 'Download Secondary CUDA SDK v${{ parameters.SecondaryCUDAVersion }}' - ${{ if eq(parameters.DownloadTRT, 'true') }}: - powershell: | - azcopy.exe cp --recursive "https://lotusscus.blob.core.windows.net/models/local/TensorRT-8.6.1.6.Windows10.x86_64.cuda-11.8" $(Agent.TempDirectory) - displayName: 'Download TensorRT-8.6.1.6.Windows10.x86_64.cuda-11.8' + azcopy.exe cp --recursive "https://lotusscus.blob.core.windows.net/models/local/TensorRT-10.2.0.19.Windows10.x86_64.cuda-11.8" $(Agent.TempDirectory) + displayName: 'Download TensorRT-10.2.0.19.Windows10.x86_64.cuda-11.8' - powershell: | - azcopy.exe cp --recursive "https://lotusscus.blob.core.windows.net/models/local/TensorRT-8.6.1.6.Windows10.x86_64.cuda-12.0" $(Agent.TempDirectory) - displayName: 'Download TensorRT-8.6.1.6.Windows10.x86_64.cuda-12.0' - - powershell: | - azcopy.exe cp --recursive "https://lotusscus.blob.core.windows.net/models/local/TensorRT-10.0.1.6.Windows10.x86_64.cuda-11.8" $(Agent.TempDirectory) - displayName: 'Download TensorRT-10.0.1.6.Windows10.x86_64.cuda-11.8' - - powershell: | - azcopy.exe cp --recursive "https://lotusscus.blob.core.windows.net/models/local/TensorRT-10.0.1.6.Windows10.x86_64.cuda-12.4" $(Agent.TempDirectory) - displayName: 'Download TensorRT-10.0.1.6.Windows10.x86_64.cuda-12.4' + azcopy.exe cp --recursive "https://lotusscus.blob.core.windows.net/models/local/TensorRT-10.2.0.19.Windows10.x86_64.cuda-12.5" $(Agent.TempDirectory) + displayName: 'Download TensorRT-10.2.0.19.Windows10.x86_64.cuda-12.5' - task: BatchScript@1 displayName: 'setup env' diff --git a/tools/ci_build/github/azure-pipelines/templates/py-linux-gpu.yml b/tools/ci_build/github/azure-pipelines/templates/py-linux-gpu.yml index 97f95797be1f1..6c66cceb33d5c 100644 --- a/tools/ci_build/github/azure-pipelines/templates/py-linux-gpu.yml +++ b/tools/ci_build/github/azure-pipelines/templates/py-linux-gpu.yml @@ -22,10 +22,10 @@ parameters: - name: trt_version type: string - default: '10.0.1.6-1.cuda11.8' + default: '10.2.0.19-1.cuda11.8' values: - - 10.0.1.6-1.cuda11.8 - - 10.0.1.6-1.cuda12.4 + - 10.2.0.19-1.cuda11.8 + - 10.2.0.19-1.cuda12.5 - name: cuda_version type: string default: '11.8' diff --git a/tools/ci_build/github/azure-pipelines/templates/py-packaging-linux-test-cuda.yml b/tools/ci_build/github/azure-pipelines/templates/py-packaging-linux-test-cuda.yml index 3081624225b12..8eca22c8c123f 100644 --- a/tools/ci_build/github/azure-pipelines/templates/py-packaging-linux-test-cuda.yml +++ b/tools/ci_build/github/azure-pipelines/templates/py-packaging-linux-test-cuda.yml @@ -18,10 +18,10 @@ parameters: - name: trt_version type: string - default: '10.0.1.6-1.cuda11.8' + default: '10.2.0.19-1.cuda11.8' values: - - 10.0.1.6-1.cuda11.8 - - 10.0.1.6-1.cuda12.4 + - 10.2.0.19-1.cuda11.8 + - 10.2.0.19-1.cuda12.5 - name: cuda_version type: string default: '11.8' diff --git a/tools/ci_build/github/azure-pipelines/templates/py-packaging-selectable-stage.yml b/tools/ci_build/github/azure-pipelines/templates/py-packaging-selectable-stage.yml index 3f1c4ef0f8d61..47980955b8798 100644 --- a/tools/ci_build/github/azure-pipelines/templates/py-packaging-selectable-stage.yml +++ b/tools/ci_build/github/azure-pipelines/templates/py-packaging-selectable-stage.yml @@ -381,7 +381,7 @@ stages: variables: CUDA_VERSION: '11.8' buildArch: x64 - EpBuildFlags: --use_tensorrt 
--tensorrt_home="$(Agent.TempDirectory)\TensorRT-10.0.1.6.Windows10.x86_64.cuda-11.8" --cuda_version=$(CUDA_VERSION) --cuda_home="C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v$(CUDA_VERSION)" --cmake_extra_defines "CMAKE_CUDA_ARCHITECTURES=37;50;52;60;61;70;75;80" + EpBuildFlags: --use_tensorrt --tensorrt_home="$(Agent.TempDirectory)\TensorRT-10.2.0.19.Windows10.x86_64.cuda-11.8" --cuda_version=$(CUDA_VERSION) --cuda_home="C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v$(CUDA_VERSION)" --cmake_extra_defines "CMAKE_CUDA_ARCHITECTURES=37;50;52;60;61;70;75;80" EnvSetupScript: setup_env_gpu.bat EP_NAME: gpu VSGenerator: 'Visual Studio 17 2022' diff --git a/tools/ci_build/github/azure-pipelines/templates/py-packaging-stage.yml b/tools/ci_build/github/azure-pipelines/templates/py-packaging-stage.yml index 9e14789f3b234..27f85dc5c1648 100644 --- a/tools/ci_build/github/azure-pipelines/templates/py-packaging-stage.yml +++ b/tools/ci_build/github/azure-pipelines/templates/py-packaging-stage.yml @@ -288,7 +288,7 @@ stages: parameters: MACHINE_POOL: 'onnxruntime-Win2022-GPU-A10' PYTHON_VERSION: '3.8' - EP_BUILD_FLAGS: --use_tensorrt --tensorrt_home="$(Agent.TempDirectory)\TensorRT-10.0.1.6.Windows10.x86_64.cuda-11.8" --cuda_home="$(Agent.TempDirectory)\v11.8" --cmake_extra_defines "CMAKE_CUDA_ARCHITECTURES=52;60;61;70;75;80" + EP_BUILD_FLAGS: --use_tensorrt --tensorrt_home="$(Agent.TempDirectory)\TensorRT-10.2.0.19.Windows10.x86_64.cuda-11.8" --cuda_home="$(Agent.TempDirectory)\v11.8" --cmake_extra_defines "CMAKE_CUDA_ARCHITECTURES=52;60;61;70;75;80" ENV_SETUP_SCRIPT: setup_env_gpu.bat EP_NAME: gpu publish_symbols: ${{ parameters.publish_symbols }} @@ -298,7 +298,7 @@ stages: parameters: MACHINE_POOL: 'onnxruntime-Win2022-GPU-A10' PYTHON_VERSION: '3.9' - EP_BUILD_FLAGS: --use_tensorrt --tensorrt_home="$(Agent.TempDirectory)\TensorRT-10.0.1.6.Windows10.x86_64.cuda-11.8" --cuda_home="$(Agent.TempDirectory)\v11.8" --cmake_extra_defines "CMAKE_CUDA_ARCHITECTURES=52;60;61;70;75;80" + EP_BUILD_FLAGS: --use_tensorrt --tensorrt_home="$(Agent.TempDirectory)\TensorRT-10.2.0.19.Windows10.x86_64.cuda-11.8" --cuda_home="$(Agent.TempDirectory)\v11.8" --cmake_extra_defines "CMAKE_CUDA_ARCHITECTURES=52;60;61;70;75;80" ENV_SETUP_SCRIPT: setup_env_gpu.bat EP_NAME: gpu publish_symbols: ${{ parameters.publish_symbols }} @@ -308,7 +308,7 @@ stages: parameters: MACHINE_POOL: 'onnxruntime-Win2022-GPU-A10' PYTHON_VERSION: '3.10' - EP_BUILD_FLAGS: --use_tensorrt --tensorrt_home="$(Agent.TempDirectory)\TensorRT-10.0.1.6.Windows10.x86_64.cuda-11.8" --cuda_home="$(Agent.TempDirectory)\v11.8" --cmake_extra_defines "CMAKE_CUDA_ARCHITECTURES=52;60;61;70;75;80" + EP_BUILD_FLAGS: --use_tensorrt --tensorrt_home="$(Agent.TempDirectory)\TensorRT-10.2.0.19.Windows10.x86_64.cuda-11.8" --cuda_home="$(Agent.TempDirectory)\v11.8" --cmake_extra_defines "CMAKE_CUDA_ARCHITECTURES=52;60;61;70;75;80" ENV_SETUP_SCRIPT: setup_env_gpu.bat EP_NAME: gpu publish_symbols: ${{ parameters.publish_symbols }} @@ -318,7 +318,7 @@ stages: parameters: MACHINE_POOL: 'onnxruntime-Win2022-GPU-A10' PYTHON_VERSION: '3.11' - EP_BUILD_FLAGS: --use_tensorrt --tensorrt_home="$(Agent.TempDirectory)\TensorRT-10.0.1.6.Windows10.x86_64.cuda-11.8" --cuda_home="$(Agent.TempDirectory)\v11.8" --cmake_extra_defines "CMAKE_CUDA_ARCHITECTURES=52;60;61;70;75;80" + EP_BUILD_FLAGS: --use_tensorrt --tensorrt_home="$(Agent.TempDirectory)\TensorRT-10.2.0.19.Windows10.x86_64.cuda-11.8" --cuda_home="$(Agent.TempDirectory)\v11.8" --cmake_extra_defines 
"CMAKE_CUDA_ARCHITECTURES=52;60;61;70;75;80" ENV_SETUP_SCRIPT: setup_env_gpu.bat EP_NAME: gpu publish_symbols: ${{ parameters.publish_symbols }} @@ -328,7 +328,7 @@ stages: parameters: MACHINE_POOL: 'onnxruntime-Win2022-GPU-A10' PYTHON_VERSION: '3.12' - EP_BUILD_FLAGS: --use_tensorrt --tensorrt_home="$(Agent.TempDirectory)\TensorRT-10.0.1.6.Windows10.x86_64.cuda-11.8" --cuda_home="$(Agent.TempDirectory)\v11.8" --cmake_extra_defines "CMAKE_CUDA_ARCHITECTURES=52;60;61;70;75;80" + EP_BUILD_FLAGS: --use_tensorrt --tensorrt_home="$(Agent.TempDirectory)\TensorRT-10.2.0.19.Windows10.x86_64.cuda-11.8" --cuda_home="$(Agent.TempDirectory)\v11.8" --cmake_extra_defines "CMAKE_CUDA_ARCHITECTURES=52;60;61;70;75;80" ENV_SETUP_SCRIPT: setup_env_gpu.bat EP_NAME: gpu publish_symbols: ${{ parameters.publish_symbols }} @@ -498,7 +498,7 @@ stages: docker_base_image: onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda11_x64_almalinux8_gcc11:20240531.1 extra_build_arg: ${{ parameters.build_py_parameters }} cmake_build_type: ${{ parameters.cmake_build_type }} - trt_version: '10.0.1.6-1.cuda11.8' + trt_version: '10.2.0.19-1.cuda11.8' cuda_version: '11.8' - ${{ if eq(parameters.enable_windows_arm64_qnn, true) }}: diff --git a/tools/ci_build/github/azure-pipelines/win-gpu-tensorrt-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/win-gpu-tensorrt-ci-pipeline.yml index 1af00da01241a..70c0c7d4a04e7 100644 --- a/tools/ci_build/github/azure-pipelines/win-gpu-tensorrt-ci-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/win-gpu-tensorrt-ci-pipeline.yml @@ -55,7 +55,7 @@ jobs: WithCache: True Today: $(TODAY) AdditionalKey: "gpu-tensorrt | RelWithDebInfo" - BuildPyArguments: '--config RelWithDebInfo --parallel --use_binskim_compliant_compile_flags --build_dir $(Build.BinariesDirectory) --skip_submodule_sync --build_shared_lib --update --cmake_generator "Visual Studio 17 2022" --build_wheel --enable_onnx_tests --use_tensorrt --tensorrt_home="$(Agent.TempDirectory)\TensorRT-10.0.1.6.Windows10.x86_64.cuda-11.8" --cuda_home="$(Agent.TempDirectory)\v11.8" --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=86' + BuildPyArguments: '--config RelWithDebInfo --parallel --use_binskim_compliant_compile_flags --build_dir $(Build.BinariesDirectory) --skip_submodule_sync --build_shared_lib --update --cmake_generator "Visual Studio 17 2022" --build_wheel --enable_onnx_tests --use_tensorrt --tensorrt_home="$(Agent.TempDirectory)\TensorRT-10.2.0.19.Windows10.x86_64.cuda-11.8" --cuda_home="$(Agent.TempDirectory)\v11.8" --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=86' MsbuildArguments: $(MsbuildArguments) BuildArch: 'x64' Platform: 'x64' @@ -75,7 +75,7 @@ jobs: del wheel_filename_file python.exe -m pip install -q --upgrade %WHEEL_FILENAME% set PATH=$(Build.BinariesDirectory)\RelWithDebInfo\RelWithDebInfo;%PATH% - python $(Build.SourcesDirectory)\tools\ci_build\build.py --config RelWithDebInfo --use_binskim_compliant_compile_flags --build_dir $(Build.BinariesDirectory) --skip_submodule_sync --build_shared_lib --test --cmake_generator "Visual Studio 17 2022" --build_wheel --enable_onnx_tests --use_tensorrt --tensorrt_home="$(Agent.TempDirectory)\TensorRT-10.0.1.6.Windows10.x86_64.cuda-11.8" --cuda_home="$(Agent.TempDirectory)\v11.8" --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=75 + python $(Build.SourcesDirectory)\tools\ci_build\build.py --config RelWithDebInfo --use_binskim_compliant_compile_flags --build_dir $(Build.BinariesDirectory) --skip_submodule_sync --build_shared_lib --test --cmake_generator "Visual 
Studio 17 2022" --build_wheel --enable_onnx_tests --use_tensorrt --tensorrt_home="$(Agent.TempDirectory)\TensorRT-10.2.0.19.Windows10.x86_64.cuda-11.8" --cuda_home="$(Agent.TempDirectory)\v11.8" --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=75 workingDirectory: '$(Build.BinariesDirectory)\RelWithDebInfo\RelWithDebInfo' displayName: 'Run tests' diff --git a/tools/ci_build/github/linux/docker/Dockerfile.package_ubi8_cuda_tensorrt10_0 b/tools/ci_build/github/linux/docker/Dockerfile.package_ubi8_cuda_tensorrt10_0 index 86c178aae519b..2d3dc05285e3c 100644 --- a/tools/ci_build/github/linux/docker/Dockerfile.package_ubi8_cuda_tensorrt10_0 +++ b/tools/ci_build/github/linux/docker/Dockerfile.package_ubi8_cuda_tensorrt10_0 @@ -6,7 +6,7 @@ # Build base image with required system packages ARG BASEIMAGE=nvidia/cuda:11.8.0-cudnn8-devel-ubi8 -ARG TRT_VERSION=10.0.1.6-1.cuda11.8 +ARG TRT_VERSION=10.2.0.19-1.cuda11.8 FROM $BASEIMAGE AS base ARG TRT_VERSION ENV PATH /opt/python/cp38-cp38/bin:/usr/local/nvidia/bin:/usr/local/cuda/bin:/usr/src/tensorrt/bin:${PATH} diff --git a/tools/ci_build/github/linux/docker/Dockerfile.package_ubi8_cuda_tensorrt10_0_torch b/tools/ci_build/github/linux/docker/Dockerfile.package_ubi8_cuda_tensorrt10_0_torch index 4542d3a3f2e4c..a50788e98ffe0 100644 --- a/tools/ci_build/github/linux/docker/Dockerfile.package_ubi8_cuda_tensorrt10_0_torch +++ b/tools/ci_build/github/linux/docker/Dockerfile.package_ubi8_cuda_tensorrt10_0_torch @@ -6,7 +6,7 @@ # Build base image with required system packages ARG BASEIMAGE=nvidia/cuda:11.8.0-cudnn8-devel-ubi8 -ARG TRT_VERSION=10.0.1.6-1.cuda11.8 +ARG TRT_VERSION=10.2.0.19-1.cuda11.8 FROM $BASEIMAGE AS base ARG TRT_VERSION ENV PATH /opt/python/cp38-cp38/bin:/usr/local/nvidia/bin:/usr/local/cuda/bin:/usr/src/tensorrt/bin:${PATH} diff --git a/tools/ci_build/github/linux/docker/Dockerfile.package_ubuntu_2004_gpu b/tools/ci_build/github/linux/docker/Dockerfile.package_ubuntu_2004_gpu index 5ef56fd885ca7..1aca3e305452d 100644 --- a/tools/ci_build/github/linux/docker/Dockerfile.package_ubuntu_2004_gpu +++ b/tools/ci_build/github/linux/docker/Dockerfile.package_ubuntu_2004_gpu @@ -6,7 +6,7 @@ # Build base image with required system packages ARG BASEIMAGE=nvidia/cuda:11.8.0-cudnn8-devel-ubuntu20.04 -ARG TRT_VERSION=10.0.1.6-1+cuda11.8 +ARG TRT_VERSION=10.2.0.19-1+cuda11.8 ARG LD_LIBRARY_PATH_ARG=/usr/local/lib64:/usr/local/cuda/lib64 FROM $BASEIMAGE AS base ARG TRT_VERSION diff --git a/tools/ci_build/github/linux/docker/Dockerfile.package_ubuntu_2004_gpu_ffmpeg b/tools/ci_build/github/linux/docker/Dockerfile.package_ubuntu_2004_gpu_ffmpeg index 194a22850030c..5697120a48b2b 100644 --- a/tools/ci_build/github/linux/docker/Dockerfile.package_ubuntu_2004_gpu_ffmpeg +++ b/tools/ci_build/github/linux/docker/Dockerfile.package_ubuntu_2004_gpu_ffmpeg @@ -6,7 +6,7 @@ # Build base image with required system packages ARG BASEIMAGE=nvidia/cuda:11.8.0-cudnn8-devel-ubuntu20.04 -ARG TRT_VERSION=10.0.1.6-1+cuda11.8 +ARG TRT_VERSION=10.2.0.19-1+cuda11.8 ARG LD_LIBRARY_PATH_ARG=/usr/local/lib64:/usr/local/cuda/lib64 FROM $BASEIMAGE AS base ARG TRT_VERSION diff --git a/tools/ci_build/github/linux/docker/Dockerfile.ubuntu_cuda11_6_tensorrt8_4 b/tools/ci_build/github/linux/docker/Dockerfile.ubuntu_cuda11_6_tensorrt8_4 deleted file mode 100644 index 8b32425afce1c..0000000000000 --- a/tools/ci_build/github/linux/docker/Dockerfile.ubuntu_cuda11_6_tensorrt8_4 +++ /dev/null @@ -1,63 +0,0 @@ -# -------------------------------------------------------------- -# Copyright (c) 
Microsoft Corporation. All rights reserved. -# Licensed under the MIT License. -# -------------------------------------------------------------- -# Dockerfile to run ONNXRuntime with TensorRT integration - -FROM nvidia/cuda:11.6.1-cudnn8-devel-ubuntu20.04 - - -# ONNX Runtime Variables -ARG ONNXRUNTIME_REPO=https://github.com/Microsoft/onnxruntime -ARG ONNXRUNTIME_BRANCH=main -ARG CMAKE_CUDA_ARCHITECTURES=37;50;52;60;61;70;75;80 - -ENV PATH /usr/local/nvidia/bin:/usr/local/cuda/bin:/usr/src/tensorrt/bin:/code/cmake-3.27.3-linux-x86_64/bin:/opt/miniconda/bin:${PATH} - -ENV DEBIAN_FRONTEND=noninteractive - -RUN apt-get update &&\ - apt-get install -y sudo git bash unattended-upgrades wget -RUN unattended-upgrade - -# Install python3 -RUN apt-get install -y --no-install-recommends \ - python3 \ - python3-pip \ - python3-dev \ - python3-wheel &&\ - cd /usr/local/bin &&\ - ln -s /usr/bin/python3 python &&\ - ln -s /usr/bin/pip3 pip; - -RUN pip install --upgrade pip -RUN pip install setuptools>=68.2.2 - -# Install TensorRT -RUN v="8.4.1-1+cuda11.6" &&\ - apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/7fa2af80.pub &&\ - apt-get update &&\ - sudo apt-get install -y libnvinfer8=${v} libnvonnxparsers8=${v} libnvparsers8=${v} libnvinfer-plugin8=${v} \ - libnvinfer-dev=${v} libnvonnxparsers-dev=${v} libnvparsers-dev=${v} libnvinfer-plugin-dev=${v} \ - python3-libnvinfer=${v} libnvinfer-samples=${v} - -# Compile trtexec -RUN cd /usr/src/tensorrt/samples/trtexec && make - -# Install Valgrind -RUN apt-get install -y valgrind - -ARG BUILD_USER=onnxruntimedev -ARG BUILD_UID=1000 -RUN adduser --gecos 'onnxruntime Build User' --disabled-password $BUILD_USER --uid $BUILD_UID -USER $BUILD_USER -WORKDIR /code -ENV CUDA_MODULE_LOADING "LAZY" - -# Prepare onnxruntime repository & build onnxruntime with TensorRT -RUN git clone --single-branch --branch ${ONNXRUNTIME_BRANCH} --recursive ${ONNXRUNTIME_REPO} onnxruntime &&\ - /bin/sh onnxruntime/dockerfiles/scripts/install_common_deps.sh &&\ - cd onnxruntime &&\ - /bin/sh build.sh --parallel --build_shared_lib --cuda_home /usr/local/cuda --cudnn_home /usr/lib/x86_64-linux-gnu/ --use_tensorrt --tensorrt_home /usr/lib/x86_64-linux-gnu/ --config Release --build_wheel --skip_tests --skip_submodule_sync --cmake_extra_defines '"CMAKE_CUDA_ARCHITECTURES='${CMAKE_CUDA_ARCHITECTURES}'"' &&\ - pip install /code/onnxruntime/build/Linux/Release/dist/*.whl &&\ - cd .. diff --git a/tools/ci_build/github/linux/docker/Dockerfile.ubuntu_cuda11_8_tensorrt8_5 b/tools/ci_build/github/linux/docker/Dockerfile.ubuntu_cuda11_8_tensorrt8_5 deleted file mode 100644 index cfc7023ef8e61..0000000000000 --- a/tools/ci_build/github/linux/docker/Dockerfile.ubuntu_cuda11_8_tensorrt8_5 +++ /dev/null @@ -1,92 +0,0 @@ -# -------------------------------------------------------------- -# Copyright (c) Microsoft Corporation. All rights reserved. -# Licensed under the MIT License. 
-# -------------------------------------------------------------- -# Dockerfile to run ONNXRuntime with TensorRT integration - -# Build base image with required system packages -FROM nvidia/cuda:11.8.0-cudnn8-devel-ubuntu20.04 AS base - -# The local directory into which to build and install CMAKE -ARG ONNXRUNTIME_LOCAL_CODE_DIR=/code - -ENV PATH /usr/local/nvidia/bin:/usr/local/cuda/bin:/usr/src/tensorrt/bin:${ONNXRUNTIME_LOCAL_CODE_DIR}/cmake-3.27.3-linux-x86_64/bin:/opt/miniconda/bin:${PATH} -ENV DEBIAN_FRONTEND=noninteractive - -RUN apt-get update &&\ - apt-get install -y sudo git bash unattended-upgrades wget -RUN unattended-upgrade - -# Install python3 -RUN apt-get install -y --no-install-recommends \ - python3 \ - python3-pip \ - python3-dev \ - python3-wheel &&\ - cd /usr/local/bin &&\ - ln -s /usr/bin/python3 python &&\ - ln -s /usr/bin/pip3 pip; - -RUN pip install --upgrade pip -RUN pip install setuptools>=68.2.2 - -# Install TensorRT -RUN v="8.5.1-1+cuda11.8" &&\ - apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/7fa2af80.pub &&\ - apt-get update &&\ - sudo apt-get install -y libnvinfer8=${v} libnvonnxparsers8=${v} libnvparsers8=${v} libnvinfer-plugin8=${v} \ - libnvinfer-dev=${v} libnvonnxparsers-dev=${v} libnvparsers-dev=${v} libnvinfer-plugin-dev=${v} \ - python3-libnvinfer=${v} libnvinfer-samples=${v} - -# Compile trtexec -RUN cd /usr/src/tensorrt/samples/trtexec && make - -# Install Valgrind -RUN apt-get install -y valgrind - -# Build final image from base. Builds ORT. -FROM base as final -ARG BUILD_USER=onnxruntimedev -ARG BUILD_UID=1000 -RUN adduser --gecos 'onnxruntime Build User' --disabled-password $BUILD_USER --uid $BUILD_UID -USER $BUILD_USER - -# ONNX Runtime arguments - -# URL to the github repo from which to clone ORT. -ARG ONNXRUNTIME_REPO=https://github.com/Microsoft/onnxruntime - -# The local directory into which to clone ORT. -ARG ONNXRUNTIME_LOCAL_CODE_DIR=/code - -# The git branch of ORT to checkout and build. -ARG ONNXRUNTIME_BRANCH=main - -# Optional. The specific commit to pull and build from. If not set, the latest commit is used. -ARG ONNXRUNTIME_COMMIT_ID - -# The supported CUDA architecture -ARG CMAKE_CUDA_ARCHITECTURES=37;50;52;60;61;70;75;80 - -WORKDIR ${ONNXRUNTIME_LOCAL_CODE_DIR} - -# Clone ORT repository with branch -RUN git clone --single-branch --branch ${ONNXRUNTIME_BRANCH} --recursive ${ONNXRUNTIME_REPO} onnxruntime &&\ - /bin/sh onnxruntime/dockerfiles/scripts/install_common_deps.sh - -WORKDIR ${ONNXRUNTIME_LOCAL_CODE_DIR}/onnxruntime - -# Reset to a specific commit if specified by build args. 
-RUN if [ -z "$ONNXRUNTIME_COMMIT_ID" ] ; then echo "Building branch ${ONNXRUNTIME_BRANCH}" ;\ - else echo "Building branch ${ONNXRUNTIME_BRANCH} @ commit ${ONNXRUNTIME_COMMIT_ID}" &&\ - git reset --hard ${ONNXRUNTIME_COMMIT_ID} && git submodule update --recursive ; fi - -# Build ORT -ENV CUDA_MODULE_LOADING "LAZY" -RUN /bin/sh build.sh --parallel --build_shared_lib --cuda_home /usr/local/cuda --cudnn_home /usr/lib/x86_64-linux-gnu/ --use_tensorrt --tensorrt_home /usr/lib/x86_64-linux-gnu/ --config Release --build_wheel --skip_tests --skip_submodule_sync --cmake_extra_defines '"CMAKE_CUDA_ARCHITECTURES='${CMAKE_CUDA_ARCHITECTURES}'"' - -# Switch to root to continue following steps of CI -USER root - -# Intall ORT wheel -RUN pip install ${ONNXRUNTIME_LOCAL_CODE_DIR}/onnxruntime/build/Linux/Release/dist/*.whl \ No newline at end of file diff --git a/tools/ci_build/github/linux/docker/Dockerfile.ubuntu_cuda11_8_tensorrt10_0 b/tools/ci_build/github/linux/docker/Dockerfile.ubuntu_cuda11_tensorrt10 similarity index 99% rename from tools/ci_build/github/linux/docker/Dockerfile.ubuntu_cuda11_8_tensorrt10_0 rename to tools/ci_build/github/linux/docker/Dockerfile.ubuntu_cuda11_tensorrt10 index cd168e1911d95..0bd56a1a5873f 100644 --- a/tools/ci_build/github/linux/docker/Dockerfile.ubuntu_cuda11_8_tensorrt10_0 +++ b/tools/ci_build/github/linux/docker/Dockerfile.ubuntu_cuda11_tensorrt10 @@ -31,7 +31,7 @@ RUN pip install --upgrade pip RUN pip install psutil setuptools>=68.2.2 # Install TensorRT -RUN version="10.0.1.6-1+cuda11.8" &&\ +RUN version="10.2.0.19-1+cuda11.8" &&\ apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/7fa2af80.pub &&\ apt-get update &&\ apt-get install -y \ diff --git a/tools/ci_build/github/linux/docker/Dockerfile.ubuntu_cuda12_4_tensorrt10_0 b/tools/ci_build/github/linux/docker/Dockerfile.ubuntu_cuda12_tensorrt10 similarity index 83% rename from tools/ci_build/github/linux/docker/Dockerfile.ubuntu_cuda12_4_tensorrt10_0 rename to tools/ci_build/github/linux/docker/Dockerfile.ubuntu_cuda12_tensorrt10 index 3e48415118c63..7f66943dd8745 100644 --- a/tools/ci_build/github/linux/docker/Dockerfile.ubuntu_cuda12_4_tensorrt10_0 +++ b/tools/ci_build/github/linux/docker/Dockerfile.ubuntu_cuda12_tensorrt10 @@ -5,7 +5,7 @@ # Dockerfile to run ONNXRuntime with TensorRT integration # Build base image with required system packages -FROM nvidia/cuda:12.4.1-devel-ubuntu20.04 AS base +FROM nvidia/cuda:12.5.1-cudnn-devel-ubuntu20.04 AS base # The local directory into which to build and install CMAKE ARG ONNXRUNTIME_LOCAL_CODE_DIR=/code @@ -30,15 +30,27 @@ RUN apt-get install -y --no-install-recommends \ RUN pip install --upgrade pip RUN pip install setuptools>=68.2.2 psutil -# Install cuDNN v9 -RUN apt-get -y install cudnn9-cuda-12 - # Install TensorRT -RUN version="10.0.1.6-1+cuda12.4" &&\ +RUN version="10.2.0.19-1+cuda12.5" &&\ apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/7fa2af80.pub &&\ apt-get update &&\ apt-get install -y \ - tensorrt=${version} + libnvinfer-dev=${version} \ + libnvinfer-dispatch-dev=${version} \ + libnvinfer-dispatch10=${version} \ + libnvinfer-headers-dev=${version} \ + libnvinfer-headers-plugin-dev=${version} \ + libnvinfer-lean-dev=${version} \ + libnvinfer-lean10=${version} \ + libnvinfer-plugin-dev=${version} \ + libnvinfer-plugin10=${version} \ + libnvinfer-vc-plugin-dev=${version} \ + libnvinfer-vc-plugin10=${version} \ + libnvinfer10=${version} \ + 
libnvonnxparsers-dev=${version} \ + libnvonnxparsers10=${version} \ + tensorrt-dev=${version} \ + libnvinfer-bin=${version} # Compile trtexec if not installed RUN if [ ! -d /usr/src/tensorrt/bin ] || [ ! -f /usr/src/tensorrt/bin/trtexec ]; then \ diff --git a/tools/ci_build/github/linux/docker/Dockerfile.ubuntu_tensorrt_bin b/tools/ci_build/github/linux/docker/Dockerfile.ubuntu_tensorrt_bin index a26bf88fbbdf6..0281c1c8fef25 100644 --- a/tools/ci_build/github/linux/docker/Dockerfile.ubuntu_tensorrt_bin +++ b/tools/ci_build/github/linux/docker/Dockerfile.ubuntu_tensorrt_bin @@ -5,7 +5,7 @@ # Dockerfile to run ONNXRuntime with TensorRT installed from provided binaries # Build base image with required system packages -FROM nvidia/cuda:12.3.1-devel-ubuntu20.04 AS base +FROM nvidia/cuda:12.5.1-cudnn-devel-ubuntu20.04 AS base # The local directory into which to build and install CMAKE ARG ONNXRUNTIME_LOCAL_CODE_DIR=/code @@ -30,9 +30,6 @@ RUN apt-get install -y --no-install-recommends \ RUN pip install --upgrade pip RUN pip install setuptools>=68.2.2 -# Install cuDNN v9 -RUN apt-get -y install cudnn9-cuda-12 - # Install TensorRT # Must provide version numbers used to build the name of the tar file containing TensorRT binaries. # See: https://docs.nvidia.com/deeplearning/tensorrt/install-guide/index.html#installing-tar diff --git a/tools/ci_build/github/linux/docker/inference/x86_64/python/cuda/Dockerfile b/tools/ci_build/github/linux/docker/inference/x86_64/python/cuda/Dockerfile index 3a7f410d3859e..a0020a9827290 100644 --- a/tools/ci_build/github/linux/docker/inference/x86_64/python/cuda/Dockerfile +++ b/tools/ci_build/github/linux/docker/inference/x86_64/python/cuda/Dockerfile @@ -5,7 +5,7 @@ ARG BASEIMAGE=nvidia/cuda:11.8.0-cudnn8-devel-ubi8 FROM $BASEIMAGE -ARG TRT_VERSION=10.0.1.6-1.cuda11.8 +ARG TRT_VERSION=10.2.0.19-1.cuda11.8 #Install TensorRT only if TRT_VERSION is not empty RUN if [ -n "${TRT_VERSION}" ]; then \ diff --git a/tools/ci_build/github/windows/setup_env_gpu.bat b/tools/ci_build/github/windows/setup_env_gpu.bat index b753cdae16b90..6c59866ea925a 100644 --- a/tools/ci_build/github/windows/setup_env_gpu.bat +++ b/tools/ci_build/github/windows/setup_env_gpu.bat @@ -6,10 +6,10 @@ if exist PATH=%AGENT_TEMPDIRECTORY%\v11.8\ ( ) else ( set PATH=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.8\bin;C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.8\extras\CUPTI\lib64;%PATH% ) -set PATH=%AGENT_TEMPDIRECTORY%\TensorRT-10.0.1.6.Windows10.x86_64.cuda-11.8\lib;%PATH% +set PATH=%AGENT_TEMPDIRECTORY%\TensorRT-10.2.0.19.Windows10.x86_64.cuda-11.8\lib;%PATH% @REM The default version is still cuda v11.8, because set cuda v12.2 after it -set PATH=%PATH%;%AGENT_TEMPDIRECTORY%\TensorRT-10.0.1.6.Windows10.x86_64.cuda-12.4\lib +set PATH=%PATH%;%AGENT_TEMPDIRECTORY%\TensorRT-10.2.0.19.Windows10.x86_64.cuda-12.5\lib if exist PATH=%AGENT_TEMPDIRECTORY%\v12.2\ ( set PATH=%PATH%;%AGENT_TEMPDIRECTORY%\v12.2\bin;%AGENT_TEMPDIRECTORY%\v12.2\extras\CUPTI\lib64 ) else ( diff --git a/tools/ci_build/github/windows/setup_env_trt.bat b/tools/ci_build/github/windows/setup_env_trt.bat index 4e43b5999a315..249bb98815897 100644 --- a/tools/ci_build/github/windows/setup_env_trt.bat +++ b/tools/ci_build/github/windows/setup_env_trt.bat @@ -6,6 +6,6 @@ if exist PATH=%AGENT_TEMPDIRECTORY%\v11.8\ ( ) else ( set PATH=%PATH%;C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.8\bin;C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.8\extras\CUPTI\lib64 ) -set 
PATH=%AGENT_TEMPDIRECTORY%\TensorRT-10.0.1.6.Windows10.x86_64.cuda-11.8\lib;%PATH% +set PATH=%AGENT_TEMPDIRECTORY%\TensorRT-10.2.0.19.Windows10.x86_64.cuda-11.8\lib;%PATH% set GRADLE_OPTS=-Dorg.gradle.daemon=false set CUDA_MODULE_LOADING=LAZY \ No newline at end of file From 92f66de702e95d6562d4ced5dba45115b8bca165 Mon Sep 17 00:00:00 2001 From: Frank Dong <123416088+frank-dong-ms@users.noreply.github.com> Date: Thu, 18 Jul 2024 12:12:10 -0700 Subject: [PATCH 18/35] remove llama 70b (#21396) Remove llama 70b model due to security reason. We need add shard code in HF to enable model shardding for llama-70b, these codes are not merged into main branch as HF forks want a more general solution instead of doing shard for specify model. shared code is kept here: https://github.com/frank-dong-ms/transformers/tree/frdong/shard_llama we kept llama-70b related code here for internal use: https://github.com/frank-dong-ms/onnxruntime/tree/frdong/llama_70b --- .../tools/transformers/models/llama/README.md | 26 ------------------- .../models/llama/benchmark_70b_model.sh | 12 --------- .../models/llama/convert_70b_model.sh | 12 --------- .../models/llama/requirements-70b-model.txt | 4 --- 4 files changed, 54 deletions(-) delete mode 100644 onnxruntime/python/tools/transformers/models/llama/benchmark_70b_model.sh delete mode 100644 onnxruntime/python/tools/transformers/models/llama/convert_70b_model.sh delete mode 100644 onnxruntime/python/tools/transformers/models/llama/requirements-70b-model.txt diff --git a/onnxruntime/python/tools/transformers/models/llama/README.md b/onnxruntime/python/tools/transformers/models/llama/README.md index 6fba98c14e792..cd8a8756d681e 100644 --- a/onnxruntime/python/tools/transformers/models/llama/README.md +++ b/onnxruntime/python/tools/transformers/models/llama/README.md @@ -27,8 +27,6 @@ Please note the package versions needed for using LLaMA-2 in the `requirements.t - Note that `torch` with CUDA enabled is not installed automatically. This is because `torch` should be installed with the CUDA version used on your machine. Please visit [the PyTorch website](https://pytorch.org/get-started/locally/) to download the `torch` version that is used with the CUDA version installed on your machine and satisfies the requirement listed in the file. - `requirements-quant.txt` - For running the SmoothQuant algorithm using [Intel's Neural Compressor](https://github.com/intel/neural-compressor) -- `requirements-70b-model.txt` - - For running the LLaMA-2 70B model on multiple GPUs - `requirements.txt` - Package versions needed in each of the above files @@ -221,18 +219,6 @@ $ python3 -m models.llama.convert_to_onnx -m meta-llama/Llama-2-7b-hf --output l $ python3 -m onnxruntime.transformers.models.llama.convert_to_onnx -m meta-llama/Llama-2-7b-hf --output llama2-7b-int4-cpu --precision int4 --quantization_method blockwise --execution_provider cpu --use_gqa ``` -Export LLaMA-2 70B sharded model into 4 partitions -``` -# From source: -# 1. Install necessary packages from requirements-70b-model.txt -$ pip install -r requirements-70b-model.txt - -# 2. Build ONNX Runtime from source with NCCL enabled. Here is a sample command: -$ ./build.sh --config Release --use_cuda --cuda_home /usr/local/cuda-12.2 --cudnn_home /usr/local/cuda-12.2 --build_wheel --cuda_version=12.2 --parallel --skip_tests --enable_nccl --nccl_home /usr/local/cuda-12.2 --use_mpi --mpi_home=/usr/lib/x86_64-linux-gnu/ - -# 3. Shard and export the LLaMA-2 70B model. 
With FP16, you will need at least 140GB of GPU memory to load the model. Therefore, you will need at least 4 40GB A100 GPUs or 2 80GB A100 GPUs to shard the PyTorch model and export each shard to ONNX. Here is an example command: -$ CUDA_VISIBLE_DEVICES=0,1,2,3 bash convert_70b_model.sh 4 -m meta-llama/Llama-2-70b-hf --output llama2-70b-distributed --precision fp16 --execution_provider cuda --use_gqa -``` ## Parity Checking LLaMA-2 @@ -395,18 +381,6 @@ CUDA_VISIBLE_DEVICES=4 python3 -m models.llama.benchmark \ --device cuda ``` -9. ONNX Runtime, FP16, convert_to_onnx, LLaMA-2 70B shard to 4 GPUs -``` -CUDA_VISIBLE_DEVICES=4,5,6,7 bash benchmark_70b_model.sh 4 \ - --benchmark-type ort-convert-to-onnx \ - --ort-model-path ./llama2-70b-dis/rank_{}_Llama-2-70b-hf_decoder_merged_model_fp16.onnx \ - --model-name meta-llama/Llama-2-70b-hf \ - --cache-dir ./model_cache \ - --precision fp16 \ - --device cuda \ - --warmup-runs 5 \ - --num-runs 100 -``` You can profile a variant by adding the `--profile` flag and providing one batch size and sequence length combination. diff --git a/onnxruntime/python/tools/transformers/models/llama/benchmark_70b_model.sh b/onnxruntime/python/tools/transformers/models/llama/benchmark_70b_model.sh deleted file mode 100644 index 38f1916456658..0000000000000 --- a/onnxruntime/python/tools/transformers/models/llama/benchmark_70b_model.sh +++ /dev/null @@ -1,12 +0,0 @@ -#!/bin/bash - -NUM_GPUS=${1:-1} - -MPI="mpirun --allow-run-as-root - -mca btl_openib_warn_no_device_params_found 0 -mca pml ob1 -mca btl ^openib -mca btl_tcp_if_include eth0 - --tag-output --npernode $NUM_GPUS --bind-to numa - -x MIOPEN_FIND_MODE=1" - -CMD="$MPI python benchmark.py ${@:2}" - -$CMD \ No newline at end of file diff --git a/onnxruntime/python/tools/transformers/models/llama/convert_70b_model.sh b/onnxruntime/python/tools/transformers/models/llama/convert_70b_model.sh deleted file mode 100644 index 637d15c10e0c7..0000000000000 --- a/onnxruntime/python/tools/transformers/models/llama/convert_70b_model.sh +++ /dev/null @@ -1,12 +0,0 @@ -#!/bin/bash - -NUM_GPUS=${1:-1} - -MPI="mpirun --allow-run-as-root - -mca btl_openib_warn_no_device_params_found 0 -mca pml ob1 -mca btl ^openib -mca btl_tcp_if_include eth0 - --tag-output --npernode $NUM_GPUS --bind-to numa - -x MIOPEN_FIND_MODE=1" - -CMD="$MPI python convert_to_onnx.py ${@:2}" - -$CMD \ No newline at end of file diff --git a/onnxruntime/python/tools/transformers/models/llama/requirements-70b-model.txt b/onnxruntime/python/tools/transformers/models/llama/requirements-70b-model.txt deleted file mode 100644 index 572cfdb71be4a..0000000000000 --- a/onnxruntime/python/tools/transformers/models/llama/requirements-70b-model.txt +++ /dev/null @@ -1,4 +0,0 @@ --r requirements.txt -git+https://github.com/frankdongms/transformers.git@frdong/shard_llama -mpi4py -psutil \ No newline at end of file From 05fc0c60cadab3652869226d2f2aa0bf7d9eda7d Mon Sep 17 00:00:00 2001 From: Edward Chen <18449977+edgchen1@users.noreply.github.com> Date: Thu, 18 Jul 2024 13:37:29 -0700 Subject: [PATCH 19/35] [MLAS][AArch64] SQNBitGemm CompInt8 - Use 4x2 tiles (#21380) Update SQNBitGemm ARM NEON kernel to compute 4x2 tile of output. Note: Also tried 2x4 and 4x4 tiles but observed the best microbenchmark results with 4x2 tiles. 
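As a point of reference for the diff below, here is a minimal scalar sketch of what a 4x2 output tile means. It is illustrative only (all names are made up for this example) and is not the quantized NEON kernel, which additionally dequantizes the 4-bit B data per block; the point of the wider M tile is that the two unpacked B columns are reused across four rows of A on every inner-loop step.

```
#include <cstddef>

// Scalar reference for computing a 4x2 output tile of C = A * B.
// A is row-major MxK, B is column-major KxN, C is row-major MxN.
// The two B column values loaded per k step are shared by all four A rows,
// which is the reuse that the 4x2 register tiling in the NEON kernel exploits.
void Gemm4x2TileReference(const float* A, const float* B, float* C,
                          size_t K, size_t lda, size_t ldb, size_t ldc,
                          size_t row0, size_t col0) {
  float acc[4][2] = {};
  for (size_t k = 0; k < K; ++k) {
    const float b0 = B[(col0 + 0) * ldb + k];
    const float b1 = B[(col0 + 1) * ldb + k];
    for (size_t r = 0; r < 4; ++r) {
      const float a = A[(row0 + r) * lda + k];
      acc[r][0] += a * b0;
      acc[r][1] += a * b1;
    }
  }
  for (size_t r = 0; r < 4; ++r) {
    C[(row0 + r) * ldc + (col0 + 0)] = acc[r][0];
    C[(row0 + r) * ldc + (col0 + 1)] = acc[r][1];
  }
}
```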
--- .../mlas/lib/sqnbitgemm_kernel_neon_int8.cpp | 402 +++++++++++------- 1 file changed, 244 insertions(+), 158 deletions(-) diff --git a/onnxruntime/core/mlas/lib/sqnbitgemm_kernel_neon_int8.cpp b/onnxruntime/core/mlas/lib/sqnbitgemm_kernel_neon_int8.cpp index db3b9ee656592..ec5cdbc75220a 100644 --- a/onnxruntime/core/mlas/lib/sqnbitgemm_kernel_neon_int8.cpp +++ b/onnxruntime/core/mlas/lib/sqnbitgemm_kernel_neon_int8.cpp @@ -155,7 +155,7 @@ namespace template MLAS_FORCEINLINE void -SQ4BitGemm_CompInt8_Compute2x2_BlkLen16( +SQ4BitGemm_CompInt8_Compute4x2_BlkLen16( const std::byte* QuantARowPtr, const std::byte* QuantBDataColPtr, const float* QuantBScaleColPtr, @@ -177,11 +177,13 @@ SQ4BitGemm_CompInt8_Compute2x2_BlkLen16( const float* QuantBScalePtr = QuantBScaleColPtr; const std::byte* QuantBZeroPointPtr = QuantBZeroPointColPtr; - float32x4_t acc00{}, acc01{}, acc10{}, acc11{}; + float32x4_t acc00{}, acc01{}, acc10{}, acc11{}, acc20{}, acc21{}, acc30{}, acc31{}; for (size_t k_blk_idx = 0; k_blk_idx < BlockCountK; ++k_blk_idx) { const std::byte* QuantABlkRow0 = QuantAPtr; const std::byte* QuantABlkRow1 = QuantAPtr + StrideQuantA; + const std::byte* QuantABlkRow2 = QuantAPtr + StrideQuantA * 2; + const std::byte* QuantABlkRow3 = QuantAPtr + StrideQuantA * 3; const float QuantBScaleCol0 = *QuantBScalePtr; const float QuantBScaleCol1 = *(QuantBScalePtr + StrideQuantBScale); @@ -191,6 +193,10 @@ SQ4BitGemm_CompInt8_Compute2x2_BlkLen16( const float scale01 = Q8BlkScale(QuantABlkRow0) * QuantBScaleCol1; const float scale10 = Q8BlkScale(QuantABlkRow1) * QuantBScaleCol0; const float scale11 = Q8BlkScale(QuantABlkRow1) * QuantBScaleCol1; + const float scale20 = Q8BlkScale(QuantABlkRow2) * QuantBScaleCol0; + const float scale21 = Q8BlkScale(QuantABlkRow2) * QuantBScaleCol1; + const float scale30 = Q8BlkScale(QuantABlkRow3) * QuantBScaleCol0; + const float scale31 = Q8BlkScale(QuantABlkRow3) * QuantBScaleCol1; // load B zero point int8_t bzp_col0; @@ -212,13 +218,11 @@ SQ4BitGemm_CompInt8_Compute2x2_BlkLen16( const int8_t* QuantADataPtrRow0 = Q8BlkData(QuantABlkRow0); const int8_t* QuantADataPtrRow1 = Q8BlkData(QuantABlkRow1); + const int8_t* QuantADataPtrRow2 = Q8BlkData(QuantABlkRow2); + const int8_t* QuantADataPtrRow3 = Q8BlkData(QuantABlkRow3); // TODO handling only 16 elements per accumulator at a time here, probably can do better { - // load A - const int8x16_t av_row0 = vld1q_s8(QuantADataPtrRow0 + 0); - const int8x16_t av_row1 = vld1q_s8(QuantADataPtrRow1 + 0); - // load B const uint8x8_t bv_packed_col0 = vld1_u8(reinterpret_cast(QuantBDataPtr)); const uint8x8_t bv_packed_col1 = vld1_u8(reinterpret_cast(QuantBDataPtr) + StrideQuantBData); @@ -242,24 +246,55 @@ SQ4BitGemm_CompInt8_Compute2x2_BlkLen16( bv_col0 = vsubq_s8(bv_col0, vdupq_n_s8(bzp_col0)); bv_col1 = vsubq_s8(bv_col1, vdupq_n_s8(bzp_col1)); - // quantized dot product - int32x4_t dot00{}, dot01{}, dot10{}, dot11{}; - dot00 = vdotq_s32(dot00, av_row0, bv_col0); - dot01 = vdotq_s32(dot01, av_row0, bv_col1); - dot10 = vdotq_s32(dot10, av_row1, bv_col0); - dot11 = vdotq_s32(dot11, av_row1, bv_col1); - - // convert to float - const float32x4_t dot_f32_00 = vcvtq_f32_s32(dot00); - const float32x4_t dot_f32_01 = vcvtq_f32_s32(dot01); - const float32x4_t dot_f32_10 = vcvtq_f32_s32(dot10); - const float32x4_t dot_f32_11 = vcvtq_f32_s32(dot11); + // rows 0 and 1 of A + { + // load A + const int8x16_t av_row0 = vld1q_s8(QuantADataPtrRow0 + 0); + const int8x16_t av_row1 = vld1q_s8(QuantADataPtrRow1 + 0); + + // quantized dot product + const 
int32x4_t dot00 = vdotq_s32(int32x4_t{}, av_row0, bv_col0); + const int32x4_t dot01 = vdotq_s32(int32x4_t{}, av_row0, bv_col1); + const int32x4_t dot10 = vdotq_s32(int32x4_t{}, av_row1, bv_col0); + const int32x4_t dot11 = vdotq_s32(int32x4_t{}, av_row1, bv_col1); + + // convert to float + const float32x4_t dot_f32_00 = vcvtq_f32_s32(dot00); + const float32x4_t dot_f32_01 = vcvtq_f32_s32(dot01); + const float32x4_t dot_f32_10 = vcvtq_f32_s32(dot10); + const float32x4_t dot_f32_11 = vcvtq_f32_s32(dot11); + + // multiply by scale and update accumulator + acc00 = vfmaq_f32(acc00, dot_f32_00, vdupq_n_f32(scale00)); + acc01 = vfmaq_f32(acc01, dot_f32_01, vdupq_n_f32(scale01)); + acc10 = vfmaq_f32(acc10, dot_f32_10, vdupq_n_f32(scale10)); + acc11 = vfmaq_f32(acc11, dot_f32_11, vdupq_n_f32(scale11)); + } - // multiply by scale and update accumulator - acc00 = vfmaq_f32(acc00, dot_f32_00, vdupq_n_f32(scale00)); - acc01 = vfmaq_f32(acc01, dot_f32_01, vdupq_n_f32(scale01)); - acc10 = vfmaq_f32(acc10, dot_f32_10, vdupq_n_f32(scale10)); - acc11 = vfmaq_f32(acc11, dot_f32_11, vdupq_n_f32(scale11)); + // rows 2 and 3 of A + { + // load A + const int8x16_t av_row2 = vld1q_s8(QuantADataPtrRow2 + 0); + const int8x16_t av_row3 = vld1q_s8(QuantADataPtrRow3 + 0); + + // quantized dot product + const int32x4_t dot20 = vdotq_s32(int32x4_t{}, av_row2, bv_col0); + const int32x4_t dot21 = vdotq_s32(int32x4_t{}, av_row2, bv_col1); + const int32x4_t dot30 = vdotq_s32(int32x4_t{}, av_row3, bv_col0); + const int32x4_t dot31 = vdotq_s32(int32x4_t{}, av_row3, bv_col1); + + // convert to float + const float32x4_t dot_f32_20 = vcvtq_f32_s32(dot20); + const float32x4_t dot_f32_21 = vcvtq_f32_s32(dot21); + const float32x4_t dot_f32_30 = vcvtq_f32_s32(dot30); + const float32x4_t dot_f32_31 = vcvtq_f32_s32(dot31); + + // multiply by scale and update accumulator + acc20 = vfmaq_f32(acc20, dot_f32_20, vdupq_n_f32(scale20)); + acc21 = vfmaq_f32(acc21, dot_f32_21, vdupq_n_f32(scale21)); + acc30 = vfmaq_f32(acc30, dot_f32_30, vdupq_n_f32(scale30)); + acc31 = vfmaq_f32(acc31, dot_f32_31, vdupq_n_f32(scale31)); + } } // increment block pointers @@ -273,22 +308,30 @@ SQ4BitGemm_CompInt8_Compute2x2_BlkLen16( } } - SumPtr[0] = vaddvq_f32(acc00); - SumPtr[1] = vaddvq_f32(acc01); - SumPtr[ldc + 0] = vaddvq_f32(acc10); - SumPtr[ldc + 1] = vaddvq_f32(acc11); + SumPtr[ldc * 0 + 0] = vaddvq_f32(acc00); + SumPtr[ldc * 0 + 1] = vaddvq_f32(acc01); + SumPtr[ldc * 1 + 0] = vaddvq_f32(acc10); + SumPtr[ldc * 1 + 1] = vaddvq_f32(acc11); + SumPtr[ldc * 2 + 0] = vaddvq_f32(acc20); + SumPtr[ldc * 2 + 1] = vaddvq_f32(acc21); + SumPtr[ldc * 3 + 0] = vaddvq_f32(acc30); + SumPtr[ldc * 3 + 1] = vaddvq_f32(acc31); if (BiasPtr != nullptr) { - SumPtr[0] += BiasPtr[0]; - SumPtr[1] += BiasPtr[1]; - SumPtr[ldc + 0] += BiasPtr[0]; - SumPtr[ldc + 1] += BiasPtr[1]; + SumPtr[ldc * 0 + 0] += BiasPtr[0]; + SumPtr[ldc * 0 + 1] += BiasPtr[1]; + SumPtr[ldc * 1 + 0] += BiasPtr[0]; + SumPtr[ldc * 1 + 1] += BiasPtr[1]; + SumPtr[ldc * 2 + 0] += BiasPtr[0]; + SumPtr[ldc * 2 + 1] += BiasPtr[1]; + SumPtr[ldc * 3 + 0] += BiasPtr[0]; + SumPtr[ldc * 3 + 1] += BiasPtr[1]; } } template MLAS_FORCEINLINE void -SQ4BitGemm_CompInt8_Compute2x2_BlkLenGreaterThan16( +SQ4BitGemm_CompInt8_Compute4x2_BlkLenGreaterThan16( size_t BlkLen, const std::byte* QuantARowPtr, const std::byte* QuantBDataColPtr, @@ -312,11 +355,13 @@ SQ4BitGemm_CompInt8_Compute2x2_BlkLenGreaterThan16( const float* QuantBScalePtr = QuantBScaleColPtr; const std::byte* QuantBZeroPointPtr = QuantBZeroPointColPtr; - float32x4_t 
acc00{}, acc01{}, acc10{}, acc11{}; + float32x4_t acc00{}, acc01{}, acc10{}, acc11{}, acc20{}, acc21{}, acc30{}, acc31{}; for (size_t k_blk_idx = 0; k_blk_idx < BlockCountK; ++k_blk_idx) { const std::byte* QuantABlkRow0 = QuantAPtr; const std::byte* QuantABlkRow1 = QuantAPtr + StrideQuantA; + const std::byte* QuantABlkRow2 = QuantAPtr + StrideQuantA * 2; + const std::byte* QuantABlkRow3 = QuantAPtr + StrideQuantA * 3; const float QuantBScaleCol0 = *QuantBScalePtr; const float QuantBScaleCol1 = *(QuantBScalePtr + StrideQuantBScale); @@ -326,6 +371,10 @@ SQ4BitGemm_CompInt8_Compute2x2_BlkLenGreaterThan16( const float scale01 = Q8BlkScale(QuantABlkRow0) * QuantBScaleCol1; const float scale10 = Q8BlkScale(QuantABlkRow1) * QuantBScaleCol0; const float scale11 = Q8BlkScale(QuantABlkRow1) * QuantBScaleCol1; + const float scale20 = Q8BlkScale(QuantABlkRow2) * QuantBScaleCol0; + const float scale21 = Q8BlkScale(QuantABlkRow2) * QuantBScaleCol1; + const float scale30 = Q8BlkScale(QuantABlkRow3) * QuantBScaleCol0; + const float scale31 = Q8BlkScale(QuantABlkRow3) * QuantBScaleCol1; // load B zero point int8_t bzp_col0; @@ -347,14 +396,10 @@ SQ4BitGemm_CompInt8_Compute2x2_BlkLenGreaterThan16( const int8_t* QuantADataPtrRow0 = Q8BlkData(QuantABlkRow0); const int8_t* QuantADataPtrRow1 = Q8BlkData(QuantABlkRow1); + const int8_t* QuantADataPtrRow2 = Q8BlkData(QuantABlkRow2); + const int8_t* QuantADataPtrRow3 = Q8BlkData(QuantABlkRow3); for (size_t sub_blk_idx = 0; sub_blk_idx < SubBlksPerBlk; ++sub_blk_idx) { - // load A - const int8x16_t av_row0_0 = vld1q_s8(QuantADataPtrRow0 + 0); - const int8x16_t av_row0_1 = vld1q_s8(QuantADataPtrRow0 + 16); - const int8x16_t av_row1_0 = vld1q_s8(QuantADataPtrRow1 + 0); - const int8x16_t av_row1_1 = vld1q_s8(QuantADataPtrRow1 + 16); - // load B const uint8x16_t bv_packed_col0 = vld1q_u8(reinterpret_cast(QuantBDataPtr)); const uint8x16_t bv_packed_col1 = vld1q_u8(reinterpret_cast(QuantBDataPtr) + StrideQuantBData); @@ -372,28 +417,65 @@ SQ4BitGemm_CompInt8_Compute2x2_BlkLenGreaterThan16( bv_col1_0 = vsubq_s8(bv_col1_0, vdupq_n_s8(bzp_col1)); bv_col1_1 = vsubq_s8(bv_col1_1, vdupq_n_s8(bzp_col1)); - // quantized dot product - int32x4_t dot00{}, dot01{}, dot10{}, dot11{}; - dot00 = vdotq_s32(vdotq_s32(dot00, av_row0_0, bv_col0_0), av_row0_1, bv_col0_1); - dot01 = vdotq_s32(vdotq_s32(dot01, av_row0_0, bv_col1_0), av_row0_1, bv_col1_1); - dot10 = vdotq_s32(vdotq_s32(dot10, av_row1_0, bv_col0_0), av_row1_1, bv_col0_1); - dot11 = vdotq_s32(vdotq_s32(dot11, av_row1_0, bv_col1_0), av_row1_1, bv_col1_1); - - // convert to float - const float32x4_t dot_f32_00 = vcvtq_f32_s32(dot00); - const float32x4_t dot_f32_01 = vcvtq_f32_s32(dot01); - const float32x4_t dot_f32_10 = vcvtq_f32_s32(dot10); - const float32x4_t dot_f32_11 = vcvtq_f32_s32(dot11); + // rows 0 and 1 of A + { + // load A + const int8x16_t av_row0_0 = vld1q_s8(QuantADataPtrRow0 + 0); + const int8x16_t av_row0_1 = vld1q_s8(QuantADataPtrRow0 + 16); + const int8x16_t av_row1_0 = vld1q_s8(QuantADataPtrRow1 + 0); + const int8x16_t av_row1_1 = vld1q_s8(QuantADataPtrRow1 + 16); + + // quantized dot product + const int32x4_t dot00 = vdotq_s32(vdotq_s32(int32x4_t{}, av_row0_0, bv_col0_0), av_row0_1, bv_col0_1); + const int32x4_t dot01 = vdotq_s32(vdotq_s32(int32x4_t{}, av_row0_0, bv_col1_0), av_row0_1, bv_col1_1); + const int32x4_t dot10 = vdotq_s32(vdotq_s32(int32x4_t{}, av_row1_0, bv_col0_0), av_row1_1, bv_col0_1); + const int32x4_t dot11 = vdotq_s32(vdotq_s32(int32x4_t{}, av_row1_0, bv_col1_0), av_row1_1, bv_col1_1); + + // 
convert to float + const float32x4_t dot_f32_00 = vcvtq_f32_s32(dot00); + const float32x4_t dot_f32_01 = vcvtq_f32_s32(dot01); + const float32x4_t dot_f32_10 = vcvtq_f32_s32(dot10); + const float32x4_t dot_f32_11 = vcvtq_f32_s32(dot11); + + // multiply by scale and update accumulator + acc00 = vfmaq_f32(acc00, dot_f32_00, vdupq_n_f32(scale00)); + acc01 = vfmaq_f32(acc01, dot_f32_01, vdupq_n_f32(scale01)); + acc10 = vfmaq_f32(acc10, dot_f32_10, vdupq_n_f32(scale10)); + acc11 = vfmaq_f32(acc11, dot_f32_11, vdupq_n_f32(scale11)); + } - // multiply by scale and update accumulator - acc00 = vfmaq_f32(acc00, dot_f32_00, vdupq_n_f32(scale00)); - acc01 = vfmaq_f32(acc01, dot_f32_01, vdupq_n_f32(scale01)); - acc10 = vfmaq_f32(acc10, dot_f32_10, vdupq_n_f32(scale10)); - acc11 = vfmaq_f32(acc11, dot_f32_11, vdupq_n_f32(scale11)); + // rows 2 and 3 of A + { + // load A + const int8x16_t av_row2_0 = vld1q_s8(QuantADataPtrRow2 + 0); + const int8x16_t av_row2_1 = vld1q_s8(QuantADataPtrRow2 + 16); + const int8x16_t av_row3_0 = vld1q_s8(QuantADataPtrRow3 + 0); + const int8x16_t av_row3_1 = vld1q_s8(QuantADataPtrRow3 + 16); + + // quantized dot product + const int32x4_t dot20 = vdotq_s32(vdotq_s32(int32x4_t{}, av_row2_0, bv_col0_0), av_row2_1, bv_col0_1); + const int32x4_t dot21 = vdotq_s32(vdotq_s32(int32x4_t{}, av_row2_0, bv_col1_0), av_row2_1, bv_col1_1); + const int32x4_t dot30 = vdotq_s32(vdotq_s32(int32x4_t{}, av_row3_0, bv_col0_0), av_row3_1, bv_col0_1); + const int32x4_t dot31 = vdotq_s32(vdotq_s32(int32x4_t{}, av_row3_0, bv_col1_0), av_row3_1, bv_col1_1); + + // convert to float + const float32x4_t dot_f32_20 = vcvtq_f32_s32(dot20); + const float32x4_t dot_f32_21 = vcvtq_f32_s32(dot21); + const float32x4_t dot_f32_30 = vcvtq_f32_s32(dot30); + const float32x4_t dot_f32_31 = vcvtq_f32_s32(dot31); + + // multiply by scale and update accumulator + acc20 = vfmaq_f32(acc20, dot_f32_20, vdupq_n_f32(scale20)); + acc21 = vfmaq_f32(acc21, dot_f32_21, vdupq_n_f32(scale21)); + acc30 = vfmaq_f32(acc30, dot_f32_30, vdupq_n_f32(scale30)); + acc31 = vfmaq_f32(acc31, dot_f32_31, vdupq_n_f32(scale31)); + } // increment block data pointers to next sub-block QuantADataPtrRow0 += 32; QuantADataPtrRow1 += 32; + QuantADataPtrRow2 += 32; + QuantADataPtrRow3 += 32; QuantBDataPtr += 16; } @@ -407,16 +489,24 @@ SQ4BitGemm_CompInt8_Compute2x2_BlkLenGreaterThan16( } } - SumPtr[0] = vaddvq_f32(acc00); - SumPtr[1] = vaddvq_f32(acc01); - SumPtr[ldc + 0] = vaddvq_f32(acc10); - SumPtr[ldc + 1] = vaddvq_f32(acc11); + SumPtr[ldc * 0 + 0] = vaddvq_f32(acc00); + SumPtr[ldc * 0 + 1] = vaddvq_f32(acc01); + SumPtr[ldc * 1 + 0] = vaddvq_f32(acc10); + SumPtr[ldc * 1 + 1] = vaddvq_f32(acc11); + SumPtr[ldc * 2 + 0] = vaddvq_f32(acc20); + SumPtr[ldc * 2 + 1] = vaddvq_f32(acc21); + SumPtr[ldc * 3 + 0] = vaddvq_f32(acc30); + SumPtr[ldc * 3 + 1] = vaddvq_f32(acc31); if (BiasPtr != nullptr) { - SumPtr[0] += BiasPtr[0]; - SumPtr[1] += BiasPtr[1]; - SumPtr[ldc + 0] += BiasPtr[0]; - SumPtr[ldc + 1] += BiasPtr[1]; + SumPtr[ldc * 0 + 0] += BiasPtr[0]; + SumPtr[ldc * 0 + 1] += BiasPtr[1]; + SumPtr[ldc * 1 + 0] += BiasPtr[0]; + SumPtr[ldc * 1 + 1] += BiasPtr[1]; + SumPtr[ldc * 2 + 0] += BiasPtr[0]; + SumPtr[ldc * 2 + 1] += BiasPtr[1]; + SumPtr[ldc * 3 + 0] += BiasPtr[0]; + SumPtr[ldc * 3 + 1] += BiasPtr[1]; } } @@ -478,8 +568,8 @@ SQ4BitGemm_CompInt8_Compute1x1_BlkLen16( bv1 = vsubq_s8(bv1, bzp1); // quantized dot product - const int32x4_t dot0 = vdotq_s32(vdupq_n_s32(0), av0, bv0); - const int32x4_t dot1 = vdotq_s32(vdupq_n_s32(0), av1, bv1); + const 
int32x4_t dot0 = vdotq_s32(int32x4_t{}, av0, bv0); + const int32x4_t dot1 = vdotq_s32(int32x4_t{}, av1, bv1); // convert to float const float32x4_t dot_f32_0 = vcvtq_f32_s32(dot0); @@ -527,7 +617,7 @@ SQ4BitGemm_CompInt8_Compute1x1_BlkLen16( bv0 = vsubq_s8(bv0, bzp0); // quantized dot product - const int32x4_t dot0 = vdotq_s32(vdupq_n_s32(0), av0, bv0); + const int32x4_t dot0 = vdotq_s32(int32x4_t{}, av0, bv0); // convert to float const float32x4_t dot_f32_0 = vcvtq_f32_s32(dot0); @@ -604,9 +694,8 @@ SQ4BitGemm_CompInt8_Compute1x1_BlkLen32( bv_hi1 = vsubq_s8(bv_hi1, bzp1); // quantized dot product - int32x4_t dot0{}, dot1{}; - dot0 = vdotq_s32(vdotq_s32(dot0, av_lo0, bv_lo0), av_hi0, bv_hi0); - dot1 = vdotq_s32(vdotq_s32(dot1, av_lo1, bv_lo1), av_hi1, bv_hi1); + const int32x4_t dot0 = vdotq_s32(vdotq_s32(int32x4_t{}, av_lo0, bv_lo0), av_hi0, bv_hi0); + const int32x4_t dot1 = vdotq_s32(vdotq_s32(int32x4_t{}, av_lo1, bv_lo1), av_hi1, bv_hi1); // convert to float const float32x4_t dot_f32_0 = vcvtq_f32_s32(dot0); @@ -652,8 +741,7 @@ SQ4BitGemm_CompInt8_Compute1x1_BlkLen32( bv_hi0 = vsubq_s8(bv_hi0, bzp0); // quantized dot product - int32x4_t dot0{}; - dot0 = vdotq_s32(vdotq_s32(dot0, av_lo0, bv_lo0), av_hi0, bv_hi0); + const int32x4_t dot0 = vdotq_s32(vdotq_s32(int32x4_t{}, av_lo0, bv_lo0), av_hi0, bv_hi0); // convert to float const float32x4_t dot_f32_0 = vcvtq_f32_s32(dot0); @@ -736,9 +824,8 @@ SQ4BitGemm_CompInt8_Compute1x1_BlkLenGreaterThan32( bv3 = vsubq_s8(bv3, bzp); // quantized dot product - int32x4_t dot0{}, dot1{}; - dot0 = vdotq_s32(vdotq_s32(dot0, av0, bv0), av1, bv1); - dot1 = vdotq_s32(vdotq_s32(dot1, av2, bv2), av3, bv3); + const int32x4_t dot0 = vdotq_s32(vdotq_s32(int32x4_t{}, av0, bv0), av1, bv1); + const int32x4_t dot1 = vdotq_s32(vdotq_s32(int32x4_t{}, av2, bv2), av3, bv3); // convert to float const float32x4_t dot_f32_0 = vcvtq_f32_s32(dot0); @@ -834,7 +921,7 @@ SQ4BitGemmKernel_CompInt8_BlkLen16( float* SumRowPtr = C; size_t m_remaining = CountM; - while (m_remaining > 1) { + while (m_remaining > 3) { const std::byte* QuantBDataColPtr = QuantBData; const float* QuantBScaleColPtr = QuantBScale; const std::byte* QuantBZeroPointColPtr = QuantBZeroPoint; @@ -845,8 +932,8 @@ SQ4BitGemmKernel_CompInt8_BlkLen16( size_t n_remaining = CountN; while (n_remaining > 1) { - // Compute 2x2 tiles of output - SQ4BitGemm_CompInt8_Compute2x2_BlkLen16( + // Compute 4x2 tiles of output + SQ4BitGemm_CompInt8_Compute4x2_BlkLen16( QuantARowPtr, QuantBDataColPtr, QuantBScaleColPtr, @@ -871,38 +958,30 @@ SQ4BitGemmKernel_CompInt8_BlkLen16( } if (n_remaining > 0) { - // Compute last 2x1 tile of output - SQ4BitGemm_CompInt8_Compute1x1_BlkLen16( - QuantARowPtr, - QuantBDataColPtr, - QuantBScaleColPtr, - QuantBZeroPointColPtr, - BiasPtr, - SumPtr, - BlockCountK - ); - - SQ4BitGemm_CompInt8_Compute1x1_BlkLen16( - QuantARowPtr + StrideQuantA, - QuantBDataColPtr, - QuantBScaleColPtr, - QuantBZeroPointColPtr, - BiasPtr, - SumPtr + ldc, - BlockCountK - ); + // Compute last 4x1 tile of output + for (size_t i = 0; i < 4; ++i) { + SQ4BitGemm_CompInt8_Compute1x1_BlkLen16( + QuantARowPtr + StrideQuantA * i, + QuantBDataColPtr, + QuantBScaleColPtr, + QuantBZeroPointColPtr, + BiasPtr, + SumPtr + ldc * i, + BlockCountK + ); + } } - // Move to next 2 rows - AdvanceRowPtrs<2>( + // Move to next 4 rows + AdvanceRowPtrs<4>( StrideQuantA, ldc, QuantARowPtr, SumRowPtr ); - m_remaining -= 2; + m_remaining -= 4; } - if (m_remaining > 0) { + while (m_remaining > 0) { const std::byte* QuantBDataColPtr = QuantBData; const 
float* QuantBScaleColPtr = QuantBScale; const std::byte* QuantBZeroPointColPtr = QuantBZeroPoint; @@ -932,6 +1011,14 @@ SQ4BitGemmKernel_CompInt8_BlkLen16( n_remaining -= 1; } + + // Move to next row + AdvanceRowPtrs<1>( + StrideQuantA, ldc, + QuantARowPtr, SumRowPtr + ); + + m_remaining -= 1; } } @@ -964,7 +1051,7 @@ SQ4BitGemmKernel_CompInt8_BlkLen32( float* SumRowPtr = C; size_t m_remaining = CountM; - while (m_remaining > 1) { + while (m_remaining > 3) { const std::byte* QuantBDataColPtr = QuantBData; const float* QuantBScaleColPtr = QuantBScale; const std::byte* QuantBZeroPointColPtr = QuantBZeroPoint; @@ -975,8 +1062,8 @@ SQ4BitGemmKernel_CompInt8_BlkLen32( size_t n_remaining = CountN; while (n_remaining > 1) { - // Compute 2x2 tiles of output - SQ4BitGemm_CompInt8_Compute2x2_BlkLenGreaterThan16( + // Compute 4x2 tiles of output + SQ4BitGemm_CompInt8_Compute4x2_BlkLenGreaterThan16( BlkLen, QuantARowPtr, QuantBDataColPtr, @@ -1002,38 +1089,30 @@ SQ4BitGemmKernel_CompInt8_BlkLen32( } if (n_remaining > 0) { - // Compute last 2x1 tile of output - SQ4BitGemm_CompInt8_Compute1x1_BlkLen32( - QuantARowPtr, - QuantBDataColPtr, - QuantBScaleColPtr, - QuantBZeroPointColPtr, - BiasPtr, - SumPtr, - BlockCountK - ); - - SQ4BitGemm_CompInt8_Compute1x1_BlkLen32( - QuantARowPtr + StrideQuantA, - QuantBDataColPtr, - QuantBScaleColPtr, - QuantBZeroPointColPtr, - BiasPtr, - SumPtr + ldc, - BlockCountK - ); + // Compute last 4x1 tile of output + for (size_t i = 0; i < 4; ++i) { + SQ4BitGemm_CompInt8_Compute1x1_BlkLen32( + QuantARowPtr + StrideQuantA * i, + QuantBDataColPtr, + QuantBScaleColPtr, + QuantBZeroPointColPtr, + BiasPtr, + SumPtr + ldc * i, + BlockCountK + ); + } } - // Move to next 2 rows - AdvanceRowPtrs<2>( + // Move to next 4 rows + AdvanceRowPtrs<4>( StrideQuantA, ldc, QuantARowPtr, SumRowPtr ); - m_remaining -= 2; + m_remaining -= 4; } - if (m_remaining > 0) { + while (m_remaining > 0) { const std::byte* QuantBDataColPtr = QuantBData; const float* QuantBScaleColPtr = QuantBScale; const std::byte* QuantBZeroPointColPtr = QuantBZeroPoint; @@ -1063,6 +1142,14 @@ SQ4BitGemmKernel_CompInt8_BlkLen32( n_remaining -= 1; } + + // Move to next row + AdvanceRowPtrs<1>( + StrideQuantA, ldc, + QuantARowPtr, SumRowPtr + ); + + m_remaining -= 1; } } @@ -1095,7 +1182,7 @@ SQ4BitGemmKernel_CompInt8_BlkLenGreaterThan32( float* SumRowPtr = C; size_t m_remaining = CountM; - while (m_remaining > 1) { + while (m_remaining > 3) { const std::byte* QuantBDataColPtr = QuantBData; const float* QuantBScaleColPtr = QuantBScale; const std::byte* QuantBZeroPointColPtr = QuantBZeroPoint; @@ -1106,8 +1193,8 @@ SQ4BitGemmKernel_CompInt8_BlkLenGreaterThan32( size_t n_remaining = CountN; while (n_remaining > 1) { - // Compute 2x2 tiles of output - SQ4BitGemm_CompInt8_Compute2x2_BlkLenGreaterThan16( + // Compute 4x2 tiles of output + SQ4BitGemm_CompInt8_Compute4x2_BlkLenGreaterThan16( BlkLen, QuantARowPtr, QuantBDataColPtr, @@ -1133,40 +1220,31 @@ SQ4BitGemmKernel_CompInt8_BlkLenGreaterThan32( } if (n_remaining > 0) { - // Compute last 2x1 tile of output - SQ4BitGemm_CompInt8_Compute1x1_BlkLenGreaterThan32( - BlkLen, - QuantARowPtr, - QuantBDataColPtr, - QuantBScaleColPtr, - QuantBZeroPointColPtr, - BiasPtr, - SumPtr, - BlockCountK - ); - - SQ4BitGemm_CompInt8_Compute1x1_BlkLenGreaterThan32( - BlkLen, - QuantARowPtr + StrideQuantA, - QuantBDataColPtr, - QuantBScaleColPtr, - QuantBZeroPointColPtr, - BiasPtr, - SumPtr + ldc, - BlockCountK - ); + // Compute last 4x1 tile of output + for (size_t i = 0; i < 4; ++i) { + 
SQ4BitGemm_CompInt8_Compute1x1_BlkLenGreaterThan32( + BlkLen, + QuantARowPtr + StrideQuantA * i, + QuantBDataColPtr, + QuantBScaleColPtr, + QuantBZeroPointColPtr, + BiasPtr, + SumPtr + ldc * i, + BlockCountK + ); + } } - // Move to next 2 rows - AdvanceRowPtrs<2>( + // Move to next 4 rows + AdvanceRowPtrs<4>( StrideQuantA, ldc, QuantARowPtr, SumRowPtr ); - m_remaining -= 2; + m_remaining -= 4; } - if (m_remaining > 0) { + while (m_remaining > 0) { const std::byte* QuantBDataColPtr = QuantBData; const float* QuantBScaleColPtr = QuantBScale; const std::byte* QuantBZeroPointColPtr = QuantBZeroPoint; @@ -1197,6 +1275,14 @@ SQ4BitGemmKernel_CompInt8_BlkLenGreaterThan32( n_remaining -= 1; } + + // Move to next row + AdvanceRowPtrs<1>( + StrideQuantA, ldc, + QuantARowPtr, SumRowPtr + ); + + m_remaining -= 1; } } From 9140d9b1ff2383ba4f2a7db3b1efa9719f9f2ee4 Mon Sep 17 00:00:00 2001 From: Changming Sun Date: Thu, 18 Jul 2024 14:26:26 -0700 Subject: [PATCH 20/35] Update azure-kusto-data and azure-kusto-ingest (#21409) A vulnerability has been found in the Kusto SDK. We need to update it to latest to address a security alert. --- .../github/windows/post_to_dashboard/requirements.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/ci_build/github/windows/post_to_dashboard/requirements.txt b/tools/ci_build/github/windows/post_to_dashboard/requirements.txt index b8c00a610b781..6ece3c1f92c4e 100644 --- a/tools/ci_build/github/windows/post_to_dashboard/requirements.txt +++ b/tools/ci_build/github/windows/post_to_dashboard/requirements.txt @@ -1,2 +1,2 @@ -azure-kusto-data[pandas]==3.0.1 -azure-kusto-ingest[pandas]==3.0.1 +azure-kusto-data[pandas]==4.5.1 +azure-kusto-ingest[pandas]==4.5.1 From cc4049af831034cd7d0b78e4b055b8994939e62a Mon Sep 17 00:00:00 2001 From: glen-amd <146770157+glen-amd@users.noreply.github.com> Date: Fri, 19 Jul 2024 08:34:03 -0700 Subject: [PATCH 21/35] Enabled more VitisAI backend compilers (#21411) ### Description Enabled more VitisAI backend compilers --- onnxruntime/core/providers/vitisai/imp/ep_context_utils.cc | 2 +- onnxruntime/core/providers/vitisai/include/ep_context_utils.h | 4 ++-- .../core/providers/vitisai/vitisai_execution_provider.cc | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/onnxruntime/core/providers/vitisai/imp/ep_context_utils.cc b/onnxruntime/core/providers/vitisai/imp/ep_context_utils.cc index ab31aa313cf6d..368c8c0358228 100644 --- a/onnxruntime/core/providers/vitisai/imp/ep_context_utils.cc +++ b/onnxruntime/core/providers/vitisai/imp/ep_context_utils.cc @@ -466,7 +466,7 @@ std::string RetrieveEPContextCache( fs::path ep_ctx_fs_path(ep_ctx_model_loc); // Attr "ep_cache_context" stores a relative path. ep_ctx_fs_path.replace_filename(fs::path(ep_ctx_cache)); - // TODO: Validaion of the file location to make sure security is met. + // TODO: Validation of the file location to make sure security is met. 
if (!fs::exists(ep_ctx_fs_path) || !fs::is_regular_file(ep_ctx_fs_path)) { ORT_THROW("File for EP context cache is missing"); } diff --git a/onnxruntime/core/providers/vitisai/include/ep_context_utils.h b/onnxruntime/core/providers/vitisai/include/ep_context_utils.h index 61a595cf1ae15..26546f422765c 100644 --- a/onnxruntime/core/providers/vitisai/include/ep_context_utils.h +++ b/onnxruntime/core/providers/vitisai/include/ep_context_utils.h @@ -14,8 +14,8 @@ namespace fs = std::filesystem; namespace onnxruntime { constexpr const uint8_t kXCCode = 1; -constexpr const uint8_t kDDCode = 2; -constexpr const uint8_t kVCode = 4; +[[maybe_unused]] constexpr const uint8_t kDDCode = 2; +[[maybe_unused]] constexpr const uint8_t kVCode = 4; static constexpr const char* kEPContextOp = "EPContext"; static constexpr const char* kMainContextAttr = "main_context"; diff --git a/onnxruntime/core/providers/vitisai/vitisai_execution_provider.cc b/onnxruntime/core/providers/vitisai/vitisai_execution_provider.cc index f45b89649bfcb..036831df7a9cf 100644 --- a/onnxruntime/core/providers/vitisai/vitisai_execution_provider.cc +++ b/onnxruntime/core/providers/vitisai/vitisai_execution_provider.cc @@ -86,7 +86,7 @@ void VitisAIExecutionProvider::PrepareEPContextEnablement( model_path_str_ = ToPathString(GetTopLevelModelPath(graph_viewer).string()); } std::string backend_cache_dir, backend_cache_key; - get_backend_compilation_cache(model_path_str_, graph_viewer, info_, kXCCode, backend_cache_dir, backend_cache_key, backend_cache_data_); + get_backend_compilation_cache(model_path_str_, graph_viewer, info_, kXCCode | kDDCode | kVCode, backend_cache_dir, backend_cache_key, backend_cache_data_); info_["cacheDir"] = backend_cache_dir; info_["cacheKey"] = backend_cache_key; // Create a new model, reusing the graph name, the op-domain-to-opset-version map, From 22d4d82f3c55525510bef785fab6c7c83a21c2e9 Mon Sep 17 00:00:00 2001 From: Adrian Lizarraga Date: Fri, 19 Jul 2024 08:36:47 -0700 Subject: [PATCH 22/35] Move ReluQuantFusion to Level2 for CPU EP only (#21329) ### Description Moves the `Relu -> QuantizeLinear` fusion to Level2 optimizations for CPU EP only. 
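For context on the zero-point condition exercised by the new test below: folding `Relu` into the following `QuantizeLinear` is only value-preserving when the quantization zero point equals the type minimum (-128 for int8), because every negative input then already saturates to the same quantized value the Relu path would produce. A minimal self-contained check of that equivalence (not ORT code; the names here are illustrative):

```
#include <algorithm>
#include <cassert>
#include <cmath>
#include <cstdint>

// int8 QuantizeLinear: y = clamp(round(x / scale) + zero_point, -128, 127).
int8_t QuantizeLinear(float x, float scale, int zero_point) {
  const int q = static_cast<int>(std::nearbyint(x / scale)) + zero_point;
  return static_cast<int8_t>(std::clamp(q, -128, 127));
}

int main() {
  const float scale = 0.5f;
  const float xs[] = {-4.0f, -0.5f, 0.0f, 0.5f, 3.0f};
  for (float x : xs) {
    const float relu_x = std::max(x, 0.0f);
    // zero_point == -128: negative inputs already clamp to the quantized minimum,
    // so Q(Relu(x)) == Q(x) and the Relu node can be removed.
    assert(QuantizeLinear(relu_x, scale, -128) == QuantizeLinear(x, scale, -128));
  }
  // zero_point == 0: Q(-4) != Q(Relu(-4)), so the fusion would change results.
  assert(QuantizeLinear(-4.0f, scale, 0) != QuantizeLinear(0.0f, scale, 0));
  return 0;
}
```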
### Motivation and Context See the related PR for motivation and context: https://github.com/microsoft/onnxruntime/pull/20627 --- .../core/optimizer/graph_transformer_utils.cc | 2 +- .../qdq_transformer/relu_quantizelinear.cc | 4 +- .../test/optimizer/qdq_transformer_test.cc | 51 +++++++++++++++++++ 3 files changed, 55 insertions(+), 2 deletions(-) diff --git a/onnxruntime/core/optimizer/graph_transformer_utils.cc b/onnxruntime/core/optimizer/graph_transformer_utils.cc index 4298551aec412..e6feb3e7ddbe2 100644 --- a/onnxruntime/core/optimizer/graph_transformer_utils.cc +++ b/onnxruntime/core/optimizer/graph_transformer_utils.cc @@ -132,12 +132,12 @@ InlinedVector> GenerateRewriteRules( rules.push_back(std::make_unique()); rules.push_back(std::make_unique()); rules.push_back(std::make_unique()); - rules.push_back(std::make_unique()); rules.push_back(std::make_unique()); break; case TransformerLevel::Level2: rules.push_back(std::make_unique()); + rules.push_back(std::make_unique()); rules.push_back(std::make_unique()); break; diff --git a/onnxruntime/core/optimizer/qdq_transformer/relu_quantizelinear.cc b/onnxruntime/core/optimizer/qdq_transformer/relu_quantizelinear.cc index 7417212c570c8..e756ffe78a289 100644 --- a/onnxruntime/core/optimizer/qdq_transformer/relu_quantizelinear.cc +++ b/onnxruntime/core/optimizer/qdq_transformer/relu_quantizelinear.cc @@ -13,13 +13,15 @@ namespace onnxruntime { bool ReluQuantFusion::SatisfyCondition(const Graph& graph, const Node& node, const logging::Logger& /*logger*/) const { if (!graph_utils::IsSupportedOptypeVersionAndDomain(node, "Relu", {6, 13, 14}) || + !graph_utils::IsSupportedProvider(node, {kCpuExecutionProvider}) || !optimizer_utils::CheckOutputEdges(graph, node, 1)) { return false; } // if Relu is followed by QuantizeLinear, it can be fused into QuantizeLinear potentially const auto& next_node = *node.OutputNodesBegin(); - if (!QDQ::MatchQNode(next_node)) { + if (!graph_utils::IsSupportedProvider(next_node, {kCpuExecutionProvider}) || + !QDQ::MatchQNode(next_node)) { return false; } diff --git a/onnxruntime/test/optimizer/qdq_transformer_test.cc b/onnxruntime/test/optimizer/qdq_transformer_test.cc index 1c77121ba9df1..1638851daf65a 100644 --- a/onnxruntime/test/optimizer/qdq_transformer_test.cc +++ b/onnxruntime/test/optimizer/qdq_transformer_test.cc @@ -2763,6 +2763,57 @@ TEST(QDQTransformerTests, Clip) { } } +// Test that the ReluQuantFusion transformer only runs for optimization level >= 2. +TEST(QDQTransformerTests, ReluQuantFusion_Level2Only) { + auto test_case = [&](TransformerLevel opt_level, int8_t zp) { + auto build_test_case = [&](ModelTestBuilder& builder) { + auto* input_arg = builder.MakeInput({1, 2, 2, 2}, + {-4, -3, -2, 0, 1, 2, 3, 4}); + auto* output_arg = builder.MakeOutput(); + + // add DQ + auto* dq_output = builder.MakeIntermediate(); + builder.AddDequantizeLinearNode(input_arg, 1.0f, zp, dq_output); + + // add Relu + auto* relu_output = builder.MakeIntermediate(); + builder.AddNode("Relu", {dq_output}, {relu_output}); + + // add Q + DQ + auto* q_output = builder.MakeIntermediate(); + builder.AddQuantizeLinearNode(relu_output, 1.0f, zp, q_output); + builder.AddDequantizeLinearNode(q_output, 1.0f, zp, output_arg); + }; + + auto check_relu_graph = [&](InferenceSessionWrapper& session) { + auto op_to_count = CountOpsInGraph(session.GetGraph()); + const QDQOpKeys qdq_keys = GetQDQOpKeys(false); + // Only fuse relu into Q if level >= 2 and zero_point == -128 for int8. 
+ // Level1 graph: input -> DQ -> Relu -> Q -> DQ -> output + // Level2+ graph: input -> DQ -> output (QuantReluFusion + QDQFinalCleanupTransformer transformers) + const bool fuse_relu = (zp == -128) && + (opt_level == TransformerLevel::Level2 || opt_level == TransformerLevel::Level3); + EXPECT_EQ(op_to_count[qdq_keys.quantize_linear], fuse_relu ? 0 : 1); + EXPECT_EQ(op_to_count["Relu"], fuse_relu ? 0 : 1); + EXPECT_EQ(op_to_count[qdq_keys.dequantize_linear], fuse_relu ? 1 : 2); + }; + + constexpr float epsilon = std::numeric_limits::epsilon(); + + TransformerTester(build_test_case, check_relu_graph, + TransformerLevel::Default, + opt_level, + 18, + epsilon, + epsilon); + }; + + test_case(TransformerLevel::Level1, -128); // Will not fuse Relu into QuantizeLinear due to level1 opt. + test_case(TransformerLevel::Level2, -128); // Will fuse Relu into QuantizeLinear. + test_case(TransformerLevel::Level3, -128); // Will fuse Relu into QuantizeLinear. + test_case(TransformerLevel::Level3, 0); // Will not fuse Relu into QuantizeLinear due to zero-point != -128 +} + TEST(QDQTransformerTests, Concat) { auto test_case = [&](const std::vector>& input_shapes, int64_t axis, From 01df8c787d872d5fffb3b48d70d9a0e3d6323e3c Mon Sep 17 00:00:00 2001 From: Yulong Wang <7679871+fs-eire@users.noreply.github.com> Date: Fri, 19 Jul 2024 11:11:30 -0700 Subject: [PATCH 23/35] [js/web] fix vulnerable version of dependencies (#21412) ### Description ``` # npm audit report socket.io 3.0.0 - 4.6.2 Severity: high socket.io has an unhandled 'error' event - https://github.com/advisories/GHSA-25hc-qcg6-38wj Depends on vulnerable versions of engine.io fix available via `npm audit fix` node_modules/socket.io ws 8.0.0 - 8.17.0 Severity: high ws affected by a DoS when handling a request with many HTTP headers - https://github.com/advisories/GHSA-3h5v-q93c-6h6q fix available via `npm audit fix` node_modules/ws engine.io 0.7.8 - 0.7.9 || 6.0.0 - 6.5.4 Depends on vulnerable versions of ws node_modules/engine.io socket.io-adapter 2.5.2 - 2.5.4 Depends on vulnerable versions of ws node_modules/socket.io-adapter 4 high severity vulnerabilities ``` --- js/web/package-lock.json | 126 ++++++++++++++++++++------------------- 1 file changed, 65 insertions(+), 61 deletions(-) diff --git a/js/web/package-lock.json b/js/web/package-lock.json index b802a4e8271a7..3cfc0457c6234 100644 --- a/js/web/package-lock.json +++ b/js/web/package-lock.json @@ -194,9 +194,9 @@ } }, "node_modules/@socket.io/component-emitter": { - "version": "3.1.0", - "resolved": "https://registry.npmjs.org/@socket.io/component-emitter/-/component-emitter-3.1.0.tgz", - "integrity": "sha512-+9jVqKhRSpsc591z5vX+X5Yyw+he/HCB4iQ/RYxw35CEPaY1gnsNE43nf9n9AaYjAQrTiI/mOwKUKdUs9vf7Xg==", + "version": "3.1.2", + "resolved": "https://registry.npmjs.org/@socket.io/component-emitter/-/component-emitter-3.1.2.tgz", + "integrity": "sha512-9BCxFwvbGg/RsZK9tjXd8s4UcwR0MWeFQ1XEKIQVVvAGJyINdrqKMcTRyLoK8Rse1GjzLV9cwjWV1olXRWEXVA==", "dev": true }, "node_modules/@szmarczak/http-timer": { @@ -236,9 +236,9 @@ "dev": true }, "node_modules/@types/cors": { - "version": "2.8.13", - "resolved": "https://registry.npmjs.org/@types/cors/-/cors-2.8.13.tgz", - "integrity": "sha512-RG8AStHlUiV5ysZQKq97copd2UmVYw3/pRMLefISZ3S1hK104Cwm7iLQ3fTKx+lsUH2CE8FlLaYeEA2LSeqYUA==", + "version": "2.8.17", + "resolved": "https://registry.npmjs.org/@types/cors/-/cors-2.8.17.tgz", + "integrity": "sha512-8CGDvrBj1zgo2qE+oS3pOCyYNqCPryMWY2bGfwA0dcfopWGgxs+78df0Rs3rc9THP4JkOhLsAa+15VdpAqkcUA==", "dev": true, 
"dependencies": { "@types/node": "*" @@ -1086,9 +1086,9 @@ } }, "node_modules/engine.io": { - "version": "6.4.2", - "resolved": "https://registry.npmjs.org/engine.io/-/engine.io-6.4.2.tgz", - "integrity": "sha512-FKn/3oMiJjrOEOeUub2WCox6JhxBXq/Zn3fZOMCBxKnNYtsdKjxhl7yR3fZhM9PV+rdE75SU5SYMc+2PGzo+Tg==", + "version": "6.5.5", + "resolved": "https://registry.npmjs.org/engine.io/-/engine.io-6.5.5.tgz", + "integrity": "sha512-C5Pn8Wk+1vKBoHghJODM63yk8MvrO9EWZUfkAt5HAqIgPE4/8FF0PEGHXtEd40l223+cE5ABWuPzm38PHFXfMA==", "dev": true, "dependencies": { "@types/cookie": "^0.4.1", @@ -1099,17 +1099,17 @@ "cookie": "~0.4.1", "cors": "~2.8.5", "debug": "~4.3.1", - "engine.io-parser": "~5.0.3", - "ws": "~8.11.0" + "engine.io-parser": "~5.2.1", + "ws": "~8.17.1" }, "engines": { - "node": ">=10.0.0" + "node": ">=10.2.0" } }, "node_modules/engine.io-parser": { - "version": "5.0.6", - "resolved": "https://registry.npmjs.org/engine.io-parser/-/engine.io-parser-5.0.6.tgz", - "integrity": "sha512-tjuoZDMAdEhVnSFleYPCtdL2GXwVTGtNjoeJd9IhIG3C1xs9uwxqRNEu5WpnDZCaozwVlK/nuQhpodhXSIMaxw==", + "version": "5.2.3", + "resolved": "https://registry.npmjs.org/engine.io-parser/-/engine.io-parser-5.2.3.tgz", + "integrity": "sha512-HqD3yTBfnBxIrbnM1DoD6Pcq8NECnh8d4As1Qgh0z5Gg3jRRIqijury0CL3ghu/edArpUYiYqQiDUQBIs4np3Q==", "dev": true, "engines": { "node": ">=10.0.0" @@ -3020,35 +3020,37 @@ } }, "node_modules/socket.io": { - "version": "4.6.1", - "resolved": "https://registry.npmjs.org/socket.io/-/socket.io-4.6.1.tgz", - "integrity": "sha512-KMcaAi4l/8+xEjkRICl6ak8ySoxsYG+gG6/XfRCPJPQ/haCRIJBTL4wIl8YCsmtaBovcAXGLOShyVWQ/FG8GZA==", + "version": "4.7.5", + "resolved": "https://registry.npmjs.org/socket.io/-/socket.io-4.7.5.tgz", + "integrity": "sha512-DmeAkF6cwM9jSfmp6Dr/5/mfMwb5Z5qRrSXLpo3Fq5SqyU8CMF15jIN4ZhfSwu35ksM1qmHZDQ/DK5XTccSTvA==", "dev": true, "dependencies": { "accepts": "~1.3.4", "base64id": "~2.0.0", + "cors": "~2.8.5", "debug": "~4.3.2", - "engine.io": "~6.4.1", + "engine.io": "~6.5.2", "socket.io-adapter": "~2.5.2", - "socket.io-parser": "~4.2.1" + "socket.io-parser": "~4.2.4" }, "engines": { - "node": ">=10.0.0" + "node": ">=10.2.0" } }, "node_modules/socket.io-adapter": { - "version": "2.5.2", - "resolved": "https://registry.npmjs.org/socket.io-adapter/-/socket.io-adapter-2.5.2.tgz", - "integrity": "sha512-87C3LO/NOMc+eMcpcxUBebGjkpMDkNBS9tf7KJqcDsmL936EChtVva71Dw2q4tQcuVC+hAUy4an2NO/sYXmwRA==", + "version": "2.5.5", + "resolved": "https://registry.npmjs.org/socket.io-adapter/-/socket.io-adapter-2.5.5.tgz", + "integrity": "sha512-eLDQas5dzPgOWCk9GuuJC2lBqItuhKI4uxGgo9aIV7MYbk2h9Q6uULEh8WBzThoI7l+qU9Ast9fVUmkqPP9wYg==", "dev": true, "dependencies": { - "ws": "~8.11.0" + "debug": "~4.3.4", + "ws": "~8.17.1" } }, "node_modules/socket.io-parser": { - "version": "4.2.3", - "resolved": "https://registry.npmjs.org/socket.io-parser/-/socket.io-parser-4.2.3.tgz", - "integrity": "sha512-JMafRntWVO2DCJimKsRTh/wnqVvO4hrfwOqtO7f+uzwsQMuxO6VwImtYxaQ+ieoyshWOTJyV0fA21lccEXRPpQ==", + "version": "4.2.4", + "resolved": "https://registry.npmjs.org/socket.io-parser/-/socket.io-parser-4.2.4.tgz", + "integrity": "sha512-/GbIKmo8ioc+NIWIhwdecY0ge+qVBSMdgxGygevmdHj24bsfgtCmcUUcQ5ZzcylGFHsN3k4HB4Cgkl96KVnuew==", "dev": true, "dependencies": { "@socket.io/component-emitter": "~3.1.0", @@ -3449,16 +3451,16 @@ "dev": true }, "node_modules/ws": { - "version": "8.11.0", - "resolved": "https://registry.npmjs.org/ws/-/ws-8.11.0.tgz", - "integrity": "sha512-HPG3wQd9sNQoT9xHyNCXoDUa+Xw/VevmY9FoHyQ+g+rrMn4j6FB4np7Z0OhdTgjx6MgQLK7jwSy1YecU1+4Asg==", 
+ "version": "8.17.1", + "resolved": "https://registry.npmjs.org/ws/-/ws-8.17.1.tgz", + "integrity": "sha512-6XQFvXTkbfUOZOKKILFG1PDK2NDQs4azKQl26T0YS5CxqWLgXajbPZ+h4gZekJyRqFU8pvnbAbbs/3TgRPy+GQ==", "dev": true, "engines": { "node": ">=10.0.0" }, "peerDependencies": { "bufferutil": "^4.0.1", - "utf-8-validate": "^5.0.2" + "utf-8-validate": ">=5.0.2" }, "peerDependenciesMeta": { "bufferutil": { @@ -3648,9 +3650,9 @@ "dev": true }, "@socket.io/component-emitter": { - "version": "3.1.0", - "resolved": "https://registry.npmjs.org/@socket.io/component-emitter/-/component-emitter-3.1.0.tgz", - "integrity": "sha512-+9jVqKhRSpsc591z5vX+X5Yyw+he/HCB4iQ/RYxw35CEPaY1gnsNE43nf9n9AaYjAQrTiI/mOwKUKdUs9vf7Xg==", + "version": "3.1.2", + "resolved": "https://registry.npmjs.org/@socket.io/component-emitter/-/component-emitter-3.1.2.tgz", + "integrity": "sha512-9BCxFwvbGg/RsZK9tjXd8s4UcwR0MWeFQ1XEKIQVVvAGJyINdrqKMcTRyLoK8Rse1GjzLV9cwjWV1olXRWEXVA==", "dev": true }, "@szmarczak/http-timer": { @@ -3687,9 +3689,9 @@ "dev": true }, "@types/cors": { - "version": "2.8.13", - "resolved": "https://registry.npmjs.org/@types/cors/-/cors-2.8.13.tgz", - "integrity": "sha512-RG8AStHlUiV5ysZQKq97copd2UmVYw3/pRMLefISZ3S1hK104Cwm7iLQ3fTKx+lsUH2CE8FlLaYeEA2LSeqYUA==", + "version": "2.8.17", + "resolved": "https://registry.npmjs.org/@types/cors/-/cors-2.8.17.tgz", + "integrity": "sha512-8CGDvrBj1zgo2qE+oS3pOCyYNqCPryMWY2bGfwA0dcfopWGgxs+78df0Rs3rc9THP4JkOhLsAa+15VdpAqkcUA==", "dev": true, "requires": { "@types/node": "*" @@ -4379,9 +4381,9 @@ } }, "engine.io": { - "version": "6.4.2", - "resolved": "https://registry.npmjs.org/engine.io/-/engine.io-6.4.2.tgz", - "integrity": "sha512-FKn/3oMiJjrOEOeUub2WCox6JhxBXq/Zn3fZOMCBxKnNYtsdKjxhl7yR3fZhM9PV+rdE75SU5SYMc+2PGzo+Tg==", + "version": "6.5.5", + "resolved": "https://registry.npmjs.org/engine.io/-/engine.io-6.5.5.tgz", + "integrity": "sha512-C5Pn8Wk+1vKBoHghJODM63yk8MvrO9EWZUfkAt5HAqIgPE4/8FF0PEGHXtEd40l223+cE5ABWuPzm38PHFXfMA==", "dev": true, "requires": { "@types/cookie": "^0.4.1", @@ -4392,14 +4394,14 @@ "cookie": "~0.4.1", "cors": "~2.8.5", "debug": "~4.3.1", - "engine.io-parser": "~5.0.3", - "ws": "~8.11.0" + "engine.io-parser": "~5.2.1", + "ws": "~8.17.1" } }, "engine.io-parser": { - "version": "5.0.6", - "resolved": "https://registry.npmjs.org/engine.io-parser/-/engine.io-parser-5.0.6.tgz", - "integrity": "sha512-tjuoZDMAdEhVnSFleYPCtdL2GXwVTGtNjoeJd9IhIG3C1xs9uwxqRNEu5WpnDZCaozwVlK/nuQhpodhXSIMaxw==", + "version": "5.2.3", + "resolved": "https://registry.npmjs.org/engine.io-parser/-/engine.io-parser-5.2.3.tgz", + "integrity": "sha512-HqD3yTBfnBxIrbnM1DoD6Pcq8NECnh8d4As1Qgh0z5Gg3jRRIqijury0CL3ghu/edArpUYiYqQiDUQBIs4np3Q==", "dev": true }, "ent": { @@ -5862,32 +5864,34 @@ "dev": true }, "socket.io": { - "version": "4.6.1", - "resolved": "https://registry.npmjs.org/socket.io/-/socket.io-4.6.1.tgz", - "integrity": "sha512-KMcaAi4l/8+xEjkRICl6ak8ySoxsYG+gG6/XfRCPJPQ/haCRIJBTL4wIl8YCsmtaBovcAXGLOShyVWQ/FG8GZA==", + "version": "4.7.5", + "resolved": "https://registry.npmjs.org/socket.io/-/socket.io-4.7.5.tgz", + "integrity": "sha512-DmeAkF6cwM9jSfmp6Dr/5/mfMwb5Z5qRrSXLpo3Fq5SqyU8CMF15jIN4ZhfSwu35ksM1qmHZDQ/DK5XTccSTvA==", "dev": true, "requires": { "accepts": "~1.3.4", "base64id": "~2.0.0", + "cors": "~2.8.5", "debug": "~4.3.2", - "engine.io": "~6.4.1", + "engine.io": "~6.5.2", "socket.io-adapter": "~2.5.2", - "socket.io-parser": "~4.2.1" + "socket.io-parser": "~4.2.4" } }, "socket.io-adapter": { - "version": "2.5.2", - "resolved": 
"https://registry.npmjs.org/socket.io-adapter/-/socket.io-adapter-2.5.2.tgz", - "integrity": "sha512-87C3LO/NOMc+eMcpcxUBebGjkpMDkNBS9tf7KJqcDsmL936EChtVva71Dw2q4tQcuVC+hAUy4an2NO/sYXmwRA==", + "version": "2.5.5", + "resolved": "https://registry.npmjs.org/socket.io-adapter/-/socket.io-adapter-2.5.5.tgz", + "integrity": "sha512-eLDQas5dzPgOWCk9GuuJC2lBqItuhKI4uxGgo9aIV7MYbk2h9Q6uULEh8WBzThoI7l+qU9Ast9fVUmkqPP9wYg==", "dev": true, "requires": { - "ws": "~8.11.0" + "debug": "~4.3.4", + "ws": "~8.17.1" } }, "socket.io-parser": { - "version": "4.2.3", - "resolved": "https://registry.npmjs.org/socket.io-parser/-/socket.io-parser-4.2.3.tgz", - "integrity": "sha512-JMafRntWVO2DCJimKsRTh/wnqVvO4hrfwOqtO7f+uzwsQMuxO6VwImtYxaQ+ieoyshWOTJyV0fA21lccEXRPpQ==", + "version": "4.2.4", + "resolved": "https://registry.npmjs.org/socket.io-parser/-/socket.io-parser-4.2.4.tgz", + "integrity": "sha512-/GbIKmo8ioc+NIWIhwdecY0ge+qVBSMdgxGygevmdHj24bsfgtCmcUUcQ5ZzcylGFHsN3k4HB4Cgkl96KVnuew==", "dev": true, "requires": { "@socket.io/component-emitter": "~3.1.0", @@ -6179,9 +6183,9 @@ "dev": true }, "ws": { - "version": "8.11.0", - "resolved": "https://registry.npmjs.org/ws/-/ws-8.11.0.tgz", - "integrity": "sha512-HPG3wQd9sNQoT9xHyNCXoDUa+Xw/VevmY9FoHyQ+g+rrMn4j6FB4np7Z0OhdTgjx6MgQLK7jwSy1YecU1+4Asg==", + "version": "8.17.1", + "resolved": "https://registry.npmjs.org/ws/-/ws-8.17.1.tgz", + "integrity": "sha512-6XQFvXTkbfUOZOKKILFG1PDK2NDQs4azKQl26T0YS5CxqWLgXajbPZ+h4gZekJyRqFU8pvnbAbbs/3TgRPy+GQ==", "dev": true, "requires": {} }, From 6ffaaebb60cd43cf7749e67a9bb54c3bd2cc4efd Mon Sep 17 00:00:00 2001 From: Tianlei Wu Date: Fri, 19 Jul 2024 13:58:54 -0700 Subject: [PATCH 24/35] [CUDA] Attention kernel provider option (#21344) ### Description * Add a cuda provider option `sdpa_kernel` to choose which attention kernel to run for testing purpose. * Allow dump which attention kernel is used per node. * Reserve a flag for cudnn flash attention which will be added soon. #### CUDA provider option sdpa_kernel Instead of setting environment variable, we also support setting it in provider option. Note that the setting is global per session. That could help performance testing of each kernel. #### Attention Kernel Debug Info Set an environment variable `ORT_ENABLE_ATTENTION_KERNEL_DEBUG_INFO=1`, and ORT will print sdpa kernel used in each node: For example ``` ORT_ENABLE_ATTENTION_KERNEL_DEBUG_INFO=1 ./onnxruntime_test_all --gtest_filter=MultiHeadAttentionTest* ``` It will show debug information of kernel used in testing: ``` [ RUN ] MultiHeadAttentionTest.SelfAttention_Batch2_HeadSize32_NoBias_NoMask_PackedQKV AttentionKernelOptions: FLASH_ATTENTION=0 EFFICIENT_ATTENTION=0 TRT_FUSED_ATTENTION=1 CUDNN_FLASH_ATTENTION=0 TRT_FLASH_ATTENTION=1 TRT_CROSS_ATTENTION=0 TRT_CAUSAL_ATTENTION=0 MATH=1 Operator=MultiHeadAttention Node=node1 DataType=fp16 TRT_FUSED_ATTENTION=1 AttentionKernelOptions: FLASH_ATTENTION=0 EFFICIENT_ATTENTION=1 TRT_FUSED_ATTENTION=0 CUDNN_FLASH_ATTENTION=0 TRT_FLASH_ATTENTION=0 TRT_CROSS_ATTENTION=0 TRT_CAUSAL_ATTENTION=0 MATH=1 Operator=MultiHeadAttention Node=node1 DataType=fp16 EFFICIENT_ATTENTION=1 ``` In this test case, the debug info shows that one session uses trt fused attention and another session use efficient attention. 
--- cmake/onnxruntime_rocm_hipify.cmake | 2 + cmake/onnxruntime_unittests.cmake | 3 +- .../providers/cuda/cuda_provider_options.h | 1 + .../contrib_ops/cpu/bert/attention_common.h | 28 ++- .../contrib_ops/cuda/bert/attention.cc | 53 ++--- onnxruntime/contrib_ops/cuda/bert/attention.h | 4 +- .../cuda/bert/attention_kernel_options.cc | 166 +++++++++++++ .../cuda/bert/attention_kernel_options.h | 67 ++++++ .../cuda/bert/group_query_attention.cc | 30 +-- .../cuda/bert/group_query_attention.h | 2 + .../cuda/bert/multihead_attention.cc | 50 ++-- .../cuda/bert/multihead_attention.h | 3 +- .../contrib_ops/cuda/bert/packed_attention.cc | 33 ++- .../contrib_ops/cuda/bert/packed_attention.h | 9 +- .../cuda/bert/packed_multihead_attention.cc | 40 ++-- .../cuda/bert/packed_multihead_attention.h | 4 +- .../providers/cuda/cuda_execution_provider.h | 17 ++ .../cuda/cuda_execution_provider_info.cc | 4 + .../cuda/cuda_execution_provider_info.h | 4 + onnxruntime/core/providers/cuda/cuda_kernel.h | 6 + .../providers/cuda/cuda_provider_factory.cc | 2 + .../multihead_attention_op_test.cc | 4 +- .../attention_kernel_options_test.cc | 221 ++++++++++++++++++ .../test/python/onnxruntime_test_python.py | 2 + 24 files changed, 645 insertions(+), 110 deletions(-) create mode 100644 onnxruntime/contrib_ops/cuda/bert/attention_kernel_options.cc create mode 100644 onnxruntime/contrib_ops/cuda/bert/attention_kernel_options.h create mode 100644 onnxruntime/test/providers/cuda/test_cases/attention_kernel_options_test.cc diff --git a/cmake/onnxruntime_rocm_hipify.cmake b/cmake/onnxruntime_rocm_hipify.cmake index 2966a4624a966..a8c876d30873e 100644 --- a/cmake/onnxruntime_rocm_hipify.cmake +++ b/cmake/onnxruntime_rocm_hipify.cmake @@ -15,6 +15,8 @@ set(contrib_ops_excluded_files "bert/attention_softmax.h" "bert/attention_softmax.cu" "bert/attention_prepare_qkv.cu" + "bert/attention_kernel_options.h" + "bert/attention_kernel_options.cc" "bert/decoder_attention_impl.h" "bert/decoder_attention_impl.cu" "bert/decoder_masked_multihead_attention.h" diff --git a/cmake/onnxruntime_unittests.cmake b/cmake/onnxruntime_unittests.cmake index 0159c35d1941b..38ed0b1640192 100644 --- a/cmake/onnxruntime_unittests.cmake +++ b/cmake/onnxruntime_unittests.cmake @@ -786,8 +786,9 @@ if (onnxruntime_ENABLE_CUDA_EP_INTERNAL_TESTS) onnxruntime_add_shared_library_module(onnxruntime_providers_cuda_ut ${onnxruntime_test_providers_cuda_ut_src} $) config_cuda_provider_shared_module(onnxruntime_providers_cuda_ut) onnxruntime_add_include_to_target(onnxruntime_providers_cuda_ut GTest::gtest GTest::gmock) + add_dependencies(onnxruntime_providers_cuda_ut onnxruntime_test_utils onnxruntime_common) target_include_directories(onnxruntime_providers_cuda_ut PRIVATE ${ONNXRUNTIME_ROOT}/core/mickey) - target_link_libraries(onnxruntime_providers_cuda_ut PRIVATE GTest::gtest GTest::gmock ${ONNXRUNTIME_MLAS_LIBS} onnxruntime_common) + target_link_libraries(onnxruntime_providers_cuda_ut PRIVATE GTest::gtest GTest::gmock ${ONNXRUNTIME_MLAS_LIBS} onnxruntime_test_utils onnxruntime_common) if (MSVC) # Cutlass code has an issue with the following: # warning C4100: 'magic': unreferenced formal parameter diff --git a/include/onnxruntime/core/providers/cuda/cuda_provider_options.h b/include/onnxruntime/core/providers/cuda/cuda_provider_options.h index 6d53760ab60b5..01a14de699dc4 100644 --- a/include/onnxruntime/core/providers/cuda/cuda_provider_options.h +++ b/include/onnxruntime/core/providers/cuda/cuda_provider_options.h @@ -38,4 +38,5 @@ struct OrtCUDAProviderOptionsV2 { 
int prefer_nhwc = 0; // make the CUDA EP NHWC preferred int use_ep_level_unified_stream = 0; // flag specifying if ep level stream is used or not int use_tf32 = 1; // use TF32 + int sdpa_kernel = 0; // Scaled Dot Product Attention kernel option }; diff --git a/onnxruntime/contrib_ops/cpu/bert/attention_common.h b/onnxruntime/contrib_ops/cpu/bert/attention_common.h index a5b9c84c63eb9..55292b35e1e38 100644 --- a/onnxruntime/contrib_ops/cpu/bert/attention_common.h +++ b/onnxruntime/contrib_ops/cpu/bert/attention_common.h @@ -147,6 +147,23 @@ constexpr const char* kDisableSparseAttentionV1 = "ORT_DISABLE_SPARSE_ATTENTION_ } // namespace sparse_attention namespace attention { + +enum class AttentionBackend : int { + FLASH_ATTENTION = 1, + EFFICIENT_ATTENTION = 2, + TRT_FUSED_ATTENTION = 4, + CUDNN_FLASH_ATTENTION = 8, // reserved for cuDNN flash attention. + MATH = 16, // unfused kernel cannot be disabled right now. + + // The following kernels might be deprecated in the future. + TRT_FLASH_ATTENTION = 32, + TRT_CROSS_ATTENTION = 64, + TRT_CAUSAL_ATTENTION = 128, +}; + +// Environment variable to enable debug information of attention kernel to be printed. Default is 0 (disabled). +constexpr const char* kEnableAttentionKernelDebugInfo = "ORT_ENABLE_ATTENTION_KERNEL_DEBUG_INFO"; + // Environment variable to enable or disable TRT fused self attention kernel. Default is 0 (enabled). constexpr const char* kDisableFusedSelfAttention = "ORT_DISABLE_FUSED_ATTENTION"; @@ -157,6 +174,9 @@ constexpr const char* kDisableFusedCrossAttention = "ORT_DISABLE_FUSED_CROSS_ATT // Note that those causal attention kernels use fp16 accumulation. There is potential accuracy drop using those kernels. constexpr const char* kEnableFusedCausalAttention = "ORT_ENABLE_FUSED_CAUSAL_ATTENTION"; +// Environment variable to enable or disable cuDNN flash attention. +constexpr const char* kEnableCudnnFlashAttention = "ORT_ENABLE_CUDNN_FLASH_ATTENTION"; + // Environment variable to enable or disable TRT flash attention. This applies to both self and causal attention. Default is 0 (enabled). constexpr const char* kDisableTrtFlashAttention = "ORT_DISABLE_TRT_FLASH_ATTENTION"; @@ -166,11 +186,15 @@ constexpr const char* kDisableMemoryEfficientAttention = "ORT_DISABLE_MEMORY_EFF // Environment variable to enable or disable flash attention. Default is 0 (enabled). constexpr const char* kDisableFlashAttention = "ORT_DISABLE_FLASH_ATTENTION"; -// Minimum sequence length to enable memory efficient attention in FP32. -constexpr int kMinSeqLenForMemoryEfficientAttentionFp32 = 256; +// Minimum sequence length to perfer memory efficient attention when data type is float32 +constexpr const char* kMinSeqLenForEfficientAttentionFp32 = "ORT_MIN_SEQ_LEN_EFFICIENT_ATTENTION_FP32"; + +// Default value for minimum sequence length to enable memory efficient attention in FP32. +constexpr int kDefaultMinSeqLenForEfficientAttentionFp32 = 256; // Minimum sequence length to prefer flash attention when input format is packed QKV for MultiHeadAttention constexpr const char* kMinSeqLenForFlashAttentionPackedQKV = "ORT_MIN_SEQ_LEN_FLASH_ATTENTION_PACKED_QKV"; + // Default value for the above setting. 
constexpr int kDefaultMinSeqLenForFlashAttentionPackedQKV = 513; diff --git a/onnxruntime/contrib_ops/cuda/bert/attention.cc b/onnxruntime/contrib_ops/cuda/bert/attention.cc index d9907f09121d0..cacd65313ebcc 100644 --- a/onnxruntime/contrib_ops/cuda/bert/attention.cc +++ b/onnxruntime/contrib_ops/cuda/bert/attention.cc @@ -3,7 +3,6 @@ #include "core/providers/cuda/cuda_common.h" #include "core/providers/cuda/shared_inc/fpgeneric.h" -#include "core/platform/env_var_utils.h" #include "contrib_ops/cuda/bert/attention_impl.h" #include "contrib_ops/cuda/bert/attention.h" #include "contrib_ops/cuda/bert/bert_padding.h" @@ -40,36 +39,17 @@ REGISTER_KERNEL_TYPED(MLFloat16) template Attention::Attention(const OpKernelInfo& info) : CudaKernel(info), AttentionBase(info, false) { - disable_fused_self_attention_ = - sizeof(T) != 2 || - ParseEnvironmentVariableWithDefault(attention::kDisableFusedSelfAttention, false); + kernel_options_ = this->GetAttentionKernelOptions(); - enable_trt_flash_attention_ = - sizeof(T) == 2 && - !ParseEnvironmentVariableWithDefault(attention::kDisableTrtFlashAttention, false); + disable_fused_self_attention_ = sizeof(T) != 2 || !kernel_options_->UseTrtFusedAttention(); - enable_fused_causal_attention_ = - sizeof(T) == 2 && - ParseEnvironmentVariableWithDefault(attention::kEnableFusedCausalAttention, false); + enable_trt_flash_attention_ = sizeof(T) == 2 && kernel_options_->UseTrtFlashAttention(); -#if USE_MEMORY_EFFICIENT_ATTENTION - disable_memory_efficient_attention_ = - ParseEnvironmentVariableWithDefault(attention::kDisableMemoryEfficientAttention, false); -#else - disable_memory_efficient_attention_ = true; -#endif + enable_fused_causal_attention_ = sizeof(T) == 2 && kernel_options_->UseTrtCausalAttention(); -#if USE_FLASH_ATTENTION - disable_flash_attention_ = - sizeof(T) != 2 || - onnxruntime::ParseEnvironmentVariableWithDefault(attention::kDisableFlashAttention, false); - min_seq_len_for_flash_attention_packed_qkv_ = ParseEnvironmentVariableWithDefault( - attention::kMinSeqLenForFlashAttentionPackedQKV, - attention::kDefaultMinSeqLenForFlashAttentionPackedQKV); -#else - disable_flash_attention_ = true; - min_seq_len_for_flash_attention_packed_qkv_ = 0; -#endif + disable_memory_efficient_attention_ = !kernel_options_->UseEfficientAttention(); + + disable_flash_attention_ = sizeof(T) != 2 || !kernel_options_->UseFlashAttention(); } template @@ -134,7 +114,7 @@ Status Attention::ComputeInternal(OpKernelContext* context) const { parameters.num_heads, parameters.num_heads); // When input is packed QKV format, TensorRT kernel might be faster when sequence length <= 512. 
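[Editorial note, not part of the patch] The constructor above now takes all of its kernel switches from a single AttentionKernelOptions object instead of parsing environment variables per operator. As a rough sketch of how the new "sdpa_kernel" CUDA provider option introduced in this patch could be consumed from application code, the example below composes a bitmask from the AttentionBackend values shown above and passes it through OrtCUDAProviderOptionsV2. The model path is a placeholder and a CUDA-enabled build of onnxruntime is assumed.

```
// Sketch: selecting SDPA backends via the "sdpa_kernel" CUDA provider option.
// The bit values mirror the AttentionBackend enum added in this patch.
#include <onnxruntime_cxx_api.h>
#include <string>

int main() {
  // FLASH_ATTENTION (1) | EFFICIENT_ATTENTION (2): restrict SDPA to these kernels,
  // plus the unfused math fallback, which the patch notes cannot be disabled yet.
  const int sdpa_kernel = 1 | 2;

  Ort::Env env{ORT_LOGGING_LEVEL_WARNING, "sdpa_kernel_demo"};
  Ort::SessionOptions session_options;

  const OrtApi& api = Ort::GetApi();
  OrtCUDAProviderOptionsV2* cuda_options = nullptr;
  Ort::ThrowOnError(api.CreateCUDAProviderOptions(&cuda_options));

  const std::string value = std::to_string(sdpa_kernel);
  const char* keys[] = {"sdpa_kernel"};
  const char* values[] = {value.c_str()};
  Ort::ThrowOnError(api.UpdateCUDAProviderOptions(cuda_options, keys, values, 1));

  session_options.AppendExecutionProvider_CUDA_V2(*cuda_options);
  api.ReleaseCUDAProviderOptions(cuda_options);

  // Ort::Session session{env, "model.onnx", session_options};  // placeholder model path
  return 0;
}
```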
- if (use_flash_attention && parameters.sequence_length < min_seq_len_for_flash_attention_packed_qkv_) { + if (use_flash_attention && parameters.sequence_length < kernel_options_->MinSeqLenForFlashAttentionPackedQkv()) { use_flash_attention = false; } // Allocate buffers @@ -220,7 +200,7 @@ Status Attention::ComputeInternal(OpKernelContext* context) const { nullptr == past && nullptr == present && (nullptr == mask_index || parameters.mask_type == AttentionMaskType::MASK_1D_KEY_SEQ_LEN_START) && - (sizeof(T) == 2 || parameters.sequence_length >= attention::kMinSeqLenForMemoryEfficientAttentionFp32) && + (sizeof(T) == 2 || parameters.sequence_length >= this->kernel_options_->MinSeqLenForEfficientAttentionFp32()) && has_memory_efficient_attention(sm, sizeof(T) == 2, parameters.head_size, parameters.v_head_size); if (use_memory_efficient_attention) { @@ -231,6 +211,20 @@ Status Attention::ComputeInternal(OpKernelContext* context) const { constexpr bool use_memory_efficient_attention = false; #endif + if (kernel_options_->AllowDebugInfo()) { + AttentionKernelDebugInfo debug_info; + debug_info.use_flash_attention = use_flash_attention; + debug_info.use_efficient_attention = use_memory_efficient_attention; + if (fused_runner != nullptr) { + debug_info.SetTrtFusedKernel(is_unidirectional_, enable_trt_flash_attention_, sequence_length); + } + + debug_info.Print("Attention", + this->Node().Name(), + std::is_same::value, + std::is_same::value); + } + cublasHandle_t cublas = GetCublasHandle(context); typedef typename ToCudaType::MappedType CudaT; @@ -268,7 +262,6 @@ Status Attention::ComputeInternal(OpKernelContext* context) const { use_fused_cross_attention, use_memory_efficient_attention); IAllocatorUniquePtr work_space = IAllocator::MakeUniquePtr(allocator, workSpaceSize, false, context->GetComputeStream()); - ; typedef typename ToCudaType::MappedType CudaT; AttentionData data; diff --git a/onnxruntime/contrib_ops/cuda/bert/attention.h b/onnxruntime/contrib_ops/cuda/bert/attention.h index acafb379d713f..0c7d3621f95ef 100644 --- a/onnxruntime/contrib_ops/cuda/bert/attention.h +++ b/onnxruntime/contrib_ops/cuda/bert/attention.h @@ -8,6 +8,7 @@ #include "core/providers/cuda/cuda_kernel.h" #include "contrib_ops/cpu/bert/attention_base.h" #include "contrib_ops/cuda/bert/tensorrt_fused_multihead_attention/mha_runner.h" +#include "contrib_ops/cuda/bert/attention_kernel_options.h" namespace onnxruntime { namespace contrib { @@ -27,9 +28,10 @@ class Attention final : public CudaKernel, public AttentionBase { bool enable_trt_flash_attention_; bool enable_fused_causal_attention_; bool disable_memory_efficient_attention_; - int min_seq_len_for_flash_attention_packed_qkv_; mutable std::unique_ptr fused_fp16_runner_; mutable std::once_flag fused_fp16_runner_created_; + + const AttentionKernelOptions* kernel_options_; }; } // namespace cuda diff --git a/onnxruntime/contrib_ops/cuda/bert/attention_kernel_options.cc b/onnxruntime/contrib_ops/cuda/bert/attention_kernel_options.cc new file mode 100644 index 0000000000000..28a095e68131e --- /dev/null +++ b/onnxruntime/contrib_ops/cuda/bert/attention_kernel_options.cc @@ -0,0 +1,166 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. 
+ +#include "contrib_ops/cuda/bert/attention_kernel_options.h" +#include +#include +#include +#include "contrib_ops/cpu/bert/attention_common.h" +#include "core/providers/shared_library/provider_api.h" +#include "core/platform/env_var_utils.h" +#include "contrib_ops/cuda/bert/tensorrt_fused_multihead_attention/mha_runner.h" + +using namespace onnxruntime::contrib::attention; + +namespace onnxruntime { +void AttentionKernelOptions::Initialize(int value, bool use_build_flag) { + if (value > 0) { + use_flash_attention_ = (value & static_cast(AttentionBackend::FLASH_ATTENTION)) > 0; + use_efficient_attention_ = (value & static_cast(AttentionBackend::EFFICIENT_ATTENTION)) > 0; + use_trt_fused_attention_ = (value & static_cast(AttentionBackend::TRT_FUSED_ATTENTION)) > 0; + use_cudnn_flash_attention_ = (value & static_cast(AttentionBackend::CUDNN_FLASH_ATTENTION)) > 0; + use_unfused_ = (value & static_cast(AttentionBackend::MATH)) > 0; + use_trt_flash_attention_ = (value & static_cast(AttentionBackend::TRT_FLASH_ATTENTION)) > 0; + use_trt_cross_attention_ = (value & static_cast(AttentionBackend::TRT_CROSS_ATTENTION)) > 0; + use_trt_causal_attention_ = (value & static_cast(AttentionBackend::TRT_CAUSAL_ATTENTION)) > 0; + } else { + use_flash_attention_ = !ParseEnvironmentVariableWithDefault(kDisableFlashAttention, false); + use_efficient_attention_ = !ParseEnvironmentVariableWithDefault(kDisableMemoryEfficientAttention, false); + use_trt_fused_attention_ = !ParseEnvironmentVariableWithDefault(kDisableFusedSelfAttention, false); + use_cudnn_flash_attention_ = ParseEnvironmentVariableWithDefault(kEnableCudnnFlashAttention, false); + use_unfused_ = true; + use_trt_flash_attention_ = !ParseEnvironmentVariableWithDefault(kDisableTrtFlashAttention, false); + use_trt_cross_attention_ = !ParseEnvironmentVariableWithDefault(kDisableFusedCrossAttention, false); + use_trt_causal_attention_ = ParseEnvironmentVariableWithDefault(kEnableFusedCausalAttention, false); + } + + enable_kernel_debug_info_ = ParseEnvironmentVariableWithDefault(kEnableAttentionKernelDebugInfo, false); + + // When value is positive, we use 0 as default minimum sequence lengths to align with common usage in testing. + min_seq_len_for_flash_attention_packed_qkv_ = ParseEnvironmentVariableWithDefault( + kMinSeqLenForFlashAttentionPackedQKV, + value > 0 ? 0 : kDefaultMinSeqLenForFlashAttentionPackedQKV); + + min_seq_len_for_efficient_attention_fp32_ = ParseEnvironmentVariableWithDefault( + kMinSeqLenForEfficientAttentionFp32, + value > 0 ? 0 : kDefaultMinSeqLenForEfficientAttentionFp32); + + if (use_build_flag) { + // Some kernels can be disabled at build time. If they are disabled, we should not use them. 
+#ifndef USE_FLASH_ATTENTION + use_flash_attention_ = false; +#endif + +#ifndef USE_MEMORY_EFFICIENT_ATTENTION + use_efficient_attention_ = false; +#endif + } +} + +void AttentionKernelOptions::InitializeOnce( + int sdpa_kernel, bool use_build_flag) { + std::call_once(this->initialize_once_flag_, [&]() { + this->Initialize(sdpa_kernel, use_build_flag); + if (this->enable_kernel_debug_info_) { + this->Print(); + } + }); +} + +void AttentionKernelOptions::Print() const { + std::stringstream sstream; + sstream << "AttentionKernelOptions:"; + sstream << " FLASH_ATTENTION=" << int(use_flash_attention_); + sstream << " EFFICIENT_ATTENTION=" << int(use_efficient_attention_); + sstream << " TRT_FUSED_ATTENTION=" << int(use_trt_fused_attention_); + sstream << " CUDNN_FLASH_ATTENTION=" << int(use_cudnn_flash_attention_); + sstream << " TRT_FLASH_ATTENTION=" << int(use_trt_flash_attention_); + sstream << " TRT_CROSS_ATTENTION=" << int(use_trt_cross_attention_); + sstream << " TRT_CAUSAL_ATTENTION=" << int(use_trt_causal_attention_); + sstream << " MATH=" << int(use_unfused_); + + if (!use_unfused_) { + sstream << std::endl + << "Warning: Unfused kernel cannot be disabled right now. MATH=0 is ignored."; + } + + // Output text in Cyan color to make it easier to spot + std::cout << "\x1B[36m" << sstream.str() << "\x1B[0m" << std::endl; +} + +// Classify the kernel used in TRT fused runner. +void AttentionKernelDebugInfo::SetTrtFusedKernel(bool causal, bool enable_trt_flash_attention, int sequence_length) { + if (causal) { + use_trt_causal_attention = true; + } else if (enable_trt_flash_attention && sequence_length >= contrib::cuda::kMinSequenceLengthFlashAttention) { + use_trt_flash_attention = true; + } else { + use_trt_fused_attention = true; + } +} + +void AttentionKernelDebugInfo::Print(const char* operator_name, + const std::string& node_name, + bool is_float16, + bool is_bfloat16) const { + std::stringstream sstream; + sstream << "Operator=" << operator_name; + + if (node_name.length() > 0) { + sstream << " Node=" << node_name; + } + + if (is_bfloat16) { + sstream << " DataType=bf16"; + } else if (is_float16) { + sstream << " DataType=fp16"; + } else { + sstream << " DataType=fp32"; + } + + if (use_flash_attention.has_value() && use_flash_attention.value()) { + sstream << " FLASH_ATTENTION=" << int(use_flash_attention.value()); + } + + if (use_efficient_attention.has_value() && use_efficient_attention.value()) { + sstream << " EFFICIENT_ATTENTION=" << int(use_efficient_attention.value()); + } + + if (use_trt_fused_attention.has_value() && use_trt_fused_attention.value()) { + sstream << " TRT_FUSED_ATTENTION=" << int(use_trt_fused_attention.value()); + } + + if (use_cudnn_flash_attention.has_value() && use_cudnn_flash_attention.value()) { + sstream << " CUDNN_FLASH_ATTENTION=" << int(use_cudnn_flash_attention.value()); + } + + if (use_trt_flash_attention.has_value() && use_trt_flash_attention.value()) { + sstream << " TRT_FLASH_ATTENTION=" << int(use_trt_flash_attention.value()); + } + + if (use_trt_cross_attention.has_value() && use_trt_cross_attention.value()) { + sstream << " TRT_CROSS_ATTENTION=" << int(use_trt_cross_attention.value()); + } + + if (use_trt_causal_attention.has_value() && use_trt_causal_attention.value()) { + sstream << " TRT_CAUSAL_ATTENTION=" << int(use_trt_causal_attention.value()); + } + + bool use_fused = (use_flash_attention.has_value() && use_flash_attention.value()) || + (use_efficient_attention.has_value() && use_efficient_attention.value()) || + 
(use_trt_fused_attention.has_value() && use_trt_fused_attention.value()) || + (use_cudnn_flash_attention.has_value() && use_cudnn_flash_attention.value()) || + (use_trt_flash_attention.has_value() && use_trt_flash_attention.value()) || + (use_trt_cross_attention.has_value() && use_trt_cross_attention.value()) || + (use_trt_causal_attention.has_value() && use_trt_causal_attention.value()); + + // Fall back to unfused when no fused kernel is enabled. + if (!use_fused) { + sstream << " MATH=1"; + } + + // Output text in Cyan color to make it easier to spot. + std::cout << "\x1B[36m" << sstream.str() << "\x1B[0m" << std::endl; +} + +} // namespace onnxruntime diff --git a/onnxruntime/contrib_ops/cuda/bert/attention_kernel_options.h b/onnxruntime/contrib_ops/cuda/bert/attention_kernel_options.h new file mode 100644 index 0000000000000..bd7df5f490c76 --- /dev/null +++ b/onnxruntime/contrib_ops/cuda/bert/attention_kernel_options.h @@ -0,0 +1,67 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#pragma once +#include +#include +#include + +namespace onnxruntime { +struct AttentionKernelDebugInfo { + std::optional use_flash_attention = std::nullopt; + std::optional use_efficient_attention = std::nullopt; + std::optional use_trt_fused_attention = std::nullopt; + std::optional use_cudnn_flash_attention = std::nullopt; + std::optional use_trt_flash_attention = std::nullopt; + std::optional use_trt_cross_attention = std::nullopt; + std::optional use_trt_causal_attention = std::nullopt; + void SetTrtFusedKernel(bool causal, bool enable_trt_flash_attention, int sequence_length); + void Print(const char* operator_name, const std::string& node_name, bool is_float16, bool is_bfloat16) const; +}; + +class AttentionKernelOptions { + public: + void InitializeOnce(int sdpa_kernel, bool use_build_flag); + + bool UseFlashAttention() const { return use_flash_attention_; } + bool UseEfficientAttention() const { return use_efficient_attention_; } + bool UseTrtFusedAttention() const { return use_trt_fused_attention_; } + bool UseCudnnFlashAttention() const { return use_cudnn_flash_attention_; } + bool UseUnfusedAttention() const { return use_unfused_; } + bool UseTrtFlashAttention() const { return use_trt_flash_attention_; } + bool UseTrtCrossAttention() const { return use_trt_cross_attention_; } + bool UseTrtCausalAttention() const { return use_trt_causal_attention_; } + + bool AllowDebugInfo() const { return enable_kernel_debug_info_; } + + int MinSeqLenForFlashAttentionPackedQkv() const { return min_seq_len_for_flash_attention_packed_qkv_; } + int MinSeqLenForEfficientAttentionFp32() const { return min_seq_len_for_efficient_attention_fp32_; } + + protected: + void Print() const; + + void Initialize(int value, bool use_build_flag); + + private: + bool use_flash_attention_{true}; + bool use_efficient_attention_{true}; + bool use_trt_fused_attention_{true}; + bool use_cudnn_flash_attention_{false}; + bool use_unfused_{true}; + + bool use_trt_flash_attention_{true}; + bool use_trt_cross_attention_{true}; + + // Causal attention is disabled by default in #14732. 
+ bool use_trt_causal_attention_{false}; + + bool enable_kernel_debug_info_{false}; + + int min_seq_len_for_flash_attention_packed_qkv_{0}; + + int min_seq_len_for_efficient_attention_fp32_{0}; + + std::once_flag initialize_once_flag_; +}; + +} // namespace onnxruntime diff --git a/onnxruntime/contrib_ops/cuda/bert/group_query_attention.cc b/onnxruntime/contrib_ops/cuda/bert/group_query_attention.cc index 3b6ad238cc826..797f9b0a1ea47 100644 --- a/onnxruntime/contrib_ops/cuda/bert/group_query_attention.cc +++ b/onnxruntime/contrib_ops/cuda/bert/group_query_attention.cc @@ -52,20 +52,13 @@ GroupQueryAttention::GroupQueryAttention(const OpKernelInfo& info) rotary_interleaved_ = info.GetAttrOrDefault("rotary_interleaved", 0) == 1; scale_ = info.GetAttrOrDefault("scale", 0.0f); -#if USE_FLASH_ATTENTION - disable_flash_attention_ = sizeof(T) != 2 || - ParseEnvironmentVariableWithDefault(attention::kDisableFlashAttention, false); -#else - disable_flash_attention_ = true; -#endif + kernel_options_ = this->GetAttentionKernelOptions(); + + disable_flash_attention_ = sizeof(T) != 2 || !kernel_options_->UseFlashAttention(); -#if USE_MEMORY_EFFICIENT_ATTENTION // Memory efficient attention only supports float and float16, not bfloat16. - disable_memory_efficient_attention_ = std::is_same::value || - ParseEnvironmentVariableWithDefault(attention::kDisableMemoryEfficientAttention, false); -#else - disable_memory_efficient_attention_ = true; -#endif + disable_memory_efficient_attention_ = std::is_same::value || !kernel_options_->UseEfficientAttention(); + if (!disable_flash_attention_) { zeros_ = this->GetScratchBuffer(kZerosCount, nullptr); } @@ -161,7 +154,7 @@ Status GroupQueryAttention::ComputeInternal(OpKernelContext* context) const { !use_flash_attention && !disable_memory_efficient_attention_ && local_window_size_ == -1 && - (sizeof(T) == 2 || parameters.sequence_length >= attention::kMinSeqLenForMemoryEfficientAttentionFp32) && + (sizeof(T) == 2 || parameters.sequence_length >= this->kernel_options_->MinSeqLenForEfficientAttentionFp32()) && has_memory_efficient_attention(sm, sizeof(T) == 2, parameters.head_size, parameters.head_size); if (!use_flash_attention && !use_memory_efficient_attention && local_window_size_ != -1) { return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, @@ -201,6 +194,17 @@ Status GroupQueryAttention::ComputeInternal(OpKernelContext* context) const { auto unpacked_qkv_buffer = GetScratchBuffer(0, context->GetComputeStream()); #endif + if (kernel_options_->AllowDebugInfo()) { + AttentionKernelDebugInfo debug_info; + debug_info.use_flash_attention = use_flash_attention; + debug_info.use_efficient_attention = use_memory_efficient_attention; + + debug_info.Print("GroupQueryAttention", + this->Node().Name(), + std::is_same::value, + std::is_same::value); + } + // seqlens_k buffer size_t seqlens_k_bytes = 0; seqlens_k_bytes = sizeof(int) * parameters.batch_size; diff --git a/onnxruntime/contrib_ops/cuda/bert/group_query_attention.h b/onnxruntime/contrib_ops/cuda/bert/group_query_attention.h index 15573ece166fc..4ff5b0a59f021 100644 --- a/onnxruntime/contrib_ops/cuda/bert/group_query_attention.h +++ b/onnxruntime/contrib_ops/cuda/bert/group_query_attention.h @@ -6,6 +6,7 @@ #include #include "core/providers/cuda/cuda_kernel.h" #include "contrib_ops/cuda/bert/group_query_attention_impl.h" +#include "contrib_ops/cuda/bert/attention_kernel_options.h" namespace onnxruntime { namespace contrib { @@ -32,6 +33,7 @@ class GroupQueryAttention final : public CudaKernel { bool 
disable_memory_efficient_attention_; static constexpr int kZerosCount = 256; // In prompt case we create a zero buffer of size 256 for seqlen (assume batch_size <= 256) IAllocatorUniquePtr zeros_; + const AttentionKernelOptions* kernel_options_; }; } // namespace cuda diff --git a/onnxruntime/contrib_ops/cuda/bert/multihead_attention.cc b/onnxruntime/contrib_ops/cuda/bert/multihead_attention.cc index ba8b00df07e06..b96140f3897f9 100644 --- a/onnxruntime/contrib_ops/cuda/bert/multihead_attention.cc +++ b/onnxruntime/contrib_ops/cuda/bert/multihead_attention.cc @@ -2,7 +2,6 @@ // Licensed under the MIT License. #include "core/providers/cuda/cuda_common.h" -#include "core/platform/env_var_utils.h" #include "contrib_ops/cuda/bert/attention_impl.h" #include "contrib_ops/cuda/bert/multihead_attention.h" #include "contrib_ops/cpu/bert/multihead_attention_helper.h" @@ -47,31 +46,16 @@ MultiHeadAttention::MultiHeadAttention(const OpKernelInfo& info) is_unidirectional_ = info.GetAttrOrDefault("unidirectional", 0) == 1; ORT_ENFORCE(!is_unidirectional_, "Unidirectional MHA does not support CUDA kernel. Consider using Attention or GQA instead."); - disable_fused_self_attention_ = sizeof(T) != 2 || - ParseEnvironmentVariableWithDefault(attention::kDisableFusedSelfAttention, false); + kernel_options_ = this->GetAttentionKernelOptions(); - enable_trt_flash_attention_ = sizeof(T) == 2 && - !ParseEnvironmentVariableWithDefault(attention::kDisableTrtFlashAttention, false); + disable_fused_self_attention_ = sizeof(T) != 2 || !kernel_options_->UseTrtFusedAttention(); + enable_trt_flash_attention_ = sizeof(T) == 2 && kernel_options_->UseTrtFlashAttention(); -#if USE_FLASH_ATTENTION - disable_flash_attention_ = sizeof(T) != 2 || - ParseEnvironmentVariableWithDefault(attention::kDisableFlashAttention, false); - min_seq_len_for_flash_attention_packed_qkv_ = ParseEnvironmentVariableWithDefault( - attention::kMinSeqLenForFlashAttentionPackedQKV, - attention::kDefaultMinSeqLenForFlashAttentionPackedQKV); -#else - disable_flash_attention_ = true; - min_seq_len_for_flash_attention_packed_qkv_ = 0; -#endif + disable_flash_attention_ = sizeof(T) != 2 || !kernel_options_->UseFlashAttention(); -#if USE_MEMORY_EFFICIENT_ATTENTION - disable_memory_efficient_attention_ = ParseEnvironmentVariableWithDefault(attention::kDisableMemoryEfficientAttention, false); -#else - disable_memory_efficient_attention_ = true; -#endif + disable_memory_efficient_attention_ = !kernel_options_->UseEfficientAttention(); - disable_fused_cross_attention_ = sizeof(T) != 2 || - ParseEnvironmentVariableWithDefault(attention::kDisableFusedCrossAttention, false); + disable_fused_cross_attention_ = sizeof(T) != 2 || !kernel_options_->UseTrtCrossAttention(); // Allocate cache buffers constexpr size_t cache_bytes = sizeof(int32_t) * (static_cast(kCumulatedSequenceLengthCacheMaxBatchSize) + 1); @@ -155,7 +139,7 @@ Status MultiHeadAttention::ComputeInternal(OpKernelContext* context) const { parameters.num_heads); // When input is packed QKV format, TensorRT kernel might be faster than flash attention when sequence length <= 512. 
if (use_flash_attention && key == nullptr && value == nullptr && - parameters.sequence_length < min_seq_len_for_flash_attention_packed_qkv_) { + parameters.sequence_length < kernel_options_->MinSeqLenForFlashAttentionPackedQkv()) { use_flash_attention = false; } // Allocate buffers @@ -229,9 +213,10 @@ Status MultiHeadAttention::ComputeInternal(OpKernelContext* context) const { } #if USE_MEMORY_EFFICIENT_ATTENTION + int length_threshold = this->kernel_options_->MinSeqLenForEfficientAttentionFp32(); bool is_long_sequence = sizeof(T) == 2 || // sequence length threshold is 0 for FP16 - parameters.sequence_length >= attention::kMinSeqLenForMemoryEfficientAttentionFp32 || - parameters.kv_sequence_length >= attention::kMinSeqLenForMemoryEfficientAttentionFp32; + parameters.sequence_length >= length_threshold || + parameters.kv_sequence_length >= length_threshold; bool is_good_for_rpb = relative_position_bias != nullptr && parameters.sequence_length % (4 * sizeof(T)) == 0; @@ -249,6 +234,21 @@ Status MultiHeadAttention::ComputeInternal(OpKernelContext* context) const { constexpr bool use_memory_efficient_attention = false; #endif + if (kernel_options_->AllowDebugInfo()) { + AttentionKernelDebugInfo debug_info; + debug_info.use_flash_attention = use_flash_attention; + debug_info.use_trt_cross_attention = fused_cross_attention_kernel != nullptr; + debug_info.use_efficient_attention = use_memory_efficient_attention; + if (fused_fp16_runner_ != nullptr) { + debug_info.SetTrtFusedKernel(is_unidirectional_, enable_trt_flash_attention_, sequence_length); + } + + debug_info.Print("MultiHeadAttention", + this->Node().Name(), + std::is_same::value, + std::is_same::value); + } + // When packed kv or packed qkv is used, there is no needed for add bias transpose thus no qkv workspace. // TODO(tianleiwu): flash attention or memory efficient attention might not need qkv workspace sometime. 
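[Editorial note, not part of the patch] The fp32 sequence-length threshold used above is no longer a hard-coded constant; it now comes from kernel_options_->MinSeqLenForEfficientAttentionFp32(), which reads ORT_MIN_SEQ_LEN_EFFICIENT_ATTENTION_FP32. A minimal sketch of how an application might override that threshold and turn on the new per-node kernel debug output, assuming a POSIX environment (Windows would use _putenv_s) and that the variables are set before the session is created:

```
// Sketch: overriding attention thresholds through the environment variables
// used in this PR. Must run before the CUDA EP / session is created.
#include <cstdlib>

void ConfigureAttentionEnv() {
  // Prefer memory efficient attention for fp32 only when sequence length >= 512
  // (the default in this patch is 256).
  setenv("ORT_MIN_SEQ_LEN_EFFICIENT_ATTENTION_FP32", "512", /*overwrite*/ 1);

  // Print which attention kernel each node ends up using (AttentionKernelDebugInfo).
  setenv("ORT_ENABLE_ATTENTION_KERNEL_DEBUG_INFO", "1", /*overwrite*/ 1);
}
```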
bool no_qkv_workspace = nullptr == value && diff --git a/onnxruntime/contrib_ops/cuda/bert/multihead_attention.h b/onnxruntime/contrib_ops/cuda/bert/multihead_attention.h index 86a32c92ce003..26e38dbad9fd7 100644 --- a/onnxruntime/contrib_ops/cuda/bert/multihead_attention.h +++ b/onnxruntime/contrib_ops/cuda/bert/multihead_attention.h @@ -8,6 +8,7 @@ #include "contrib_ops/cuda/bert/tensorrt_fused_multihead_attention/mha_runner.h" #include "contrib_ops/cuda/bert/tensorrt_fused_multihead_attention/cross_attention/fmha_cross_attention.h" #include "contrib_ops/cuda/bert/attention_impl.h" +#include "contrib_ops/cuda/bert/attention_kernel_options.h" namespace onnxruntime { namespace contrib { @@ -31,12 +32,12 @@ class MultiHeadAttention final : public CudaKernel { bool disable_fused_cross_attention_; bool disable_flash_attention_; bool disable_memory_efficient_attention_; - int min_seq_len_for_flash_attention_packed_qkv_; mutable std::unique_ptr fused_fp16_runner_; mutable std::once_flag fused_fp16_runner_created_; mutable const FusedMultiHeadCrossAttentionKernel* fused_fp16_cross_attention_kernel_; mutable CumulatedSequenceLengthCache cumulated_sequence_length_q_cache_; mutable CumulatedSequenceLengthCache cumulated_sequence_length_kv_cache_; + const AttentionKernelOptions* kernel_options_; }; } // namespace cuda diff --git a/onnxruntime/contrib_ops/cuda/bert/packed_attention.cc b/onnxruntime/contrib_ops/cuda/bert/packed_attention.cc index 0146cce30c7d1..a1149ddbf99f5 100644 --- a/onnxruntime/contrib_ops/cuda/bert/packed_attention.cc +++ b/onnxruntime/contrib_ops/cuda/bert/packed_attention.cc @@ -33,12 +33,11 @@ REGISTER_KERNEL_TYPED(float) REGISTER_KERNEL_TYPED(MLFloat16) template -TrtFusedAttention::TrtFusedAttention() { - disable_fused_runner_ = sizeof(T) != 2 || - ParseEnvironmentVariableWithDefault(attention::kDisableFusedSelfAttention, false); - - enable_trt_flash_attention_ = sizeof(T) == 2 && - !ParseEnvironmentVariableWithDefault(attention::kDisableTrtFlashAttention, false); +TrtFusedAttention::TrtFusedAttention(const OpKernelInfo& info) + : CudaKernel(info) { + kernel_options_ = this->GetAttentionKernelOptions(); + disable_fused_runner_ = sizeof(T) != 2 || !kernel_options_->UseTrtFusedAttention(); + enable_trt_flash_attention_ = sizeof(T) == 2 && kernel_options_->UseTrtFlashAttention(); } template @@ -86,7 +85,8 @@ template class TrtFusedAttention; template class TrtFusedAttention; template -PackedAttention::PackedAttention(const OpKernelInfo& info) : TrtFusedAttention(), CudaKernel(info) { +PackedAttention::PackedAttention(const OpKernelInfo& info) + : TrtFusedAttention(info) { int64_t num_heads = 0; ORT_ENFORCE(info.GetAttr("num_heads", &num_heads).IsOK() && num_heads > 0); num_heads_ = static_cast(num_heads); @@ -268,7 +268,7 @@ Status PackedAttention::ComputeInternal(OpKernelContext* context) const { const Tensor* relative_position_bias = context->Input(5); PackedAttentionParameters parameters; - parameters.use_tf32 = UseTF32(); + parameters.use_tf32 = this->UseTF32(); ORT_RETURN_IF_ERROR(CheckInputs(input->Shape(), weights->Shape(), bias->Shape(), @@ -295,6 +295,19 @@ Status PackedAttention::ComputeInternal(OpKernelContext* context) const { } #endif + if (this->kernel_options_->AllowDebugInfo()) { + AttentionKernelDebugInfo debug_info; + debug_info.use_efficient_attention = use_memory_efficient_attention; + if (fused_runner != nullptr) { + debug_info.SetTrtFusedKernel(false /*causal*/, this->enable_trt_flash_attention_, parameters.sequence_length); + } + + 
debug_info.Print("PackedAttention", + this->Node().Name(), + std::is_same::value, + std::is_same::value); + } + typedef typename ToCudaType::MappedType CudaT; CudaT one = ToCudaType::FromFloat(1.0f); CudaT zero = ToCudaType::FromFloat(0.0f); @@ -313,7 +326,7 @@ Status PackedAttention::ComputeInternal(OpKernelContext* context) const { cublas, CUBLAS_OP_N, CUBLAS_OP_N, n, m, k, &one, reinterpret_cast(weights->Data()), n, reinterpret_cast(input->Data()), k, - &zero, reinterpret_cast(gemm_buffer.get()), n, device_prop, UseTF32())); + &zero, reinterpret_cast(gemm_buffer.get()), n, device_prop, this->UseTF32())); constexpr size_t element_size = sizeof(T); constexpr bool no_qkv_workspace = false; // need workspace to add bias @@ -341,7 +354,7 @@ Status PackedAttention::ComputeInternal(OpKernelContext* context) const { data.fused_runner = reinterpret_cast(fused_runner); data.use_memory_efficient_attention = use_memory_efficient_attention; - return QkvToContext(device_prop, cublas, Stream(context), parameters, data); + return QkvToContext(device_prop, cublas, this->Stream(context), parameters, data); } } // namespace cuda diff --git a/onnxruntime/contrib_ops/cuda/bert/packed_attention.h b/onnxruntime/contrib_ops/cuda/bert/packed_attention.h index f00c112fc73d2..67b420764169a 100644 --- a/onnxruntime/contrib_ops/cuda/bert/packed_attention.h +++ b/onnxruntime/contrib_ops/cuda/bert/packed_attention.h @@ -9,6 +9,7 @@ #include "core/providers/cuda/cuda_kernel.h" #include "contrib_ops/cuda/bert/tensorrt_fused_multihead_attention/mha_runner.h" #include "contrib_ops/cpu/bert/attention_common.h" +#include "contrib_ops/cuda/bert/attention_kernel_options.h" namespace onnxruntime { namespace contrib { @@ -17,14 +18,16 @@ namespace cuda { using namespace onnxruntime::cuda; template -class TrtFusedAttention { +class TrtFusedAttention : public CudaKernel { public: - TrtFusedAttention(); + TrtFusedAttention(const OpKernelInfo& info); protected: MHARunner* GetFusedRunner(const cudaDeviceProp& device_prop, const PackedAttentionParameters& parameters) const; protected: + const AttentionKernelOptions* kernel_options_; + bool disable_fused_runner_; bool enable_trt_flash_attention_; mutable std::unique_ptr fused_fp16_runner_; @@ -32,7 +35,7 @@ class TrtFusedAttention { }; template -class PackedAttention final : public TrtFusedAttention, public CudaKernel { +class PackedAttention final : public TrtFusedAttention { public: PackedAttention(const OpKernelInfo& info); Status ComputeInternal(OpKernelContext* context) const override; diff --git a/onnxruntime/contrib_ops/cuda/bert/packed_multihead_attention.cc b/onnxruntime/contrib_ops/cuda/bert/packed_multihead_attention.cc index 3fbbafc01254e..53e96fc732a33 100644 --- a/onnxruntime/contrib_ops/cuda/bert/packed_multihead_attention.cc +++ b/onnxruntime/contrib_ops/cuda/bert/packed_multihead_attention.cc @@ -35,30 +35,16 @@ REGISTER_KERNEL_TYPED(MLFloat16) template PackedMultiHeadAttention::PackedMultiHeadAttention(const OpKernelInfo& info) - : TrtFusedAttention(), CudaKernel(info) { + : TrtFusedAttention(info) { int64_t num_heads = 0; ORT_ENFORCE(info.GetAttr("num_heads", &num_heads).IsOK() && num_heads > 0); num_heads_ = static_cast(num_heads); scale_ = info.GetAttrOrDefault("scale", 0.0f); -#if USE_FLASH_ATTENTION - disable_flash_attention_ = sizeof(T) != 2 || onnxruntime::ParseEnvironmentVariableWithDefault( - attention::kDisableFlashAttention, false); - min_seq_len_for_flash_attention_packed_qkv_ = ParseEnvironmentVariableWithDefault( - 
attention::kMinSeqLenForFlashAttentionPackedQKV, - attention::kDefaultMinSeqLenForFlashAttentionPackedQKV); -#else - disable_flash_attention_ = true; - min_seq_len_for_flash_attention_packed_qkv_ = 0; -#endif + disable_flash_attention_ = sizeof(T) != 2 || !this->kernel_options_->UseFlashAttention(); -#if USE_MEMORY_EFFICIENT_ATTENTION - disable_memory_efficient_attention_ = onnxruntime::ParseEnvironmentVariableWithDefault( - attention::kDisableMemoryEfficientAttention, false); -#else - disable_memory_efficient_attention_ = true; -#endif + disable_memory_efficient_attention_ = !this->kernel_options_->UseEfficientAttention(); } template @@ -228,7 +214,7 @@ Status PackedMultiHeadAttention::ComputeInternal(OpKernelContext* context) co const Tensor* relative_position_bias = context->Input(6); PackedAttentionParameters parameters; - parameters.use_tf32 = UseTF32(); + parameters.use_tf32 = this->UseTF32(); ORT_RETURN_IF_ERROR(CheckInputs(query->Shape(), key, value, @@ -255,7 +241,7 @@ Status PackedMultiHeadAttention::ComputeInternal(OpKernelContext* context) co // When input is packed QKV format, TensorRT kernel might be faster when sequence length <= 512. if (use_flash_attention && key == nullptr && value == nullptr && - parameters.sequence_length < min_seq_len_for_flash_attention_packed_qkv_) { + parameters.sequence_length < this->kernel_options_->MinSeqLenForFlashAttentionPackedQkv()) { use_flash_attention = false; } } @@ -271,11 +257,25 @@ Status PackedMultiHeadAttention::ComputeInternal(OpKernelContext* context) co bool is_good_for_rpb = !parameters.has_relative_position_bias || parameters.sequence_length % (4 * sizeof(T)) == 0; use_memory_efficient_attention = is_good_for_rpb && - (sizeof(T) == 2 || parameters.sequence_length >= attention::kMinSeqLenForMemoryEfficientAttentionFp32) && + (sizeof(T) == 2 || parameters.sequence_length >= this->kernel_options_->MinSeqLenForEfficientAttentionFp32()) && has_memory_efficient_attention(sm, sizeof(T) == 2, parameters.head_size, parameters.v_head_size); } #endif + if (this->kernel_options_->AllowDebugInfo()) { + AttentionKernelDebugInfo debug_info; + debug_info.use_flash_attention = use_flash_attention; + debug_info.use_efficient_attention = use_memory_efficient_attention; + if (fused_runner != nullptr) { + debug_info.SetTrtFusedKernel(false /*causal*/, this->enable_trt_flash_attention_, parameters.sequence_length); + } + + debug_info.Print("PackedMultiHeadAttention", + this->Node().Name(), + std::is_same::value, + std::is_same::value); + } + typedef typename ToCudaType::MappedType CudaT; cublasHandle_t cublas = this->GetCublasHandle(context); diff --git a/onnxruntime/contrib_ops/cuda/bert/packed_multihead_attention.h b/onnxruntime/contrib_ops/cuda/bert/packed_multihead_attention.h index e30c603dc30aa..9b52a70fc6181 100644 --- a/onnxruntime/contrib_ops/cuda/bert/packed_multihead_attention.h +++ b/onnxruntime/contrib_ops/cuda/bert/packed_multihead_attention.h @@ -4,13 +4,14 @@ #pragma once #include "contrib_ops/cuda/bert/packed_attention.h" +#include "contrib_ops/cuda/bert/attention_kernel_options.h" namespace onnxruntime { namespace contrib { namespace cuda { template -class PackedMultiHeadAttention final : public TrtFusedAttention, public CudaKernel { +class PackedMultiHeadAttention final : public TrtFusedAttention { public: PackedMultiHeadAttention(const OpKernelInfo& info); Status ComputeInternal(OpKernelContext* context) const override; @@ -32,7 +33,6 @@ class PackedMultiHeadAttention final : public TrtFusedAttention, public CudaK bool 
disable_memory_efficient_attention_; bool disable_flash_attention_; - int min_seq_len_for_flash_attention_packed_qkv_; }; } // namespace cuda diff --git a/onnxruntime/core/providers/cuda/cuda_execution_provider.h b/onnxruntime/core/providers/cuda/cuda_execution_provider.h index f53779058a8af..9c8a8712ca51c 100644 --- a/onnxruntime/core/providers/cuda/cuda_execution_provider.h +++ b/onnxruntime/core/providers/cuda/cuda_execution_provider.h @@ -17,6 +17,10 @@ #include "core/providers/cuda/shared_inc/cuda_call.h" #include "core/providers/cuda/tunable/cuda_tuning_context.h" +#ifndef DISABLE_CONTRIB_OPS +#include "contrib_ops/cuda/bert/attention_kernel_options.h" +#endif + namespace onnxruntime { void RunOnUnload(std::function function); @@ -80,6 +84,14 @@ class CUDAExecutionProvider : public IExecutionProvider { bool IsNHWCPreferred() const { return info_.prefer_nhwc; } bool UseTF32() const { return info_.use_tf32; } +#ifndef DISABLE_CONTRIB_OPS + // Attention kernel options parsed from sdpa_kernel cuda provider option. + const AttentionKernelOptions* GetAttentionKernelOptions() const { + attention_kernel_options_.InitializeOnce(info_.sdpa_kernel, true); + return &attention_kernel_options_; + } +#endif + ProviderOptions GetProviderOptions() const override { return CUDAExecutionProviderInfo::ToProviderOptions(info_); } @@ -110,6 +122,11 @@ class CUDAExecutionProvider : public IExecutionProvider { // the tuning context might be altered when calling into a TunableOp mutable cuda::tunable::CudaTuningContext tuning_context_; +#ifndef DISABLE_CONTRIB_OPS + // Attention kernel options parsed from sdpa_kernel cuda provider option. + mutable AttentionKernelOptions attention_kernel_options_; +#endif + class PerThreadContext final { public: PerThreadContext(OrtDevice::DeviceId device_id, cudaStream_t stream, size_t cuda_mem_limit, ArenaExtendStrategy arena_extend_strategy, diff --git a/onnxruntime/core/providers/cuda/cuda_execution_provider_info.cc b/onnxruntime/core/providers/cuda/cuda_execution_provider_info.cc index c96381e3e68b1..31cf991a34fc9 100644 --- a/onnxruntime/core/providers/cuda/cuda_execution_provider_info.cc +++ b/onnxruntime/core/providers/cuda/cuda_execution_provider_info.cc @@ -34,6 +34,7 @@ constexpr const char* kEnableSkipLayerNormStrictMode = "enable_skip_layer_norm_s constexpr const char* kPreferNHWCMode = "prefer_nhwc"; constexpr const char* kUseEPLevelUnifiedStream = "use_ep_level_unified_stream"; constexpr const char* kUseTF32 = "use_tf32"; +constexpr const char* kSdpaKernel = "sdpa_kernel"; } // namespace provider_option_names } // namespace cuda @@ -117,6 +118,7 @@ CUDAExecutionProviderInfo CUDAExecutionProviderInfo::FromProviderOptions(const P .AddAssignmentToReference(cuda::provider_option_names::kPreferNHWCMode, info.prefer_nhwc) .AddAssignmentToReference(cuda::provider_option_names::kUseEPLevelUnifiedStream, info.use_ep_level_unified_stream) .AddAssignmentToReference(cuda::provider_option_names::kUseTF32, info.use_tf32) + .AddAssignmentToReference(cuda::provider_option_names::kSdpaKernel, info.sdpa_kernel) .AddValueParser( cuda::provider_option_names::kTunableOpEnable, [&info](const std::string& value_str) -> Status { @@ -170,6 +172,7 @@ ProviderOptions CUDAExecutionProviderInfo::ToProviderOptions(const CUDAExecution {cuda::provider_option_names::kPreferNHWCMode, MakeStringWithClassicLocale(info.prefer_nhwc)}, {cuda::provider_option_names::kUseEPLevelUnifiedStream, MakeStringWithClassicLocale(info.use_ep_level_unified_stream)}, {cuda::provider_option_names::kUseTF32, 
MakeStringWithClassicLocale(info.use_tf32)}, + {cuda::provider_option_names::kSdpaKernel, MakeStringWithClassicLocale(info.sdpa_kernel)}, }; return options; @@ -192,6 +195,7 @@ ProviderOptions CUDAExecutionProviderInfo::ToProviderOptions(const OrtCUDAProvid {cuda::provider_option_names::kPreferNHWCMode, MakeStringWithClassicLocale(info.prefer_nhwc)}, {cuda::provider_option_names::kUseEPLevelUnifiedStream, MakeStringWithClassicLocale(info.use_ep_level_unified_stream)}, {cuda::provider_option_names::kUseTF32, MakeStringWithClassicLocale(info.use_tf32)}, + {cuda::provider_option_names::kSdpaKernel, MakeStringWithClassicLocale(info.sdpa_kernel)}, }; return options; diff --git a/onnxruntime/core/providers/cuda/cuda_execution_provider_info.h b/onnxruntime/core/providers/cuda/cuda_execution_provider_info.h index 1cac3d1513698..0efad80f743df 100644 --- a/onnxruntime/core/providers/cuda/cuda_execution_provider_info.h +++ b/onnxruntime/core/providers/cuda/cuda_execution_provider_info.h @@ -79,6 +79,8 @@ struct CUDAExecutionProviderInfo { // By default, enable TF32 to speed up float GEMM/MatMul or cuDNN convolution of float matrices. bool use_tf32{true}; + int sdpa_kernel{0}; + static CUDAExecutionProviderInfo FromProviderOptions(const ProviderOptions& options); static ProviderOptions ToProviderOptions(const CUDAExecutionProviderInfo& info); static ProviderOptions ToProviderOptions(const OrtCUDAProviderOptionsV2& info); @@ -91,6 +93,7 @@ struct std::hash<::onnxruntime::CUDAExecutionProviderInfo> { size_t value{0xbc9f1d34}; // seed // Bits: device_id (16), arena_extend_strategy/cudnn_conv_algo_search (reserved 2), boolean options (1 each) + // Do not exceed 32 bits here otherwise some bits will be lost in x86. size_t data = static_cast(info.device_id) ^ (static_cast(info.arena_extend_strategy) << 16) ^ (static_cast(info.cudnn_conv_algo_search) << 18) ^ @@ -109,6 +112,7 @@ struct std::hash<::onnxruntime::CUDAExecutionProviderInfo> { onnxruntime::HashCombine(info.gpu_mem_limit, value); onnxruntime::HashCombine(info.tunable_op.max_tuning_duration_ms, value); + onnxruntime::HashCombine(info.sdpa_kernel, value); // Memory pointers onnxruntime::HashCombine(reinterpret_cast(info.user_compute_stream), value); diff --git a/onnxruntime/core/providers/cuda/cuda_kernel.h b/onnxruntime/core/providers/cuda/cuda_kernel.h index 288da23f35ec8..9d37a9775872f 100644 --- a/onnxruntime/core/providers/cuda/cuda_kernel.h +++ b/onnxruntime/core/providers/cuda/cuda_kernel.h @@ -94,6 +94,12 @@ class CudaKernel : public OpKernel { return provider_->UseTF32(); } +#ifndef DISABLE_CONTRIB_OPS + const AttentionKernelOptions* GetAttentionKernelOptions() const { + return provider_->GetAttentionKernelOptions(); + } +#endif + tunable::CudaTuningContext* GetTuningContext() const { return static_cast(provider_->GetTuningContext()); } diff --git a/onnxruntime/core/providers/cuda/cuda_provider_factory.cc b/onnxruntime/core/providers/cuda/cuda_provider_factory.cc index 7851da7fa91a3..b1d54e56ded4e 100644 --- a/onnxruntime/core/providers/cuda/cuda_provider_factory.cc +++ b/onnxruntime/core/providers/cuda/cuda_provider_factory.cc @@ -226,6 +226,7 @@ struct CUDA_Provider : Provider { info.enable_skip_layer_norm_strict_mode = params->enable_skip_layer_norm_strict_mode != 0; info.use_ep_level_unified_stream = params->use_ep_level_unified_stream != 0; info.use_tf32 = params->use_tf32 != 0; + info.sdpa_kernel = params->sdpa_kernel; return std::make_shared(info); } @@ -260,6 +261,7 @@ struct CUDA_Provider : Provider { cuda_options.prefer_nhwc = 
internal_options.prefer_nhwc; cuda_options.use_ep_level_unified_stream = internal_options.use_ep_level_unified_stream; cuda_options.use_tf32 = internal_options.use_tf32; + cuda_options.sdpa_kernel = internal_options.sdpa_kernel; } ProviderOptions GetProviderOptions(const void* provider_options) override { diff --git a/onnxruntime/test/contrib_ops/multihead_attention_op_test.cc b/onnxruntime/test/contrib_ops/multihead_attention_op_test.cc index a61e917b41e51..f0255d7ece84e 100644 --- a/onnxruntime/test/contrib_ops/multihead_attention_op_test.cc +++ b/onnxruntime/test/contrib_ops/multihead_attention_op_test.cc @@ -394,8 +394,8 @@ static void RunMultiHeadAttentionTests(AttentionTestData& data, bool disable_cpu } #if USE_MEMORY_EFFICIENT_ATTENTION - if (data.sequence_length >= contrib::attention::kMinSeqLenForMemoryEfficientAttentionFp32 || - data.kv_sequence_length >= contrib::attention::kMinSeqLenForMemoryEfficientAttentionFp32) { + if (data.sequence_length >= contrib::attention::kDefaultMinSeqLenForEfficientAttentionFp32 || + data.kv_sequence_length >= contrib::attention::kDefaultMinSeqLenForEfficientAttentionFp32) { kernel_type = AttentionKernelType::AttentionKernel_CutlassMemoryEfficientAttention; if (!SkipAttentionKernel(data, kernel_type)) { RunMultiHeadAttentionKernel( diff --git a/onnxruntime/test/providers/cuda/test_cases/attention_kernel_options_test.cc b/onnxruntime/test/providers/cuda/test_cases/attention_kernel_options_test.cc new file mode 100644 index 0000000000000..b2e986f680763 --- /dev/null +++ b/onnxruntime/test/providers/cuda/test_cases/attention_kernel_options_test.cc @@ -0,0 +1,221 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#ifndef DISABLE_CONTRIB_OPS + +#include "contrib_ops/cuda/bert/attention_kernel_options.h" +#include "contrib_ops/cpu/bert/attention_common.h" +#include "test/util/include/scoped_env_vars.h" +#include "gtest/gtest.h" + +#include +#include + +using onnxruntime::AttentionKernelOptions; +using onnxruntime::contrib::attention::AttentionBackend; + +namespace onnxruntime { +namespace test { + +TEST(AttentionKernelOptionsTest, NonZeroValue) { + { + AttentionKernelOptions options; + int value = static_cast(AttentionBackend::FLASH_ATTENTION) | static_cast(AttentionBackend::EFFICIENT_ATTENTION); + options.InitializeOnce(value, false); + ASSERT_TRUE(options.UseFlashAttention()); + ASSERT_TRUE(options.UseEfficientAttention()); + ASSERT_FALSE(options.UseTrtFusedAttention()); + ASSERT_FALSE(options.UseCudnnFlashAttention()); + ASSERT_FALSE(options.UseUnfusedAttention()); + ASSERT_FALSE(options.UseTrtFlashAttention()); + ASSERT_FALSE(options.UseTrtCrossAttention()); + ASSERT_FALSE(options.UseTrtCausalAttention()); + EXPECT_EQ(options.MinSeqLenForFlashAttentionPackedQkv(), 0); + EXPECT_EQ(options.MinSeqLenForEfficientAttentionFp32(), 0); + } + + { + AttentionKernelOptions options; + int value = static_cast(AttentionBackend::TRT_FUSED_ATTENTION) | static_cast(AttentionBackend::MATH); + options.InitializeOnce(value, false); + ASSERT_FALSE(options.UseFlashAttention()); + ASSERT_FALSE(options.UseEfficientAttention()); + ASSERT_TRUE(options.UseTrtFusedAttention()); + ASSERT_FALSE(options.UseCudnnFlashAttention()); + ASSERT_TRUE(options.UseUnfusedAttention()); + ASSERT_FALSE(options.UseTrtFlashAttention()); + ASSERT_FALSE(options.UseTrtCrossAttention()); + ASSERT_FALSE(options.UseTrtCausalAttention()); + EXPECT_EQ(options.MinSeqLenForFlashAttentionPackedQkv(), 0); + 
EXPECT_EQ(options.MinSeqLenForEfficientAttentionFp32(), 0); + } + + { + AttentionKernelOptions options; + int value = static_cast(AttentionBackend::CUDNN_FLASH_ATTENTION); + options.InitializeOnce(value, false); + ASSERT_FALSE(options.UseFlashAttention()); + ASSERT_FALSE(options.UseEfficientAttention()); + ASSERT_FALSE(options.UseTrtFusedAttention()); + ASSERT_TRUE(options.UseCudnnFlashAttention()); + ASSERT_FALSE(options.UseUnfusedAttention()); + ASSERT_FALSE(options.UseTrtFlashAttention()); + ASSERT_FALSE(options.UseTrtCrossAttention()); + ASSERT_FALSE(options.UseTrtCausalAttention()); + EXPECT_EQ(options.MinSeqLenForFlashAttentionPackedQkv(), 0); + EXPECT_EQ(options.MinSeqLenForEfficientAttentionFp32(), 0); + } + + { + AttentionKernelOptions options; + int value = static_cast(AttentionBackend::TRT_FLASH_ATTENTION); + options.InitializeOnce(value, false); + ASSERT_FALSE(options.UseFlashAttention()); + ASSERT_FALSE(options.UseEfficientAttention()); + ASSERT_FALSE(options.UseTrtFusedAttention()); + ASSERT_FALSE(options.UseCudnnFlashAttention()); + ASSERT_FALSE(options.UseUnfusedAttention()); + ASSERT_TRUE(options.UseTrtFlashAttention()); + ASSERT_FALSE(options.UseTrtCrossAttention()); + ASSERT_FALSE(options.UseTrtCausalAttention()); + EXPECT_EQ(options.MinSeqLenForFlashAttentionPackedQkv(), 0); + EXPECT_EQ(options.MinSeqLenForEfficientAttentionFp32(), 0); + } + + { + AttentionKernelOptions options; + int value = static_cast(AttentionBackend::TRT_CROSS_ATTENTION) | static_cast(AttentionBackend::TRT_CAUSAL_ATTENTION); + options.InitializeOnce(value, false); + ASSERT_FALSE(options.UseFlashAttention()); + ASSERT_FALSE(options.UseEfficientAttention()); + ASSERT_FALSE(options.UseTrtFusedAttention()); + ASSERT_FALSE(options.UseCudnnFlashAttention()); + ASSERT_FALSE(options.UseUnfusedAttention()); + ASSERT_FALSE(options.UseTrtFlashAttention()); + ASSERT_TRUE(options.UseTrtCrossAttention()); + ASSERT_TRUE(options.UseTrtCausalAttention()); + EXPECT_EQ(options.MinSeqLenForFlashAttentionPackedQkv(), 0); + EXPECT_EQ(options.MinSeqLenForEfficientAttentionFp32(), 0); + } + + // Test environment variables are ignored when option value is non-zero + // Test default min sequence lengths are zeros + { + ScopedEnvironmentVariables scoped_env_vars{ + EnvVarMap{ + {onnxruntime::contrib::attention::kDisableFlashAttention, "0"}, + {onnxruntime::contrib::attention::kDisableTrtFlashAttention, "0"}, + {onnxruntime::contrib::attention::kDisableFusedSelfAttention, "0"}, + {onnxruntime::contrib::attention::kEnableCudnnFlashAttention, "1"}, + {onnxruntime::contrib::attention::kDisableFusedCrossAttention, "0"}, + {onnxruntime::contrib::attention::kDisableMemoryEfficientAttention, "0"}, + {onnxruntime::contrib::attention::kEnableFusedCausalAttention, "1"}, + {onnxruntime::contrib::attention::kEnableFusedCausalAttention, "1"}}}; + AttentionKernelOptions options; + int value = static_cast(AttentionBackend::FLASH_ATTENTION); + options.InitializeOnce(value, false); + ASSERT_TRUE(options.UseFlashAttention()); + ASSERT_FALSE(options.UseEfficientAttention()); + ASSERT_FALSE(options.UseTrtFusedAttention()); + ASSERT_FALSE(options.UseCudnnFlashAttention()); + ASSERT_FALSE(options.UseUnfusedAttention()); + ASSERT_FALSE(options.UseTrtFlashAttention()); + ASSERT_FALSE(options.UseTrtCrossAttention()); + ASSERT_FALSE(options.UseTrtCausalAttention()); + EXPECT_EQ(options.MinSeqLenForFlashAttentionPackedQkv(), 0); + EXPECT_EQ(options.MinSeqLenForEfficientAttentionFp32(), 0); + } + + // Test min sequence lengths can be parsed from 
environment variables when option value is non-zero + { + ScopedEnvironmentVariables scoped_env_vars{ + EnvVarMap{ + {onnxruntime::contrib::attention::kDisableFlashAttention, "1"}, + {onnxruntime::contrib::attention::kDisableTrtFlashAttention, "1"}, + {onnxruntime::contrib::attention::kDisableFusedSelfAttention, "1"}, + {onnxruntime::contrib::attention::kDisableFusedCrossAttention, "1"}, + {onnxruntime::contrib::attention::kDisableMemoryEfficientAttention, "1"}, + {onnxruntime::contrib::attention::kEnableFusedCausalAttention, "0"}, + {onnxruntime::contrib::attention::kEnableFusedCausalAttention, "0"}, + {onnxruntime::contrib::attention::kMinSeqLenForFlashAttentionPackedQKV, "128"}, + {onnxruntime::contrib::attention::kMinSeqLenForEfficientAttentionFp32, "256"}}}; + AttentionKernelOptions options; + int value = static_cast(AttentionBackend::FLASH_ATTENTION); + options.InitializeOnce(value, false); + ASSERT_TRUE(options.UseFlashAttention()); + ASSERT_FALSE(options.UseEfficientAttention()); + ASSERT_FALSE(options.UseTrtFusedAttention()); + ASSERT_FALSE(options.UseCudnnFlashAttention()); + ASSERT_FALSE(options.UseUnfusedAttention()); + ASSERT_FALSE(options.UseTrtFlashAttention()); + ASSERT_FALSE(options.UseTrtCrossAttention()); + ASSERT_FALSE(options.UseTrtCausalAttention()); + EXPECT_EQ(options.MinSeqLenForFlashAttentionPackedQkv(), 128); + EXPECT_EQ(options.MinSeqLenForEfficientAttentionFp32(), 256); + } +} + +// Test all environment variables take effect when option value is 0. +TEST(AttentionKernelOptionsTest, DefaultOptionWithEnvVar) { + constexpr int value = 0; + ScopedEnvironmentVariables scoped_env_vars{ + EnvVarMap{ + {onnxruntime::contrib::attention::kDisableFlashAttention, "0"}, + {onnxruntime::contrib::attention::kDisableTrtFlashAttention, "0"}, + {onnxruntime::contrib::attention::kDisableFusedSelfAttention, "0"}, + {onnxruntime::contrib::attention::kEnableCudnnFlashAttention, "1"}, + {onnxruntime::contrib::attention::kDisableFusedCrossAttention, "0"}, + {onnxruntime::contrib::attention::kDisableMemoryEfficientAttention, "0"}, + {onnxruntime::contrib::attention::kEnableFusedCausalAttention, "1"}, + {onnxruntime::contrib::attention::kEnableFusedCausalAttention, "1"}, + {onnxruntime::contrib::attention::kMinSeqLenForFlashAttentionPackedQKV, "128"}, + {onnxruntime::contrib::attention::kMinSeqLenForEfficientAttentionFp32, "256"}}}; + AttentionKernelOptions options; + options.InitializeOnce(value, false); + ASSERT_TRUE(options.UseFlashAttention()); + ASSERT_TRUE(options.UseEfficientAttention()); + ASSERT_TRUE(options.UseTrtFusedAttention()); + ASSERT_TRUE(options.UseCudnnFlashAttention()); + ASSERT_TRUE(options.UseUnfusedAttention()); + ASSERT_TRUE(options.UseTrtFlashAttention()); + ASSERT_TRUE(options.UseTrtCrossAttention()); + ASSERT_TRUE(options.UseTrtCausalAttention()); + ASSERT_TRUE(options.UseTrtCausalAttention()); + EXPECT_EQ(options.MinSeqLenForFlashAttentionPackedQkv(), 128); + EXPECT_EQ(options.MinSeqLenForEfficientAttentionFp32(), 256); +} + +// Test default min sequence lengths when environment variables are not set. 
+TEST(AttentionKernelOptionsTest, DefaultMinSeqLens) { + constexpr int value = 0; + ScopedEnvironmentVariables scoped_env_vars{ + EnvVarMap{ + {onnxruntime::contrib::attention::kDisableFlashAttention, "1"}, + {onnxruntime::contrib::attention::kDisableTrtFlashAttention, "1"}, + {onnxruntime::contrib::attention::kDisableFusedSelfAttention, "1"}, + {onnxruntime::contrib::attention::kDisableFusedCrossAttention, "1"}, + {onnxruntime::contrib::attention::kEnableCudnnFlashAttention, "0"}, + {onnxruntime::contrib::attention::kDisableMemoryEfficientAttention, "1"}, + {onnxruntime::contrib::attention::kEnableFusedCausalAttention, "0"}, + {onnxruntime::contrib::attention::kEnableFusedCausalAttention, "0"}}}; + AttentionKernelOptions options; + options.InitializeOnce(value, false); + ASSERT_FALSE(options.UseFlashAttention()); + ASSERT_FALSE(options.UseEfficientAttention()); + ASSERT_FALSE(options.UseTrtFusedAttention()); + ASSERT_FALSE(options.UseCudnnFlashAttention()); + ASSERT_TRUE(options.UseUnfusedAttention()); + ASSERT_FALSE(options.UseTrtFlashAttention()); + ASSERT_FALSE(options.UseTrtCrossAttention()); + ASSERT_FALSE(options.UseTrtCausalAttention()); + ASSERT_FALSE(options.UseTrtCausalAttention()); + EXPECT_EQ(options.MinSeqLenForFlashAttentionPackedQkv(), + onnxruntime::contrib::attention::kDefaultMinSeqLenForFlashAttentionPackedQKV); + EXPECT_EQ(options.MinSeqLenForEfficientAttentionFp32(), + onnxruntime::contrib::attention::kDefaultMinSeqLenForEfficientAttentionFp32); +} + +} // namespace test +} // namespace onnxruntime + +#endif diff --git a/onnxruntime/test/python/onnxruntime_test_python.py b/onnxruntime/test/python/onnxruntime_test_python.py index e4814aa7fc033..892e7de8bb6ed 100644 --- a/onnxruntime/test/python/onnxruntime_test_python.py +++ b/onnxruntime/test/python/onnxruntime_test_python.py @@ -446,6 +446,8 @@ def test_get_and_set_option_with_values(option_name, option_values): test_get_and_set_option_with_values("use_tf32", ["1", "0"]) + test_get_and_set_option_with_values("sdpa_kernel", ["0", "1", "2"]) + option["gpu_external_alloc"] = "0" option["gpu_external_free"] = "0" option["gpu_external_empty_cache"] = "0" From 34cd2e8ed8688c42d00adda1d260ac787f76bf29 Mon Sep 17 00:00:00 2001 From: Scott McKay Date: Sat, 20 Jul 2024 09:35:05 +1000 Subject: [PATCH 25/35] Add CoreML ML Program Resize (#21370) ### Description Add CoreML ML Program Resize - refactor existing logic to try and simplify and share between NeuralNetwork and MLProgram checks - add handling for some new attributes - antialias and axes - should have been done when setting the CoreML EP max opset to 21 ### Motivation and Context Support priority models --- .../core/providers/coreml/builders/helper.cc | 18 +- .../core/providers/coreml/builders/helper.h | 3 +- .../coreml/builders/impl/base_op_builder.cc | 2 +- .../coreml/builders/impl/base_op_builder.h | 6 + .../coreml/builders/impl/resize_op_builder.cc | 607 +++++++++++++----- .../providers/coreml/builders/model_builder.h | 13 +- .../coreml/coreml_execution_provider.cc | 1 + .../builders/impl/resize_op_builder.cc | 25 +- onnxruntime/core/providers/utils.cc | 16 + onnxruntime/core/providers/utils.h | 5 + .../core/providers/xnnpack/tensor/resize.cc | 21 +- .../providers/cpu/tensor/resize_op_test.cc | 152 ++++- .../apple/coreml_supported_mlprogram_ops.md | 1 + 13 files changed, 671 insertions(+), 199 deletions(-) diff --git a/onnxruntime/core/providers/coreml/builders/helper.cc b/onnxruntime/core/providers/coreml/builders/helper.cc index b8ebbd05a2a20..e1f148fa93e23 100644 
--- a/onnxruntime/core/providers/coreml/builders/helper.cc +++ b/onnxruntime/core/providers/coreml/builders/helper.cc @@ -50,8 +50,8 @@ bool IsNodeSupported(const Node& node, const OpBuilderInputParams& input_params, } } -bool IsInputSupported(const Node& node, const NodeArg& input, - const OpBuilderInputParams& input_params, const logging::Logger& logger) { +bool IsInputSupported(const Node& node, const NodeArg& input, const OpBuilderInputParams& input_params, + const logging::Logger& logger, bool allow_empty_input) { if (!input.Exists()) { // optional input that is not provided return true; @@ -84,16 +84,10 @@ bool IsInputSupported(const Node& node, const NodeArg& input, return false; } - if (dim == 0) { - if (node.OpType() == "Resize" && &input == node.InputDefs()[1]) { - // one special case. Resize 'roi' input was originally a required input but is rarely used. - // ROI is not supported in the CoreML implementation so we will ignore the value, but is often added - // (at least in the unit tests) as an initializer with shape {0}. - } else { - LOGS(logger, WARNING) << "CoreML does not support shapes with dimension values of 0. Input:" << input_name - << ", shape: " << Shape2String(shape); - return false; - } + if (dim == 0 && !allow_empty_input) { + LOGS(logger, WARNING) << "CoreML does not support shapes with dimension values of 0. Input:" << input_name + << ", shape: " << Shape2String(shape); + return false; } } diff --git a/onnxruntime/core/providers/coreml/builders/helper.h b/onnxruntime/core/providers/coreml/builders/helper.h index 300de2dedd122..0acaa0dd8a4a3 100644 --- a/onnxruntime/core/providers/coreml/builders/helper.h +++ b/onnxruntime/core/providers/coreml/builders/helper.h @@ -30,7 +30,8 @@ OpBuilderInputParams MakeOpBuilderParams(const GraphViewer& graph_viewer, const IOpBuilder* GetOpBuilder(const Node& node); bool IsInputSupported(const Node& node, const NodeArg& node_arg, const OpBuilderInputParams& input_params, - const logging::Logger& logger); + const logging::Logger& logger, + bool allow_empty_input = false); bool IsNodeSupported(const Node& node, const OpBuilderInputParams& input_params, const logging::Logger& logger); diff --git a/onnxruntime/core/providers/coreml/builders/impl/base_op_builder.cc b/onnxruntime/core/providers/coreml/builders/impl/base_op_builder.cc index 83a572f4b60fa..2cae85a0a1c8d 100644 --- a/onnxruntime/core/providers/coreml/builders/impl/base_op_builder.cc +++ b/onnxruntime/core/providers/coreml/builders/impl/base_op_builder.cc @@ -74,7 +74,7 @@ bool BaseOpBuilder::IsOpSupported(const Node& node, const OpBuilderInputParams& bool BaseOpBuilder::HasSupportedInputs(const Node& node, const OpBuilderInputParams& input_params, const logging::Logger& logger) const { for (const auto* input : node.InputDefs()) { - if (!IsInputSupported(node, *input, input_params, logger)) { + if (!IsInputSupported(node, *input, input_params, logger, allow_empty_tensor_as_input_)) { return false; } } diff --git a/onnxruntime/core/providers/coreml/builders/impl/base_op_builder.h b/onnxruntime/core/providers/coreml/builders/impl/base_op_builder.h index 4a23640d0f34c..071008520fbdc 100644 --- a/onnxruntime/core/providers/coreml/builders/impl/base_op_builder.h +++ b/onnxruntime/core/providers/coreml/builders/impl/base_op_builder.h @@ -28,6 +28,10 @@ class BaseOpBuilder : public IOpBuilder { void AddInitializersToSkip(ModelBuilder& /*model_builder*/, const Node& /*node*/) const override {} protected: + explicit BaseOpBuilder(bool allow_empty_tensor_as_input = false) + : 
allow_empty_tensor_as_input_(allow_empty_tensor_as_input) { + } + // currently we only support float static bool IsInputFloat(const Node& node, size_t idx, const OpBuilderInputParams& input_params, const logging::Logger& logger); @@ -50,6 +54,8 @@ class BaseOpBuilder : public IOpBuilder { virtual Status AddToModelBuilderImpl(ModelBuilder& model_builder, const Node& node, const logging::Logger& logger) const = 0; + + const bool allow_empty_tensor_as_input_; // some operators can handle ignoring an empty tensor as input }; } // namespace coreml diff --git a/onnxruntime/core/providers/coreml/builders/impl/resize_op_builder.cc b/onnxruntime/core/providers/coreml/builders/impl/resize_op_builder.cc index 3400f09b4056f..65b5c17f2c6a6 100644 --- a/onnxruntime/core/providers/coreml/builders/impl/resize_op_builder.cc +++ b/onnxruntime/core/providers/coreml/builders/impl/resize_op_builder.cc @@ -1,13 +1,15 @@ // Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. -#include +#include #include "core/framework/tensorprotoutils.h" #include "core/optimizer/initializer.h" #include "core/providers/common.h" +#include "core/providers/utils.h" #include "core/providers/coreml/builders/helper.h" #include "core/providers/coreml/builders/impl/base_op_builder.h" +#include "core/providers/coreml/builders/impl/builder_utils.h" #include "core/providers/coreml/builders/model_builder.h" #include "core/providers/coreml/builders/op_builder_factory.h" #include "core/providers/coreml/shape_utils.h" @@ -18,6 +20,11 @@ namespace onnxruntime { namespace coreml { class ResizeOpBuilder : public BaseOpBuilder { + public: + // allow roi and scales potentially being empty inputs that are ignored during processing + ResizeOpBuilder() : BaseOpBuilder(/*allow empty inputs*/ true) {} + + private: void AddInitializersToSkip(ModelBuilder& model_builder, const Node& node) const override; Status AddToModelBuilderImpl(ModelBuilder& model_builder, const Node& node, @@ -29,196 +36,382 @@ class ResizeOpBuilder : public BaseOpBuilder { // Resize opset 10- is very different than Resize opset 11+, with many key attributes missing // We only support Resize opset 11+ here int GetMinSupportedOpSet(const Node& /* node */) const override { return 11; } + + bool SupportsMLProgram() const override { return true; } }; namespace { -bool GetResizeScales(const InitializedTensorSet& initializers, - const Node& node, std::vector& scales, - const logging::Logger&) { +std::vector GetAxes(const NodeAttrHelper& helper, size_t input_rank) { + auto axes = helper.Get("axes", std::vector{}); + if (axes.empty()) { + axes.resize(input_rank); + std::iota(axes.begin(), axes.end(), 0); + } else { + for (auto& value : axes) { + if (value < 0) { + value = HandleNegativeAxis(value, input_rank); + } + } + } + + return axes; +} + +bool GetValidatedResizeScales(const GraphViewer& graph_viewer, + const Node& node, + const std::vector& input_shape, + const std::vector& axes, + std::vector& scales, + const logging::Logger& logger) { const auto& input_defs = node.InputDefs(); - if (input_defs.size() < 3) + int64_t input_rank = input_shape.size(); + + if (input_shape[input_rank - 2] == -1 || input_shape[input_rank - 1] == -1) { + LOGS(logger, VERBOSE) << "Resize with 'scales' requires the H and W dimensions to have fixed values"; return false; + } - const auto& scales_tensor = *initializers.at(input_defs[2]->Name()); - if (scales_tensor.dims_size() != 1 || scales_tensor.dims()[0] != 4) + const auto* scales_tensor = 
graph_viewer.GetConstantInitializer(input_defs[2]->Name()); + if (!scales_tensor) { + LOGS(logger, VERBOSE) << "Resize 'scales' input must be a constant initializer"; return false; - Initializer unpacked_tensor(scales_tensor); + } + + Initializer unpacked_tensor(*scales_tensor); auto scales_data = unpacked_tensor.DataAsSpan(); - scales = std::vector{scales_data.begin(), scales_data.end()}; + scales.assign(scales_data.begin(), scales_data.end()); + + for (size_t idx = 0, end = axes.size(); idx < end; ++idx) { + auto axis = axes[idx]; + auto scale = scales[idx]; + if (axis < (input_rank - 2) && scale != 1.0f) { + LOGS(logger, VERBOSE) << "Resize only supports resizing the last two axes. Scale of axis " << axis << " is " + << scale; + return false; + } + } + return true; } -bool GetResizeOutputSizes(const InitializedTensorSet& initializers, - const Node& node, std::vector& sizes, - const logging::Logger&) { +bool GetValidatedResizeSizes(const GraphViewer& graph_viewer, + const Node& node, + const std::vector& input_shape, + const std::vector& axes, + std::vector& sizes, const logging::Logger& logger) { const auto& input_defs = node.InputDefs(); - if (input_defs.size() < 4) - return false; + int64_t input_rank = input_shape.size(); - const auto& sizes_tensor = *initializers.at(input_defs[3]->Name()); - if (sizes_tensor.dims_size() != 1 || sizes_tensor.dims()[0] != 4) + const auto* sizes_tensor = graph_viewer.GetConstantInitializer(input_defs[3]->Name()); + if (!sizes_tensor) { + LOGS(logger, VERBOSE) << "Resize 'sizes' input must be a constant initializer"; return false; - Initializer unpacked_tensor(sizes_tensor); + } + + Initializer unpacked_tensor(*sizes_tensor); auto sizes_data = unpacked_tensor.DataAsSpan(); - sizes = std::vector(sizes_data.begin(), sizes_data.end()); + sizes.assign(sizes_data.begin(), sizes_data.end()); + + for (size_t idx = 0, end = axes.size(); idx < end; ++idx) { + auto axis = axes[idx]; + auto cur_size = input_shape[idx]; + auto new_size = sizes[idx]; + if (axis < (input_rank - 2) && cur_size != new_size) { + LOGS(logger, VERBOSE) << "Resize only supports resizing the last two axes. Input rank: " << input_rank + << " Change to size of axis " << axis << " from " << cur_size << " to " << new_size; + return false; + } + } + return true; } } // namespace void ResizeOpBuilder::AddInitializersToSkip(ModelBuilder& model_builder, const Node& node) const { - // We don't really use ROI here, so add it to skipped list if it's an initializer tensor - model_builder.AddInitializerToSkip(node.InputDefs()[1]->Name()); // ROI - model_builder.AddInputToSkip(node.InputDefs()[1]->Name()); // ROI - - // We will still add scales to the skipped list even sizes are present - // since there is no use of it, we will not process it later - model_builder.AddInitializerToSkip(node.InputDefs()[2]->Name()); // scales - model_builder.AddInputToSkip(node.InputDefs()[2]->Name()); // scales - - if (node.InputDefs().size() > 3) { - model_builder.AddInitializerToSkip(node.InputDefs()[3]->Name()); // sizes - model_builder.AddInputToSkip(node.InputDefs()[3]->Name()); // sizes + const auto& input_defs = node.InputDefs(); + + // In Resize-11 both roi and scales were required even if you were using sizes. + // https://github.com/onnx/onnx/blob/main/docs/Changelog.md#Resize-11 + // From Resize-13 on they're all optional. + // + // We don't support roi so would never take a node with meaningful roi input. 
The roi input can however be provided + // and is ignored unless coordinate_transformation_mode is set to 'tf_crop_and_resize'. + // e.g. our unit tests tend to always provide an empty tensor as roi input instead of as a missing optional input. + // Due to this we always call AddInputToSkip on the roi input. + // + // We require the sizes or scales input to be a constant initializers to take the node (i.e. they won't be an input + // to the CoreML model for the partition, so calling AddInputToSkip isn't relevant). + // Individual values from scales and sizes are added directly to the layer, so we won't use the initializer. + // + // That leaves an edge case for Resize-11 where scales could have been provided as an empty input tensor but + // we're using a constant initializer for sizes. In this case AddInputToSkip needs to be called for the scales input. + + model_builder.AddInitializerToSkip(input_defs[1]->Name()); // roi + model_builder.AddInputToSkip(input_defs[1]->Name()); + + if (input_defs[2]->Exists()) { + model_builder.AddInitializerToSkip(input_defs[2]->Name()); // scales + } + + if (input_defs.size() > 3 && input_defs[3]->Exists()) { + model_builder.AddInitializerToSkip(input_defs[3]->Name()); // sizes + + if (node.SinceVersion() < 13) { + model_builder.AddInputToSkip(input_defs[2]->Name()); // skip the unused scales input + } } } -Status ResizeOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, - const Node& node, +Status ResizeOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, const Node& node, const logging::Logger& logger) const { - std::unique_ptr layer = model_builder.CreateNNLayer(node); + const auto input_defs = node.InputDefs(); + const auto output_defs = node.OutputDefs(); + const auto& graph_viewer = model_builder.GetGraphViewer(); + + std::vector input_shape; + ORT_RETURN_IF_NOT(GetShape(*input_defs[0], input_shape, logger), "Error getting input shape"); + size_t input_rank = input_shape.size(); + + // we know we have either a scales or sizes input so this is safe. + // check for sizes first. this handles Resize-11 where scales was a required input but sizes were used if provided. + bool using_sizes = input_defs.size() >= 4 && input_defs[3]->Exists(); + bool using_scales = !using_sizes; - auto* coreml_upsample = layer->mutable_upsample(); NodeAttrHelper helper(node); - const auto mode = helper.Get("mode", "nearest"); - if (mode == "linear") { - coreml_upsample->set_mode(COREML_SPEC::UpsampleLayerParams_InterpolationMode_BILINEAR); - } else { // we already checked the mode must be NN or Bilinear in IsOpSupportedImpl - coreml_upsample->set_mode(COREML_SPEC::UpsampleLayerParams_InterpolationMode_NN); + const auto& mode = helper.Get("mode", "nearest"); + bool is_nearest = mode == "nearest"; + bool is_linear = !is_nearest; + + auto axes = GetAxes(helper, input_rank); + std::vector output_scales; + std::vector output_sizes; + size_t num_scales = 0; + size_t num_sizes = 0; + + if (using_scales) { + ORT_RETURN_IF_NOT(GetValidatedResizeScales(graph_viewer, node, input_shape, axes, output_scales, logger), + "Error getting validated scales"); + num_scales = output_scales.size(); + + // special case linear downsample. + // the CoreML implementation seems to be flaky and gives different outputs on different OS versions. + // use bilinear_resize instead. we check in IsOpSupportedImpl that the downsample input is evenly + // divisible by the output size so there's no rounding involved. 
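+    // e.g. with an input of H=W=8 and scales of 0.5 for the last two axes, this switches to a 'sizes' based
+    //      resize of 4x4; the divisibility check in IsOpSupportedImpl guarantees the multiplications below
+    //      produce exact integer values before the casts.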
+ if (is_linear && (output_scales[num_scales - 1] < 1.f || output_scales[num_scales - 2] < 1.f)) { + using_scales = false; + using_sizes = true; + num_sizes = num_scales; + output_sizes = input_shape; + // only the last two dims have their size changed + output_sizes[input_rank - 2] = static_cast(input_shape[input_rank - 2] * output_scales[num_scales - 2]); + output_sizes[input_rank - 1] = static_cast(input_shape[input_rank - 1] * output_scales[num_scales - 1]); + } + } else { + ORT_RETURN_IF_NOT(GetValidatedResizeSizes(graph_viewer, node, input_shape, axes, output_sizes, logger), + "Error getting validated sizes"); + num_sizes = output_sizes.size(); } - const auto& input_defs = node.InputDefs(); - const auto& initializers(model_builder.GetInitializerTensors()); - - if (input_defs.size() >= 3 && input_defs[2]->Exists()) { // use scales - std::vector scales; - ORT_RETURN_IF_NOT(GetResizeScales(initializers, node, scales, logger), "Error getting resize scales"); - coreml_upsample->add_scalingfactor(static_cast(scales[2])); - coreml_upsample->add_scalingfactor(static_cast(scales[3])); - } else { // we already checked number of inputs in IsOpSupportedImpl - std::vector input_shape; - ORT_RETURN_IF_NOT(GetStaticShape(*input_defs[0], input_shape, logger), "Error getting input shape"); - std::vector output_sizes; - ORT_RETURN_IF_NOT(GetResizeOutputSizes(initializers, node, output_sizes, logger), - "Error getting resize output_sizes"); - coreml_upsample->add_scalingfactor(static_cast(output_sizes[2] / input_shape[2])); - coreml_upsample->add_scalingfactor(static_cast(output_sizes[3] / input_shape[3])); - } +#if defined(COREML_ENABLE_MLPROGRAM) + if (model_builder.CreateMLProgram()) { + using namespace CoreML::Specification::MILSpec; // NOLINT + + std::string_view coreml_op_type; + if (using_scales) { + // https://apple.github.io/coremltools/source/coremltools.converters.mil.mil.ops.defs.html#coremltools.converters.mil.mil.ops.defs.iOS15.image_resizing.upsample_bilinear + // https://apple.github.io/coremltools/source/coremltools.converters.mil.mil.ops.defs.html#coremltools.converters.mil.mil.ops.defs.iOS15.image_resizing.upsample_nearest_neighbor + coreml_op_type = is_linear ? "upsample_bilinear" : "upsample_nearest_neighbor"; + } else { + // https://apple.github.io/coremltools/source/coremltools.converters.mil.mil.ops.defs.html#coremltools.converters.mil.mil.ops.defs.iOS15.image_resizing.resize_bilinear + // https://apple.github.io/coremltools/source/coremltools.converters.mil.mil.ops.defs.html#coremltools.converters.mil.mil.ops.defs.iOS15.image_resizing.resize_nearest_neighbor + coreml_op_type = is_linear ? 
"resize_bilinear" : "resize_nearest_neighbor"; + } + + std::unique_ptr op = model_builder.CreateOperation(node, coreml_op_type); + AddOperationInput(*op, "x", input_defs[0]->Name()); + + std::string coord_trans_mode = helper.Get("coordinate_transformation_mode", "half_pixel"); + + if (using_scales) { + float scale_height = output_scales[num_scales - 2]; + float scale_width = output_scales[num_scales - 1]; + AddOperationInput(*op, "scale_factor_height", + model_builder.AddScalarConstant(coreml_op_type, "scale_factor_height", scale_height)); + AddOperationInput(*op, "scale_factor_width", + model_builder.AddScalarConstant(coreml_op_type, "scale_factor_width", scale_width)); + + if (is_linear) { + // we only allow these coord modes in the 'is supported' check, + // - half_pixel or pytorch_half_pixel with output size > 1 -> align_corners = false + // - align_corners -> align_corners = true + bool align_corners = coord_trans_mode == "align_corners"; + AddOperationInput(*op, "align_corners", + model_builder.AddScalarConstant(coreml_op_type, "align_corners", align_corners)); + } + } else { + assert(using_sizes); + int64_t target_height = output_sizes[num_sizes - 2]; + int64_t target_width = output_sizes[num_sizes - 1]; + + AddOperationInput(*op, "target_size_height", + model_builder.AddScalarConstant(coreml_op_type, "target_size_height", target_height)); + AddOperationInput(*op, "target_size_width", + model_builder.AddScalarConstant(coreml_op_type, "target_size_width", target_width)); + + if (is_linear) { + // we only allow these coord modes in the 'is supported' check, + // - half_pixel or pytorch_half_pixel with output size > 1 -> UNALIGN_CORNERS + // - align_corners -> STRICT_ALIGN_CORNERS + // - asymmetric -> DEFAULT + std::string sampling_mode_value; + if (coord_trans_mode == "asymmetric") { + sampling_mode_value = "DEFAULT"; + } else if (coord_trans_mode == "align_corners") { + sampling_mode_value = "STRICT_ALIGN_CORNERS"; + } else { + sampling_mode_value = "UNALIGN_CORNERS"; + } + + AddOperationInput(*op, "sampling_mode", + model_builder.AddScalarConstant(coreml_op_type, "sampling_mode", sampling_mode_value)); + } + } - *layer->mutable_input()->Add() = input_defs[0]->Name(); - *layer->mutable_output()->Add() = node.OutputDefs()[0]->Name(); + AddOperationOutput(*op, *output_defs[0]); + model_builder.AddOperation(std::move(op)); + } else // NOLINT +#endif + { + std::unique_ptr layer = model_builder.CreateNNLayer(node); + + auto* coreml_upsample = layer->mutable_upsample(); + + // we already checked the mode must be NN or Bilinear in IsOpSupportedImpl + if (is_linear) { + coreml_upsample->set_mode(COREML_SPEC::UpsampleLayerParams_InterpolationMode_BILINEAR); + } else { + coreml_upsample->set_mode(COREML_SPEC::UpsampleLayerParams_InterpolationMode_NN); + } + + if (using_scales) { + coreml_upsample->add_scalingfactor(static_cast(output_scales[num_scales - 2])); + coreml_upsample->add_scalingfactor(static_cast(output_scales[num_scales - 1])); + } else { + auto scale_height = output_sizes[num_sizes - 2] / input_shape[input_rank - 2]; + auto scale_width = output_sizes[num_sizes - 1] / input_shape[input_rank - 1]; + coreml_upsample->add_scalingfactor(static_cast(scale_height)); + coreml_upsample->add_scalingfactor(static_cast(scale_width)); + } + + *layer->mutable_input()->Add() = input_defs[0]->Name(); + *layer->mutable_output()->Add() = output_defs[0]->Name(); + + model_builder.AddLayer(std::move(layer)); + } - model_builder.AddLayer(std::move(layer)); return Status::OK(); } bool 
ResizeOpBuilder::IsOpSupportedImpl(const Node& node, const OpBuilderInputParams& input_params, const logging::Logger& logger) const { const auto& input_defs = node.InputDefs(); - const auto& initializers = input_params.graph_viewer.GetAllInitializedTensors(); std::vector input_shape; - if (!GetShape(*input_defs[0], input_shape, logger)) + if (!GetShape(*input_defs[0], input_shape, logger)) { + LOGS(logger, VERBOSE) << "Resize: input shape was not known"; return false; + } - const auto input_size = input_shape.size(); - if (input_size != 4) { - LOGS(logger, VERBOSE) << "Resize only support 4d shape, input is " - << input_size << "d shape"; + // as we allow empty shapes in the checks done by BaseOpBuilder::HasSupportedInputs we explicitly check for an empty + // an empty input here to be consistent. + // this should never happen in a real model though as a dim with value 0 (i.e. no input data) would typically be a + // dynamic dimension where a previous step had no output (e.g. Loop of zero interations, NonZero with no matches, + // NonMaxSupression with no boxes). + if (DoesShapeSpecifyZeroElements(input_shape)) { + LOGS(logger, VERBOSE) << "Resize input shape has with dimension values of 0 which is not supported."; return false; } - { // check attributes - NodeAttrHelper helper(node); - const auto mode = helper.Get("mode", "nearest"); - bool is_linear_resize = mode == "linear"; - bool is_nearest_resize = mode == "nearest"; - if (!is_linear_resize && !is_nearest_resize) { - LOGS(logger, VERBOSE) << "Resize unsupported input mode, " << mode; + const auto input_rank = input_shape.size(); + if (input_params.create_mlprogram) { + if (input_rank < 3 || input_rank > 5) { + LOGS(logger, VERBOSE) << "Resize only supports 3D to 5D input. Got: " << input_rank << "D"; return false; } - - const auto exclude_outside = helper.Get("exclude_outside", 0); - if (exclude_outside != 0) { - LOGS(logger, VERBOSE) << "Resize does not support exclude_outside for now"; + } else { + if (input_rank != 4) { + LOGS(logger, VERBOSE) << "Resize only support 4d shape. 
Got: " << input_rank << "D"; return false; } + } - const auto coord_trans_mode = helper.Get("coordinate_transformation_mode", "half_pixel"); - bool using_asymmetric = coord_trans_mode == "asymmetric"; - if (is_linear_resize) { - // TODO, add support of align_corners and half_pixel - if (!using_asymmetric) { - LOGS(logger, VERBOSE) << "Resize bilinear, unsupported coord_trans_mode, " << coord_trans_mode; - return false; - } - } else { - // nearest neighbor resizing - // For resize using nearest neighbor, we only support coord_trans_mode == "asymmetric" && nearest_mode == "floor" - if (!using_asymmetric) { - LOGS(logger, VERBOSE) << "Resize nearest neighbor, unsupported coord_trans_mode, " << coord_trans_mode; - return false; - } + // check attributes + NodeAttrHelper helper(node); - const auto nearest_mode = helper.Get("nearest_mode", "round_prefer_floor"); - if (nearest_mode != "floor") { - LOGS(logger, VERBOSE) << "Resize nearest neighbor, unsupported nearest_mode, " << nearest_mode; - return false; - } - } + if (helper.Get("antialias", 0) != 0) { + LOGS(logger, VERBOSE) << "Resize does not support antialias"; + return false; } - { // scales and sizes (if present) must be initializers - if (input_defs.size() < 3) { - LOGS(logger, VERBOSE) << "Input scales or sizes of Resize must be known"; - return false; - } + const auto& mode = helper.Get("mode", "nearest"); + bool is_linear = mode == "linear"; + bool is_nearest = mode == "nearest"; + if (!is_linear && !is_nearest) { + LOGS(logger, VERBOSE) << "Resize unsupported input mode: " << mode; + return false; + } - bool using_scales = input_defs.size() >= 3 && input_defs[2]->Exists(); - // scales - if (using_scales && !input_params.graph_viewer.GetConstantInitializer(input_defs[2]->Name())) { - LOGS(logger, VERBOSE) << "scales input of Resize must be a constant initializer"; + if (is_nearest) { + const auto nearest_mode = helper.Get("nearest_mode", "round_prefer_floor"); + if (nearest_mode != "floor") { + LOGS(logger, VERBOSE) << "Resize only supports 'floor' nearest_mode. Got: " << nearest_mode; return false; } + } - // sizes - if (!using_scales && - (input_defs.size() < 4 || - !input_defs[3]->Exists() || - !input_params.graph_viewer.GetConstantInitializer(input_defs[3]->Name()))) { - LOGS(logger, VERBOSE) << "sizes input of Resize must be a constant initializer"; - return false; - } + if (helper.Get("exclude_outside", 0) != 0) { + LOGS(logger, VERBOSE) << "Resize does not support 'exclude_outside'"; + return false; + } - // We want to check if the scales or sizes are not trying to resize on N/C channels here - if (using_scales) { - std::vector scales; - if (!GetResizeScales(initializers, node, scales, logger)) - return false; + const auto keep_aspect_ratio_policy = helper.Get("keep_aspect_ratio_policy", "stretch"); + if (keep_aspect_ratio_policy != "stretch") { + LOGS(logger, VERBOSE) << "Resize only supports keep_aspect_ratio_policy of 'stretch'. Got " + << keep_aspect_ratio_policy; + return false; + } - float scale_n = scales[0]; - float scale_c = scales[1]; - if (scale_n != 1.0f || scale_c != 1.0f) { - LOGS(logger, VERBOSE) << "Scales of N/C channel should be 1" - << "Resize of N/C channels are not supported" - << ", scale_n, " << scale_n << ", scale_c, " << scale_c; - return false; - } + // check for sizes first. this handles Resize-11 where scales was a required input but sizes were used if provided. 
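+  // e.g. a Resize-11 node may provide an empty 'scales' tensor together with a constant 'sizes' initializer;
+  //      in that case 'sizes' takes precedence and the empty 'scales' input is ignored.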
+ bool using_sizes = input_defs.size() >= 4 && input_defs[3]->Exists(); + bool using_scales = !using_sizes && input_defs.size() >= 3 && input_defs[2]->Exists(); - // For now we only support upscale, so the scale_h and scale_w should be an integer >= 1 - // TODO support ResizeBilinear - float scale_h = scales[2]; - float scale_w = scales[3]; + if (!using_scales && !using_sizes) { + LOGS(logger, VERBOSE) << "Resize requires 'scales' or 'sizes' input"; + return false; + } + + // 'axes' is from opset 18 on and allows scales or sizes to have entries for the subset of axes. + // we fill with default values if necessary so that the processing is consistent across all supported opsets. + auto axes = GetAxes(helper, input_rank); + std::vector output_scales; + std::vector output_sizes; + + // make sure scales/sizes are constant initializers, and are only modifying the last two dimensions of the input. + if (using_scales) { + if (!GetValidatedResizeScales(input_params.graph_viewer, node, input_shape, axes, output_scales, logger)) { + return false; + } - // Onnx spec requires scale to be a positive float, so we are not checking that here + size_t num_scales = output_scales.size(); + float scale_h = output_scales[num_scales - 2]; + float scale_w = output_scales[num_scales - 1]; + + // NeuralNetwork supports upsample only with round numbers. + // + // ML Program results seem to match if round numbers are involved. When downsampling the scaling value should be + // 1 / . e.g. if input size is 8, scaling factor could be 1/8, 1/4 or 1/2. + if (scale_h >= 1.f && scale_w >= 1.f) { + // upsample (or no-op with both == 1.f that we won't bother special-casing) if (roundf(scale_h) != scale_h) { LOGS(logger, VERBOSE) << "Resize: scale_h: " << scale_h << " is not a whole number"; return false; @@ -228,33 +421,57 @@ bool ResizeOpBuilder::IsOpSupportedImpl(const Node& node, const OpBuilderInputPa LOGS(logger, VERBOSE) << "Resize: scale_w: " << scale_w << " is not a whole number"; return false; } - } else { - // we are using sizes - std::vector output_sizes; - if (!GetResizeOutputSizes(initializers, node, output_sizes, logger)) - return false; - - if (!IsStaticShape(input_shape)) { - LOGS(logger, VERBOSE) << "Input shape with dynamic dimensions is not supported."; + } else if (scale_h <= 1.f && scale_w <= 1.f) { + // downsample + if (input_params.create_mlprogram) { + auto h_in = input_shape[input_rank - 2]; + auto w_in = input_shape[input_rank - 1]; + + if (!utils::IsScalingByAFactorOfN(h_in, scale_h)) { + LOGS(logger, VERBOSE) << "Resize: downsampling scale " << scale_h + << " is not a factor of input height: " << h_in; + return false; + } + + if (!utils::IsScalingByAFactorOfN(w_in, scale_w)) { + LOGS(logger, VERBOSE) << "Resize: downsampling scale " << scale_w + << " is not a factor of input width: " << w_in; + return false; + } + + } else { + LOGS(logger, VERBOSE) << "Resize: downsampling is not supported."; return false; } + } else { + LOGS(logger, VERBOSE) << "Resize: scale_h: " << scale_h << " and scale_w: " << scale_w + << " must both be >= 1 or <= 1"; + return false; + } + } else { + assert(using_sizes); + + if (!GetValidatedResizeSizes(input_params.graph_viewer, node, input_shape, axes, output_sizes, logger)) { + return false; + } - auto output_size_n = output_sizes[0]; - auto output_size_c = output_sizes[1]; - if (output_size_n != input_shape[0] || output_size_c != input_shape[1]) { - LOGS(logger, VERBOSE) << "Output sizes of N/C channel should match the input sizes, " - << "Resize of N/C channels are not 
supported" - << ", input_size_n, " << input_shape[0] << ", output_size_n, " << output_size_n - << ". input_size_c, " << input_shape[1] << ", output_size_c, " << output_size_c; + if (input_params.create_mlprogram) { + // no additional requirements + } else { + if (!IsStaticShape(input_shape)) { + // need to convert from sizes to scales when creating the NN layer, so the input H and W are required + LOGS(logger, VERBOSE) << "Resize input shape with dynamic dimensions is not supported."; return false; } - // For now we only support upscale, so the output_size_h and output_size_w should be an integer >= 1 + // For now we only support upsample, so the output_size_h and output_size_w should be an integer >= 1 // TODO support ResizeBilinear - auto output_size_h = output_sizes[2]; - auto output_size_w = output_sizes[3]; - auto input_size_h = input_shape[2]; - auto input_size_w = input_shape[3]; + auto input_size_h = input_shape[input_rank - 2]; + auto input_size_w = input_shape[input_rank - 1]; + + auto num_sizes = output_sizes.size(); // could be smaller than input_rank if axes was used + auto output_size_h = output_sizes[num_sizes - 2]; + auto output_size_w = output_sizes[num_sizes - 1]; // Onnx spec requires output sizes to be a positive integer, so we are not checking that here if (output_size_h % input_size_h != 0) { @@ -271,6 +488,92 @@ bool ResizeOpBuilder::IsOpSupportedImpl(const Node& node, const OpBuilderInputPa } } + std::string coord_trans_mode = helper.Get("coordinate_transformation_mode", "half_pixel"); + bool using_asymmetric = coord_trans_mode == "asymmetric"; + + if (input_params.create_mlprogram) { + if (is_nearest) { + // Potential CoreML operators we could map to: + // + // image_resizing.upsample_nearest_neighbor + // - mode: nearest + // - coordinate_transformation_mode: asymmetric + // - 'scales' input + // + // image_resizing.resize_nearest_neighbor + // - mode: nearest + // - coordinate_transformation_mode: asymmetric + // - 'sizes' input + if (!using_asymmetric) { + LOGS(logger, VERBOSE) << "Resize with 'mode' of 'nearest' requires 'coordinate_transformation_mode' of " + "'asymmetric' . 
Got: " + << coord_trans_mode; + return false; + } + } else { + assert(is_linear); + // Potential CoreML operators we could map to: + // + // image_resizing.upsample_bilinear + // - mode: linear + // - 'scales' input + // - coordinate_transformation_mode + // - half_pixel -> align_corners = false + // - align_corners -> align_corners = true + // + // image_resizing.resize_bilinear + // - mode: linear + // - 'sizes' input + // - coordinate_transformation_mode -> sampling_mode + // - half_pixel -> UNALIGN_CORNERS + // - align_corners -> STRICT_ALIGN_CORNERS + // - asymmetric -> DEFAULT + // + + // if output size != 1, coordinate_transformation_mode of pytorch_half_pixel is the same as half_pixel + if (coord_trans_mode == "pytorch_half_pixel") { + int64_t h_out{0}, w_out{0}; + if (using_scales) { + size_t num_scales = output_scales.size(); + h_out = std::llround(input_shape[input_rank - 2] * output_scales[num_scales - 2]); + w_out = std::llround(input_shape[input_rank - 1] * output_scales[num_scales - 1]); + } else { + size_t num_sizes = output_sizes.size(); + h_out = output_sizes[num_sizes - 2]; + w_out = output_sizes[num_sizes - 1]; + } + + if (h_out > 1 && w_out > 1) { + coord_trans_mode = "half_pixel"; + } + } + + if (coord_trans_mode == "half_pixel" || + coord_trans_mode == "align_corners" || + (using_sizes && coord_trans_mode == "asymmetric")) { + // supported + + // FWIW we could calculate (if shape inferencing didn't already) the output sizes and convert a node with + // `scales` and co-ord mode of `asymmetric` to having `sizes` input so it's supported. + } else { + LOGS(logger, VERBOSE) << "Resize with 'mode' of 'linear' requires 'coordinate_transformation_mode' of " + "'half_pixel', or 'align_corners', or 'asymmetric' with 'sizes' input. Got: " + << coord_trans_mode; + + return false; + } + } + } else { + // NeuralNetwork checks + if (!using_asymmetric) { + // align_corners and half_pixel could be supported in ResizeBilinear but as NeuralNetwork is deprecated + // there's no known value to adding that. + LOGS(logger, VERBOSE) << "Resize only supports 'asymmetric' coordinate_transformation_mode. Got: " + << coord_trans_mode; + return false; + } + } + return true; } diff --git a/onnxruntime/core/providers/coreml/builders/model_builder.h b/onnxruntime/core/providers/coreml/builders/model_builder.h index 8f85ab2c09e7c..385588dbfdcb8 100644 --- a/onnxruntime/core/providers/coreml/builders/model_builder.h +++ b/onnxruntime/core/providers/coreml/builders/model_builder.h @@ -141,8 +141,17 @@ class ModelBuilder { // so we don't do a copy of the original initializer into the model. void AddInitializerToSkip(const std::string& tensor_name); - // There are some input which will not be used, add it to a list which will not - // be added to CoreML model, since CoreML does not like input unused + /// + /// Skip a non-initializer value, that is not used in the CoreML model, but was an input to a supported node. + /// + /// This is for a rare edge case where a value is an input to a node but is empty/unused, as the + /// CoreML model requires all model inputs to be consumed. + /// + /// + /// The only known use case for this currently is Resize, and that is largely due to how the unit tests are + /// setup rather than something you'd expect to see in a real model. + /// See ResizeOpBuilder::AddInitializersToSkip for more details. 
+ /// void AddInputToSkip(const std::string& input_name); const std::string& GetUniqueName(const std::string& base_name); diff --git a/onnxruntime/core/providers/coreml/coreml_execution_provider.cc b/onnxruntime/core/providers/coreml/coreml_execution_provider.cc index 0ba715cc7c6d9..a92fef81ac395 100644 --- a/onnxruntime/core/providers/coreml/coreml_execution_provider.cc +++ b/onnxruntime/core/providers/coreml/coreml_execution_provider.cc @@ -27,6 +27,7 @@ CoreMLExecutionProvider::CoreMLExecutionProvider(uint32_t coreml_flags) : IExecutionProvider{onnxruntime::kCoreMLExecutionProvider}, coreml_flags_(coreml_flags), coreml_version_(coreml::util::CoreMLVersion()) { + LOGS_DEFAULT(VERBOSE) << "CoreML version: " << coreml_version_; if (coreml_version_ < MINIMUM_COREML_VERSION) { LOGS_DEFAULT(ERROR) << "CoreML EP is not supported on this platform."; } diff --git a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/resize_op_builder.cc b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/resize_op_builder.cc index d75b9cc72ff4b..ef27f6c942f44 100644 --- a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/resize_op_builder.cc +++ b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/resize_op_builder.cc @@ -9,6 +9,7 @@ #include "core/graph/graph_viewer.h" #include "core/optimizer/initializer.h" #include "core/providers/common.h" +#include "core/providers/utils.h" #include "core/providers/shared/utils/utils.h" #include "core/providers/nnapi/nnapi_builtin/builders/helper.h" #include "core/providers/nnapi/nnapi_builtin/builders/model_builder.h" @@ -251,14 +252,34 @@ bool ResizeOpBuilder::IsOpSupportedImpl(const GraphViewer& graph_viewer, const N const Initializer unpacked_tensor(*scales); auto scales_data = unpacked_tensor.DataAsSpan(); input_is_nchw = scales_data[1] == 1.0F; - float const scale_n = scales_data[0]; - float const scale_c = input_is_nchw ? scales_data[1] : scales_data[3]; + const float scale_n = scales_data[0]; + const float scale_c = input_is_nchw ? scales_data[1] : scales_data[3]; + const float scale_h = input_is_nchw ? scales_data[2] : scales_data[1]; + const float scale_w = input_is_nchw ? scales_data[3] : scales_data[2]; + if (scale_n != 1.0f || scale_c != 1.0f) { LOGS_DEFAULT(VERBOSE) << "Scales of N/C channel should be 1" << "Resize of N/C channels are not supported" << ", scale_n, " << scale_n << ", scale_c, " << scale_c; return false; } + + // if downsampling the input size must be evenly divisible by the output size to match the onnx output + if (scale_h < 1.0f || scale_w < 1.0f) { + // we also require input_shape to be known to check + auto h_in = input_is_nchw ? input_shape[2] : input_shape[1]; + auto w_in = input_is_nchw ? 
input_shape[3] : input_shape[2]; + if (h_in == 0 || w_in == 0) { + LOGS_DEFAULT(VERBOSE) << "Input H and W must be known to downsample with scales"; + return false; + } + + if (!utils::IsScalingByAFactorOfN(h_in, scale_h) || + !utils::IsScalingByAFactorOfN(w_in, scale_w)) { + LOGS_DEFAULT(VERBOSE) << "Input size must be evenly divisible by output size when downsampling"; + return false; + } + } } else { const auto* sizes = graph_viewer.GetConstantInitializer(inputs[3].node_arg.Name()); if (!sizes) { diff --git a/onnxruntime/core/providers/utils.cc b/onnxruntime/core/providers/utils.cc index b2f9d265ca053..747b09e42aa21 100644 --- a/onnxruntime/core/providers/utils.cc +++ b/onnxruntime/core/providers/utils.cc @@ -23,5 +23,21 @@ common::Status OutputOptionalWithoutDataHelper(const ONNX_NAMESPACE::TypeProto& return Status::OK(); } #endif + +bool IsScalingByAFactorOfN(int64_t n, float scale) { + bool is_factor = false; + if (scale > 0.f && scale < 1.f) { + const double factor = 1.0 / scale; + const double factor_rounded = std::round(factor); + constexpr double epsilon = 1.0e-4; // arbitrarily small enough + if (std::abs(factor - factor_rounded) < epsilon) { + // result is integer. check if a factor of n + const int64_t factor_i = static_cast(factor_rounded); + is_factor = n % factor_i == 0; + } + } + + return is_factor; +} } // namespace utils } // namespace onnxruntime diff --git a/onnxruntime/core/providers/utils.h b/onnxruntime/core/providers/utils.h index 8cafdb8c05cc3..9ea8496a02f85 100644 --- a/onnxruntime/core/providers/utils.h +++ b/onnxruntime/core/providers/utils.h @@ -15,5 +15,10 @@ common::Status OutputOptionalWithoutDataHelper(const ONNX_NAMESPACE::TypeProto& OpKernelContext* context, int output_index); #endif +/// +/// Check if the reciprocal of 'scale' is a factor of 'n'. +/// e.g. a scale of 0.5 is 1/2, the reciprocal is 2, and 2 is a factor of any even number. 
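+/// A scale of 0.25 requires 'n' to be a multiple of 4, while a scale such as 0.6 is rejected because its
+/// reciprocal (~1.667) is not an integer.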
+/// +bool IsScalingByAFactorOfN(int64_t n, float scale); } // namespace utils } // namespace onnxruntime diff --git a/onnxruntime/core/providers/xnnpack/tensor/resize.cc b/onnxruntime/core/providers/xnnpack/tensor/resize.cc index 09666c8039402..c752b5f849808 100644 --- a/onnxruntime/core/providers/xnnpack/tensor/resize.cc +++ b/onnxruntime/core/providers/xnnpack/tensor/resize.cc @@ -11,6 +11,7 @@ #include "core/framework/op_kernel.h" #include "core/optimizer/initializer.h" #include "core/providers/xnnpack/xnnpack_init.h" +#include "core/providers/utils.h" namespace onnxruntime { namespace xnnpack { @@ -68,9 +69,27 @@ bool Resize::IsOnnxNodeSupported(const NodeUnit& node_unit, InlinedVector scale(4, 1.0F); if (scale_tensor) { const Initializer scale_val(*scale_tensor, node_unit.ModelPath()); - if (scale_val.DataAsSpan()[1] != 1.0F) { + const auto scales = scale_val.DataAsSpan(); + if (scales[1] != 1.0F) { break; } + + // downsampling output seems to require the output size to be a factor of the input to match ONNX + if (scales[2] < 1.0f || scales[3] < 1.0f) { + // we also require input_shape to be known to check + int64_t h_in = x_shape->dim(2).dim_value(); + int64_t w_in = x_shape->dim(3).dim_value(); + if (h_in < 0 || w_in < 0) { + break; + } + + float scale_h = scales[2]; + float scale_w = scales[3]; + if (!utils::IsScalingByAFactorOfN(h_in, scale_h) || + !utils::IsScalingByAFactorOfN(w_in, scale_w)) { + break; + } + } } if (size_tensor) { diff --git a/onnxruntime/test/providers/cpu/tensor/resize_op_test.cc b/onnxruntime/test/providers/cpu/tensor/resize_op_test.cc index 496f2213e9d32..111520ef03e26 100644 --- a/onnxruntime/test/providers/cpu/tensor/resize_op_test.cc +++ b/onnxruntime/test/providers/cpu/tensor/resize_op_test.cc @@ -227,28 +227,33 @@ TEST(ResizeOpTest, NhwcResizeOpLinearDownSampleTest_tf_crop_and_resize_without_e } TEST(ResizeOpTest, ResizeOpLinearDownSampleTest_4DBilinear) { - OpTester test("Resize", 13); - std::vector roi{}; - std::vector scales{1.0f, 1.0f, 0.6f, 0.6f}; + auto run_test = [](bool scales_in_initializer) { + OpTester test("Resize", 13); + std::vector roi{}; + std::vector scales{1.0f, 1.0f, 0.6f, 0.6f}; - test.AddAttribute("mode", "linear"); + test.AddAttribute("mode", "linear"); - constexpr int64_t N = 1, C = 1, H = 2, W = 4; - std::vector X = { - 1.0f, 2.0f, 3.0f, 4.0f, - 5.0f, 6.0f, 7.0f, 8.0f}; + constexpr int64_t N = 1, C = 1, H = 2, W = 4; + std::vector X = { + 1.0f, 2.0f, 3.0f, 4.0f, + 5.0f, 6.0f, 7.0f, 8.0f}; - test.AddInput("X", {N, C, H, W}, X); - test.AddInput("roi", {0}, roi); - test.AddInput("scales", {4}, scales); + test.AddInput("X", {N, C, H, W}, X); + test.AddInput("roi", {0}, roi); + test.AddInput("scales", {4}, scales, scales_in_initializer); - std::vector Y = {2.66666651f, 4.3333331f}; + std::vector Y = {2.66666651f, 4.3333331f}; - test.AddOutput("Y", {N, C, static_cast(H * scales[2]), static_cast(W * scales[3])}, Y); - // QNN: result diff - // TRT: Segmentation fault in A100 - std::unordered_set excluded_providers({kQnnExecutionProvider}); - test.Run(OpTester::ExpectResult::kExpectSuccess, "", ExcludeTrtOnA100(excluded_providers)); + test.AddOutput("Y", {N, C, static_cast(H * scales[2]), static_cast(W * scales[3])}, Y); + // QNN: result diff + // TRT: Segmentation fault in A100 + std::unordered_set excluded_providers({kQnnExecutionProvider}); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", ExcludeTrtOnA100(excluded_providers)); + }; + + run_test(false); + run_test(true); } TEST(ResizeOpTest, 
NhwcResizeOpLinearDownSampleTest_4DBilinear) { @@ -327,13 +332,14 @@ TEST(ResizeOpTest, NhwcResizeOpLinearDownSampleTest_4DBilinear_int8) { // Since NNAPI(TFLite) only using the scale calculate using the input/output size // For the above test (ResizeOpLinearDownSampleTest_4DBilinear) // The output size is [1,1,2,4].*[1,1,0.6,0.6]=[1,1,1,2] -// NNAPI will recaluclate the scales as the output size divided by input size +// NNAPI will recalculate the scales as the output size divided by input size // scales = [1,1,1,2]./[1,1,2,4] = [1,1,0.5,0.5] // See:https://github.com/tensorflow/tensorflow/blob/master/tensorflow/lite/kernels/internal/reference/reference_ops.h // So the result of the above example will be different than CPU EP -// Add the following 2 tests to test with scales valid to NNAPI +// Add the following 2 tests to test with scales valid to NNAPI. +// CoreML also doesn't handle a scale that doesn't divide the input size evenly. TEST(ResizeOpTest, ResizeOpLinearDownSampleTest_4DBilinear1) { - // To test NNAPI EP, we need the sclaes/sizes to be in initializers + // To test NNAPI EP, we need the scales/sizes to be in initializers auto run_test = [](bool scales_in_initializer) { OpTester test("Resize", 13); std::vector roi{}; @@ -360,8 +366,38 @@ TEST(ResizeOpTest, ResizeOpLinearDownSampleTest_4DBilinear1) { run_test(true); } +// Downsize with factor being an odd number (1/3) +TEST(ResizeOpTest, ResizeOpLinearDownSampleTest_4DBilinear1_OddNumber) { + // To test NNAPI EP, we need the scales/sizes to be in initializers + auto run_test = [](bool scales_in_initializer) { + OpTester test("Resize", 13); + std::vector roi{}; + std::vector scales{1.0f, 1.0f, (1.f / 3), (1.f / 3)}; + + test.AddAttribute("mode", "linear"); + + constexpr int64_t N = 1, C = 1, H = 3, W = 6; + std::vector X = { + 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, + 7.0f, 8.0f, 9.0f, 10.0f, 11.0f, 12.0f, + 13.0f, 14.0f, 15.0f, 16.0f, 17.0f, 18.0f}; + + test.AddInput("X", {N, C, H, W}, X); + test.AddInput("roi", {0}, roi); + test.AddInput("scales", {4}, scales, scales_in_initializer); + + std::vector Y = {8.f, 11.f}; + + test.AddOutput("Y", {N, C, static_cast(H * scales[2]), static_cast(W * scales[3])}, Y); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", ExcludeTrtOnA100()); + }; + + run_test(false); + run_test(true); +} + TEST(ResizeOpTest, ResizeOpLinearDownSampleTest_4DBilinear1_WithSizes) { - // To test NNAPI EP, we need the sclaes/sizes to be in initializers + // To test NNAPI EP, we need the scales/sizes to be in initializers auto run_test = [](bool scales_and_sizes_in_initializer) { OpTester test("Resize", 13); std::vector roi{}; @@ -389,8 +425,32 @@ TEST(ResizeOpTest, ResizeOpLinearDownSampleTest_4DBilinear1_WithSizes) { run_test(true); } +// test handling for opset 11. 
scales input is provided but should be ignored in favor of sizes +TEST(ResizeOpTest, ResizeOpLinearDownSampleTest_4DBilinear1_WithSizesOpset11) { + OpTester test("Resize", 11); + std::vector roi{}; + std::vector scales{}; + constexpr int64_t N = 1, C = 1, H = 2, W = 4; + std::vector sizes{N, C, 1, 2}; + test.AddAttribute("mode", "linear"); + + std::vector X = { + 1.0f, 2.0f, 3.0f, 4.0f, + 5.0f, 6.0f, 7.0f, 8.0f}; + + test.AddInput("X", {N, C, H, W}, X); + test.AddInput("roi", {0}, roi); + test.AddInput("scales", {0}, scales); + test.AddInput("sizes", {4}, sizes, true); // add as initializer so CoreML EP can take + + std::vector Y = {3.5f, 5.5f}; + + test.AddOutput("Y", sizes, Y); + test.Run(); +} + TEST(ResizeOpTest, ResizeOpLinearDownSampleTest_4DBilinear_align_corners) { - // To test NNAPI EP, we need the sclaes/sizes to be in initializers + // To test NNAPI EP, we need the scales/sizes to be in initializers auto run_test = [](bool scales_in_initializer) { OpTester test("Resize", 13); std::vector roi{}; @@ -416,15 +476,51 @@ TEST(ResizeOpTest, ResizeOpLinearDownSampleTest_4DBilinear_align_corners) { run_test(false); -#ifdef USE_NNAPI - // NNAPI will need the scales as an initializer +#if defined(USE_NNAPI) || defined(USE_COREML) + // NNAPI and CoreML need the scales as an initializer + // Also tensor RT EP will fail if scales is an initializer but will pass if it is not + run_test(true); +#endif +} + +TEST(ResizeOpTest, ResizeOpLinearDownSampleTest_4DBilinear_align_corners_sizes) { + // To test NNAPI EP, we need the scales/sizes to be in initializers + auto run_test = [](bool scales_in_initializer) { + OpTester test("Resize", 13); + std::vector roi{}; + std::vector scales{}; + std::vector sizes{1, 1, 1, 2}; + + test.AddAttribute("mode", "linear"); + test.AddAttribute("coordinate_transformation_mode", "align_corners"); + + constexpr int64_t N = 1, C = 1, H = 2, W = 4; + std::vector X = { + 1.0f, 2.0f, 3.0f, 4.0f, + 5.0f, 6.0f, 7.0f, 8.0f}; + + test.AddInput("X", {N, C, H, W}, X); + test.AddInput("roi", {0}, roi); + test.AddInput("", {0}, scales); + test.AddInput("sizes", {4}, sizes, scales_in_initializer); + + std::vector Y = {1.0f, 4.0f}; + + test.AddOutput("Y", {N, C, 1, 2}, Y); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", ExcludeTrtOnA100()); + }; + + run_test(false); + +#if defined(USE_NNAPI) || defined(USE_COREML) + // NNAPI and CoreML will need the scales as an initializer // Also tensor RT EP will fail if scales is an initializer but will pass if it is not run_test(true); #endif } TEST(ResizeOpTest, NhwcResizeOpLinearDownSampleTest_4DBilinear_align_corners_uint8) { - // To test NNAPI EP, we need the sclaes/sizes to be in initializers + // To test NNAPI EP, we need the scales/sizes to be in initializers auto run_test = [](bool scales_in_initializer) { OpTester test("Resize", 13); std::vector roi{}; @@ -456,7 +552,7 @@ TEST(ResizeOpTest, NhwcResizeOpLinearDownSampleTest_4DBilinear_align_corners_uin } TEST(ResizeOpTest, NhwcResizeOpLinearDownSampleTest_4DBilinear_align_corners_int8) { - // To test NNAPI EP, we need the sclaes/sizes to be in initializers + // To test NNAPI EP, we need the scales/sizes to be in initializers auto run_test = [](bool scales_in_initializer) { OpTester test("Resize", 13); std::vector roi{}; @@ -622,7 +718,7 @@ TEST(ResizeOpTest, ResizeOpLinearUpSampleTest_4DBilinear_asymmetric_scales) { } TEST(ResizeOpTest, NhwcResizeOpLinearUpSampleTest_4DBilinear_asymmetric_uint8) { - // To test NNAPI EP, we need the sclaes/sizes to be in initializers + // To 
test NNAPI EP, we need the scales/sizes to be in initializers auto run_test = [](bool scales_in_initializer) { OpTester test("Resize", 13); std::vector roi{}; @@ -668,7 +764,7 @@ TEST(ResizeOpTest, NhwcResizeOpLinearUpSampleTest_4DBilinear_asymmetric_uint8) { } TEST(ResizeOpTest, NhwcResizeOpLinearUpSampleTest_4DBilinear_asymmetric_int8) { - // To test NNAPI EP, we need the sclaes/sizes to be in initializers + // To test NNAPI EP, we need the scales/sizes to be in initializers auto run_test = [](bool scales_in_initializer) { OpTester test("Resize", 13); std::vector roi{}; diff --git a/tools/ci_build/github/apple/coreml_supported_mlprogram_ops.md b/tools/ci_build/github/apple/coreml_supported_mlprogram_ops.md index 1bbb933f66ba4..3b3790ba06599 100644 --- a/tools/ci_build/github/apple/coreml_supported_mlprogram_ops.md +++ b/tools/ci_build/github/apple/coreml_supported_mlprogram_ops.md @@ -17,6 +17,7 @@ Keep in sync with doco generated from /docs/execution-providers/CoreML-Execution |ai.onnx:Pow|Only supports cases when both inputs are fp32.| |ai.onnx:Relu|| |ai.onnx:Reshape|| +|ai.onnx:Resize|See [resize_op_builder.cc](https://github.com/microsoft/onnxruntime/blob/main/onnxruntime/core/providers/coreml/builders/impl/resize_op_builder.cc) implementation. There are too many permutations to describe the valid combinations.| |ai.onnx:Sub|| |ai.onnx:Sigmoid|| |ai:onnx:Tanh|| From 5bec52203d980d5925a8f806c7e7c922cf694636 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Maximilian=20M=C3=BCller?= <44298237+gedoensmax@users.noreply.github.com> Date: Sat, 20 Jul 2024 06:11:04 +0200 Subject: [PATCH 26/35] [TensorRT] Enable refitting an embedded engine when provided as byte stream (#21357) ### Description This allows refitting an engine using an ONNX file not available on disk. This is important for encrypted ONNX files on disk. --- .../tensorrt/tensorrt_provider_options.h | 7 ++ .../tensorrt/onnx_ctx_model_helper.cc | 24 +++++- .../tensorrt/onnx_ctx_model_helper.h | 6 ++ .../tensorrt/tensorrt_execution_provider.cc | 81 +++++++++++++----- .../tensorrt/tensorrt_execution_provider.h | 7 +- .../tensorrt_execution_provider_info.cc | 19 +++++ .../tensorrt_execution_provider_info.h | 2 + .../tensorrt/tensorrt_provider_factory.cc | 2 + .../core/session/provider_bridge_ort.cc | 4 + .../test/perftest/command_args_parser.cc | 6 +- onnxruntime/test/perftest/ort_test_session.cc | 18 +++- .../test/perftest/test_configuration.h | 1 + .../providers/tensorrt/tensorrt_basic_test.cc | 83 +++++++++++++++++-- 13 files changed, 225 insertions(+), 35 deletions(-) diff --git a/include/onnxruntime/core/providers/tensorrt/tensorrt_provider_options.h b/include/onnxruntime/core/providers/tensorrt/tensorrt_provider_options.h index d008058821be3..816eaaf9bc71a 100644 --- a/include/onnxruntime/core/providers/tensorrt/tensorrt_provider_options.h +++ b/include/onnxruntime/core/providers/tensorrt/tensorrt_provider_options.h @@ -16,6 +16,7 @@ struct OrtTensorRTProviderOptionsV2 { int device_id{0}; // cuda device id. int has_user_compute_stream{0}; // indicator of user specified CUDA compute stream. void* user_compute_stream{nullptr}; // user specified CUDA compute stream. + // can be updated using: UpdateTensorRTProviderOptionsWithValue int trt_max_partition_iterations{1000}; // maximum iterations for TensorRT parser to get capability int trt_min_subgraph_size{1}; // minimum size of TensorRT subgraphs size_t trt_max_workspace_size{1 << 30}; // maximum workspace size for TensorRT. 
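A minimal sketch of how an application might use the byte-stream options added below (`trt_onnx_bytestream` / `trt_onnx_bytestream_size`) from the ONNX Runtime C++ API. This is illustrative only: it assumes the caller already holds the weights-containing ONNX model in memory (e.g. after decrypting it), and it encodes the pointer as a decimal address string to match the option parsing added to tensorrt_execution_provider_info.cc later in this patch. The pointer and size can alternatively be set on the V2 struct directly via `UpdateTensorRTProviderOptionsWithValue`, as the struct comments indicate.

```cpp
// Illustrative sketch (not from the patch). onnx_bytes must stay valid until the session is created,
// since the EP stores only the pointer to the byte stream.
#include <onnxruntime_cxx_api.h>

#include <string>
#include <vector>

void AppendTensorRTWithBytestream(Ort::SessionOptions& session_options,
                                  const std::vector<char>& onnx_bytes) {
  const OrtApi& api = Ort::GetApi();

  OrtTensorRTProviderOptionsV2* trt_options = nullptr;
  Ort::ThrowOnError(api.CreateTensorRTProviderOptions(&trt_options));

  // the TRT EP option parser reads the pointer as a decimal address and the size as an integer
  const std::string bytestream_addr = std::to_string(reinterpret_cast<size_t>(onnx_bytes.data()));
  const std::string bytestream_size = std::to_string(onnx_bytes.size());

  const char* keys[] = {"trt_weight_stripped_engine_enable", "trt_onnx_bytestream", "trt_onnx_bytestream_size"};
  const char* values[] = {"1", bytestream_addr.c_str(), bytestream_size.c_str()};
  Ort::ThrowOnError(api.UpdateTensorRTProviderOptions(trt_options, keys, values, 3));

  session_options.AppendExecutionProvider_TensorRT_V2(*trt_options);
  api.ReleaseTensorRTProviderOptions(trt_options);
}
```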
@@ -78,6 +79,12 @@ struct OrtTensorRTProviderOptionsV2 { const char* trt_onnx_model_folder_path{nullptr}; // Folder path relative to the current working directory for // the ONNX model containing the weights (applicable only when // the "trt_weight_stripped_engine_enable" option is enabled) + const void* trt_onnx_bytestream{nullptr}; // The byte stream of th original ONNX model containing the weights + // (applicable only when the "trt_weight_stripped_engine_enable" + // option is enabled) + // can be updated using: UpdateTensorRTProviderOptionsWithValue + size_t trt_onnx_bytestream_size{0}; // size of the byte stream provided as "trt_onnx_bytestream" + // can be updated using: UpdateTensorRTProviderOptionsWithValue const char* trt_engine_cache_prefix{nullptr}; // specify engine cache prefix int trt_engine_hw_compatible{0}; // Enable hardware compatibility. Default 0 = false, nonzero = true diff --git a/onnxruntime/core/providers/tensorrt/onnx_ctx_model_helper.cc b/onnxruntime/core/providers/tensorrt/onnx_ctx_model_helper.cc index 42788f2960197..ef45d6c85d6a9 100644 --- a/onnxruntime/core/providers/tensorrt/onnx_ctx_model_helper.cc +++ b/onnxruntime/core/providers/tensorrt/onnx_ctx_model_helper.cc @@ -274,6 +274,9 @@ Status TensorRTCacheModelHandler::GetEpContextFromGraph(const GraphViewer& graph auto& attrs = node->GetAttributes(); const int64_t embed_mode = attrs.at(EMBED_MODE).i(); + // Only make path checks if model not provided as byte buffer + bool make_secure_path_checks = !GetModelPath(graph_viewer).empty(); + if (embed_mode) { // Get engine from byte stream. const std::string& context_binary = attrs.at(EP_CACHE_CONTEXT).s(); @@ -284,6 +287,23 @@ Status TensorRTCacheModelHandler::GetEpContextFromGraph(const GraphViewer& graph return ORT_MAKE_STATUS(ONNXRUNTIME, EP_FAIL, "TensorRT EP could not deserialize engine from binary data"); } + + if (weight_stripped_engine_refit_) { + const std::string onnx_model_filename = attrs.at(ONNX_MODEL_FILENAME).s(); + std::string placeholder; + auto status = TensorrtExecutionProvider::RefitEngine(onnx_model_filename, + onnx_model_folder_path_, + placeholder, + make_secure_path_checks, + onnx_model_bytestream_, + onnx_model_bytestream_size_, + (*trt_engine_).get(), + false /* serialize refitted engine to disk */, + detailed_build_log_); + if (status != Status::OK()) { + return ORT_MAKE_STATUS(ONNXRUNTIME, EP_FAIL, status.ErrorMessage()); + } + } } else { // Get engine from cache file. 
std::string cache_path = attrs.at(EP_CACHE_CONTEXT).s(); @@ -343,7 +363,9 @@ Status TensorRTCacheModelHandler::GetEpContextFromGraph(const GraphViewer& graph auto status = TensorrtExecutionProvider::RefitEngine(onnx_model_filename, onnx_model_folder_path_, weight_stripped_engine_cache, - true /* path check for security */, + make_secure_path_checks, + onnx_model_bytestream_, + onnx_model_bytestream_size_, (*trt_engine_).get(), true /* serialize refitted engine to disk */, detailed_build_log_); diff --git a/onnxruntime/core/providers/tensorrt/onnx_ctx_model_helper.h b/onnxruntime/core/providers/tensorrt/onnx_ctx_model_helper.h index 3be08d043da48..3af0143cbf14e 100644 --- a/onnxruntime/core/providers/tensorrt/onnx_ctx_model_helper.h +++ b/onnxruntime/core/providers/tensorrt/onnx_ctx_model_helper.h @@ -52,6 +52,8 @@ class TensorRTCacheModelHandler { std::string compute_capability, bool weight_stripped_engine_refit, std::string onnx_model_folder_path, + const void* onnx_model_bytestream, + size_t onnx_model_bytestream_size, bool detailed_build_log) : trt_engine_(trt_engine), trt_runtime_(trt_runtime), @@ -59,6 +61,8 @@ class TensorRTCacheModelHandler { compute_capability_(compute_capability), weight_stripped_engine_refit_(weight_stripped_engine_refit), onnx_model_folder_path_(onnx_model_folder_path), + onnx_model_bytestream_(onnx_model_bytestream), + onnx_model_bytestream_size_(onnx_model_bytestream_size), detailed_build_log_(detailed_build_log) { } ORT_DISALLOW_COPY_ASSIGNMENT_AND_MOVE(TensorRTCacheModelHandler); @@ -74,6 +78,8 @@ class TensorRTCacheModelHandler { std::string compute_capability_; bool weight_stripped_engine_refit_; std::string onnx_model_folder_path_; + const void* onnx_model_bytestream_; + size_t onnx_model_bytestream_size_; bool detailed_build_log_; }; // TRTCacheModelHandler } // namespace onnxruntime diff --git a/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.cc b/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.cc index 67cbc8f5d6f13..cdbb7bb2a8094 100644 --- a/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.cc +++ b/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.cc @@ -1333,6 +1333,14 @@ TensorrtExecutionProvider::TensorrtExecutionProvider(const TensorrtExecutionProv engine_cache_enable_ = info.engine_cache_enable; weight_stripped_engine_enable_ = info.weight_stripped_engine_enable; onnx_model_folder_path_ = info.onnx_model_folder_path; + onnx_model_bytestream_ = info.onnx_bytestream; + onnx_model_bytestream_size_ = info.onnx_bytestream_size; + if ((onnx_model_bytestream_ != nullptr && onnx_model_bytestream_size_ == 0) || + (onnx_model_bytestream_ == nullptr && onnx_model_bytestream_size_ != 0)) { + ORT_THROW_IF_ERROR(ORT_MAKE_STATUS(ONNXRUNTIME, EP_FAIL, + "When providing either 'trt_onnx_bytestream_size' or " + "'trt_onnx_bytestream' both have to be provided")); + } timing_cache_enable_ = info.timing_cache_enable; force_timing_cache_match_ = info.force_timing_cache; detailed_build_log_ = info.detailed_build_log; @@ -1757,7 +1765,8 @@ TensorrtExecutionProvider::TensorrtExecutionProvider(const TensorrtExecutionProv << ", trt_ep_context_file_path: " << ep_context_file_path_ << ", trt_ep_context_embed_mode: " << ep_context_embed_mode_ << ", trt_cache_prefix: " << cache_prefix_ - << ", trt_engine_hw_compatible: " << engine_hw_compatible_; + << ", trt_engine_hw_compatible: " << engine_hw_compatible_ + << ", trt_onnx_model_bytestream_size_: " << onnx_model_bytestream_size_; } 
TensorrtExecutionProvider::~TensorrtExecutionProvider() { @@ -2597,28 +2606,42 @@ common::Status TensorrtExecutionProvider::RefitEngine(std::string onnx_model_fil std::string& onnx_model_folder_path, std::string& weight_stripped_engine_cath_path, bool path_check, + const void* onnx_model_bytestream, + size_t onnx_model_bytestream_size, nvinfer1::ICudaEngine* trt_engine, bool serialize_refitted_engine, bool detailed_build_log) { #if NV_TENSORRT_MAJOR >= 10 + bool refit_from_file = onnx_model_bytestream == nullptr && onnx_model_bytestream_size == 0; std::filesystem::path onnx_model_path{onnx_model_folder_path}; - onnx_model_path.append(onnx_model_filename); - if (path_check && IsAbsolutePath(onnx_model_path.string())) { - return ORT_MAKE_STATUS(ONNXRUNTIME, EP_FAIL, - "For security purpose, the ONNX model path should be set with " - "a relative path, but it is an absolute path: " + - onnx_model_path.string()); - } - if (path_check && IsRelativePathToParentPath(onnx_model_path.string())) { - return ORT_MAKE_STATUS(ONNXRUNTIME, EP_FAIL, - "The ONNX model path has '..'. For security purpose, it's not " - "allowed to point outside the directory."); - } + if (refit_from_file) { + if (!onnx_model_filename.empty()) { + onnx_model_path.append(onnx_model_filename); + } + if (onnx_model_path.empty()) { + return ORT_MAKE_STATUS(ONNXRUNTIME, EP_FAIL, + "The ONNX model was not provided as path. " + "Please use provide an ONNX bytestream to enable refitting the weightless engine."); + } else { + // check if file path to ONNX is legal + if (path_check && IsAbsolutePath(onnx_model_path.string())) { + return ORT_MAKE_STATUS(ONNXRUNTIME, EP_FAIL, + "For security purpose, the ONNX model path should be set with " + "a relative path, but it is an absolute path: " + + onnx_model_path.string()); + } + if (path_check && IsRelativePathToParentPath(onnx_model_path.string())) { + return ORT_MAKE_STATUS(ONNXRUNTIME, EP_FAIL, + "The ONNX model path has '..'. 
For security purpose, it's not " + "allowed to point outside the directory."); + } - if (!std::filesystem::exists(onnx_model_path)) { - return ORT_MAKE_STATUS(ONNXRUNTIME, EP_FAIL, - "The ONNX model " + onnx_model_path.string() + - " does not exist."); + if (!(std::filesystem::exists(onnx_model_path) && std::filesystem::is_regular_file(onnx_model_path))) { + return ORT_MAKE_STATUS(ONNXRUNTIME, EP_FAIL, + "The ONNX model " + onnx_model_path.string() + + " does not exist."); + } + } } // weight-stripped engine refit logic @@ -2626,9 +2649,18 @@ common::Status TensorrtExecutionProvider::RefitEngine(std::string onnx_model_fil auto refitter = std::unique_ptr(nvinfer1::createInferRefitter(*trt_engine, trt_logger)); auto parser_refitter = std::unique_ptr( nvonnxparser::createParserRefitter(*refitter, trt_logger)); - if (!parser_refitter->refitFromFile(onnx_model_path.string().c_str())) { - return ORT_MAKE_STATUS(ONNXRUNTIME, EP_FAIL, - "TensorRT EP's IParserRefitter could not refit deserialized weight-stripped engine with weights contained in: " + onnx_model_path.string()); + if (refit_from_file) { + LOGS_DEFAULT(VERBOSE) << "[TensorRT EP] Refitting from file on disk: " << onnx_model_path.string(); + if (!parser_refitter->refitFromFile(onnx_model_path.string().c_str())) { + return ORT_MAKE_STATUS(ONNXRUNTIME, EP_FAIL, + "TensorRT EP's IParserRefitter could not refit deserialized weight-stripped engine with weights contained in: " + onnx_model_path.string()); + } + } else { + LOGS_DEFAULT(VERBOSE) << "[TensorRT EP] Refitting from byte array"; + if (!parser_refitter->refitFromBytes(onnx_model_bytestream, onnx_model_bytestream_size)) { + return ORT_MAKE_STATUS(ONNXRUNTIME, EP_FAIL, + "TensorRT EP's IParserRefitter could not refit deserialized weight-stripped engine with weights contained in the provided bytestraem"); + } } if (refitter->refitCudaEngine()) { LOGS_DEFAULT(VERBOSE) << "[TensorRT EP] Successfully refitted the weight-stripped engine."; @@ -3212,10 +3244,15 @@ Status TensorrtExecutionProvider::CreateNodeComputeInfoFromGraph(const GraphView } if (weight_stripped_engine_refit_) { + LOGS_DEFAULT(VERBOSE) << "[TensorRT EP] Refit engine from main ONNX file after engine build"; + char* onnx = string_buf.data(); + size_t onnx_size = string_buf.size(); auto status = RefitEngine(model_path_, onnx_model_folder_path_, engine_cache_path, false /* path check for security */, + onnx, + onnx_size, trt_engine.get(), true /* serialize refitted engine to disk */, detailed_build_log_); @@ -3685,6 +3722,8 @@ Status TensorrtExecutionProvider::CreateNodeComputeInfoFromGraph(const GraphView onnx_model_folder_path_, engine_cache_path, false /* path check for security */, + onnx_model_bytestream_, + onnx_model_bytestream_size_, trt_engine, true /* serialize refitted engine to disk */, detailed_build_log_); @@ -3910,6 +3949,8 @@ Status TensorrtExecutionProvider::CreateNodeComputeInfoFromPrecompiledEngine(con compute_capability_, weight_stripped_engine_enable_, onnx_model_folder_path_, + onnx_model_bytestream_, + onnx_model_bytestream_size_, detailed_build_log_); auto status = trt_cache_model_handler.GetEpContextFromGraph(graph_body_viewer); if (status != Status::OK()) { diff --git a/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.h b/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.h index b58e86237860c..3f20314438564 100644 --- a/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.h +++ b/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.h @@ -274,13 
+274,12 @@ class TensorrtExecutionProvider : public IExecutionProvider { bool IsGraphCaptured(int graph_annotation_id) const override; Status ReplayGraph(int graph_annotation_id) override; - /** - * Refit the weight-stripped engine - */ static common::Status RefitEngine(std::string onnx_model_filename, std::string& onnx_model_folder_path, std::string& weight_stripped_engine_cath_path, bool path_check, + const void* onnx_model_bytestream, + size_t onnx_model_bytestream_size, nvinfer1::ICudaEngine* trt_engine, bool serialize_refitted_engine, bool detailed_build_log); @@ -305,6 +304,8 @@ class TensorrtExecutionProvider : public IExecutionProvider { bool weight_stripped_engine_enable_ = false; bool weight_stripped_engine_refit_ = false; std::string onnx_model_folder_path_; + const void* onnx_model_bytestream_; + size_t onnx_model_bytestream_size_; bool build_heuristics_enable_ = false; bool sparsity_enable_ = false; int builder_optimization_level_ = 3; diff --git a/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider_info.cc b/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider_info.cc index 9fe39f5921e1c..63b6d35072290 100644 --- a/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider_info.cc +++ b/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider_info.cc @@ -54,6 +54,8 @@ constexpr const char* kEpContextEmbedMode = "trt_ep_context_embed_mode"; constexpr const char* kEpContextFilePath = "trt_ep_context_file_path"; constexpr const char* kDumpEpContextModel = "trt_dump_ep_context_model"; constexpr const char* kEngineHwCompatible = "trt_engine_hw_compatible"; +constexpr const char* kONNXBytestream = "trt_onnx_bytestream"; +constexpr const char* kONNXBytestreamSize = "trt_onnx_bytestream_size"; } // namespace provider_option_names } // namespace tensorrt @@ -61,6 +63,7 @@ constexpr const char* kEngineHwCompatible = "trt_engine_hw_compatible"; TensorrtExecutionProviderInfo TensorrtExecutionProviderInfo::FromProviderOptions(const ProviderOptions& options) { TensorrtExecutionProviderInfo info{}; void* user_compute_stream = nullptr; + void* onnx_bytestream = nullptr; ORT_THROW_IF_ERROR( ProviderOptionsParser{} .AddValueParser( @@ -122,10 +125,20 @@ TensorrtExecutionProviderInfo TensorrtExecutionProviderInfo::FromProviderOptions .AddAssignmentToReference(tensorrt::provider_option_names::kEpContextFilePath, info.ep_context_file_path) .AddAssignmentToReference(tensorrt::provider_option_names::kEpContextEmbedMode, info.ep_context_embed_mode) .AddAssignmentToReference(tensorrt::provider_option_names::kEngineHwCompatible, info.engine_hw_compatible) + .AddValueParser( + tensorrt::provider_option_names::kONNXBytestream, + [&onnx_bytestream](const std::string& value_str) -> Status { + size_t address; + ORT_RETURN_IF_ERROR(ParseStringWithClassicLocale(value_str, address)); + onnx_bytestream = reinterpret_cast(address); + return Status::OK(); + }) + .AddAssignmentToReference(tensorrt::provider_option_names::kONNXBytestreamSize, info.onnx_bytestream_size) .Parse(options)); // add new provider option here. 
info.user_compute_stream = user_compute_stream; info.has_user_compute_stream = (user_compute_stream != nullptr); + info.onnx_bytestream = onnx_bytestream; return info; } @@ -173,6 +186,8 @@ ProviderOptions TensorrtExecutionProviderInfo::ToProviderOptions(const TensorrtE {tensorrt::provider_option_names::kEpContextFilePath, MakeStringWithClassicLocale(info.ep_context_file_path)}, {tensorrt::provider_option_names::kEpContextEmbedMode, MakeStringWithClassicLocale(info.ep_context_embed_mode)}, {tensorrt::provider_option_names::kEngineHwCompatible, MakeStringWithClassicLocale(info.engine_hw_compatible)}, + {tensorrt::provider_option_names::kONNXBytestream, MakeStringWithClassicLocale(info.onnx_bytestream)}, + {tensorrt::provider_option_names::kONNXBytestreamSize, MakeStringWithClassicLocale(info.onnx_bytestream_size)}, }; return options; } @@ -234,6 +249,8 @@ ProviderOptions TensorrtExecutionProviderInfo::ToProviderOptions(const OrtTensor {tensorrt::provider_option_names::kDumpEpContextModel, MakeStringWithClassicLocale(info.trt_dump_ep_context_model)}, {tensorrt::provider_option_names::kEpContextEmbedMode, MakeStringWithClassicLocale(info.trt_ep_context_embed_mode)}, {tensorrt::provider_option_names::kEngineHwCompatible, MakeStringWithClassicLocale(info.trt_engine_hw_compatible)}, + {tensorrt::provider_option_names::kONNXBytestream, MakeStringWithClassicLocale(reinterpret_cast(info.trt_onnx_bytestream))}, + {tensorrt::provider_option_names::kONNXBytestreamSize, MakeStringWithClassicLocale(info.trt_onnx_bytestream_size)}, }; return options; } @@ -336,5 +353,7 @@ void TensorrtExecutionProviderInfo::UpdateProviderOptions(void* provider_options trt_provider_options_v2.trt_ep_context_embed_mode = internal_options.ep_context_embed_mode; trt_provider_options_v2.trt_ep_context_file_path = copy_string_if_needed(internal_options.ep_context_file_path); trt_provider_options_v2.trt_engine_hw_compatible = internal_options.engine_hw_compatible; + trt_provider_options_v2.trt_onnx_bytestream = internal_options.onnx_bytestream; + trt_provider_options_v2.trt_onnx_bytestream_size = internal_options.onnx_bytestream_size; } } // namespace onnxruntime diff --git a/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider_info.h b/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider_info.h index 3b859ea2da466..50b934fd5fcbc 100644 --- a/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider_info.h +++ b/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider_info.h @@ -34,6 +34,8 @@ struct TensorrtExecutionProviderInfo { std::string engine_cache_path{""}; bool weight_stripped_engine_enable{false}; std::string onnx_model_folder_path{""}; + const void* onnx_bytestream{nullptr}; + size_t onnx_bytestream_size{0}; bool engine_decryption_enable{false}; std::string engine_decryption_lib_path{""}; bool force_sequential_engine_build{false}; diff --git a/onnxruntime/core/providers/tensorrt/tensorrt_provider_factory.cc b/onnxruntime/core/providers/tensorrt/tensorrt_provider_factory.cc index 6430ffab09976..e242788ff389a 100644 --- a/onnxruntime/core/providers/tensorrt/tensorrt_provider_factory.cc +++ b/onnxruntime/core/providers/tensorrt/tensorrt_provider_factory.cc @@ -116,6 +116,8 @@ struct Tensorrt_Provider : Provider { info.ep_context_embed_mode = options.trt_ep_context_embed_mode; info.engine_cache_prefix = options.trt_engine_cache_prefix == nullptr ? 
"" : options.trt_engine_cache_prefix; info.engine_hw_compatible = options.trt_engine_hw_compatible != 0; + info.onnx_bytestream = options.trt_onnx_bytestream; + info.onnx_bytestream_size = options.trt_onnx_bytestream_size; return std::make_shared(info); } diff --git a/onnxruntime/core/session/provider_bridge_ort.cc b/onnxruntime/core/session/provider_bridge_ort.cc index 4f9669a7dcc4c..1d21933e9cba9 100644 --- a/onnxruntime/core/session/provider_bridge_ort.cc +++ b/onnxruntime/core/session/provider_bridge_ort.cc @@ -2465,6 +2465,10 @@ ORT_API_STATUS_IMPL(OrtApis::UpdateTensorRTProviderOptionsWithValue, if (strcmp(key, "user_compute_stream") == 0) { tensorrt_options->has_user_compute_stream = 1; tensorrt_options->user_compute_stream = value; + } else if (strcmp(key, "trt_onnx_bytestream") == 0) { + tensorrt_options->trt_onnx_bytestream = value; + } else if (strcmp(key, "trt_onnx_bytestream_size") == 0) { + tensorrt_options->trt_onnx_bytestream_size = *reinterpret_cast(value); } return nullptr; #else diff --git a/onnxruntime/test/perftest/command_args_parser.cc b/onnxruntime/test/perftest/command_args_parser.cc index b7c99fa66a1ea..e6d4e0a94abd3 100644 --- a/onnxruntime/test/perftest/command_args_parser.cc +++ b/onnxruntime/test/perftest/command_args_parser.cc @@ -143,6 +143,7 @@ namespace perftest { "\t-D [Disable thread spinning]: disable spinning entirely for thread owned by onnxruntime intra-op thread pool.\n" "\t-Z [Force thread to stop spinning between runs]: disallow thread from spinning during runs to reduce cpu usage.\n" "\t-n [Exit after session creation]: allow user to measure session creation time to measure impact of enabling any initialization optimizations.\n" + "\t-l Provide file as binary in memory by using fopen before session creation.\n" "\t-h: help\n"); } #ifdef _WIN32 @@ -205,7 +206,7 @@ static bool ParseSessionConfigs(const std::string& configs_string, /*static*/ bool CommandLineParser::ParseArguments(PerformanceTestConfig& test_config, int argc, ORTCHAR_T* argv[]) { int ch; - while ((ch = getopt(argc, argv, ORT_TSTR("m:e:r:t:p:x:y:c:d:o:u:i:f:F:S:T:C:AMPIDZvhsqzn"))) != -1) { + while ((ch = getopt(argc, argv, ORT_TSTR("m:e:r:t:p:x:y:c:d:o:u:i:f:F:S:T:C:AMPIDZvhsqznl"))) != -1) { switch (ch) { case 'f': { std::basic_string dim_name; @@ -390,6 +391,9 @@ static bool ParseSessionConfigs(const std::string& configs_string, case 'n': test_config.run_config.exit_after_session_creation = true; break; + case 'l': + test_config.model_info.load_via_path = true; + break; case '?': case 'h': default: diff --git a/onnxruntime/test/perftest/ort_test_session.cc b/onnxruntime/test/perftest/ort_test_session.cc index ff782da35cbe6..92d732fba2a0a 100644 --- a/onnxruntime/test/perftest/ort_test_session.cc +++ b/onnxruntime/test/perftest/ort_test_session.cc @@ -5,6 +5,7 @@ #include "ort_test_session.h" #include #include +#include #include #include #include @@ -816,8 +817,21 @@ select from 'TF8', 'TF16', 'UINT8', 'FLOAT', 'ITENSOR'. 
\n)"); #endif } - session_ = Ort::Session(env, performance_test_config.model_info.model_file_path.c_str(), session_options); - + if (!performance_test_config.model_info.load_via_path) { + session_ = Ort::Session(env, performance_test_config.model_info.model_file_path.c_str(), session_options); + } else { + std::ifstream file(performance_test_config.model_info.model_file_path.c_str(), + std::ios::binary | std::ios::in | std::ios::ate); + if (file.is_open()) { + const std::streamsize fsize = file.tellg(); + file.seekg(0, std::ios_base::beg); + std::vector model_bytes(narrow(fsize)); + file.read(model_bytes.data(), fsize); + session_ = Ort::Session(env, model_bytes.data(), model_bytes.size(), session_options); + } else { + ORT_THROW("Model file could not be opened.\n"); + } + } size_t output_count = session_.GetOutputCount(); output_names_.resize(output_count); Ort::AllocatorWithDefaultOptions a; diff --git a/onnxruntime/test/perftest/test_configuration.h b/onnxruntime/test/perftest/test_configuration.h index 70a6b12690d5d..209fb55fe93d4 100644 --- a/onnxruntime/test/perftest/test_configuration.h +++ b/onnxruntime/test/perftest/test_configuration.h @@ -29,6 +29,7 @@ struct ModelInfo { std::basic_string model_file_path; std::basic_string input_file_path; std::basic_string result_file_path; + bool load_via_path = false; }; struct MachineConfig { diff --git a/onnxruntime/test/providers/tensorrt/tensorrt_basic_test.cc b/onnxruntime/test/providers/tensorrt/tensorrt_basic_test.cc index 2b5b82d0fc16a..63327a028c6f4 100644 --- a/onnxruntime/test/providers/tensorrt/tensorrt_basic_test.cc +++ b/onnxruntime/test/providers/tensorrt/tensorrt_basic_test.cc @@ -122,6 +122,18 @@ void CreateBaseModel(const PathString& model_name, status = onnxruntime::Model::Save(model, model_name); } +std::vector ReadFileFromDisk(const PathString& path) { + std::fstream file(path.c_str(), std::fstream::binary | std::fstream::in | std::fstream::ate); + std::vector file_bytes; + if (file.is_open()) { + auto fsize = file.tellg(); + file.seekg(0, std::ios_base::beg); + file_bytes.resize(fsize); + file.read(file_bytes.data(), fsize); + } + return file_bytes; +} + bool HasCacheFileWithPrefix(const std::string& prefix, std::string file_dir = "") { std::filesystem::path target_dir; if (file_dir.empty()) { @@ -360,7 +372,8 @@ TEST(TensorrtExecutionProviderTest, TRTModelIdGeneratorUsingModelHashing) { } TEST(TensorrtExecutionProviderTest, EPContextNode) { - PathString model_name = ORT_TSTR("EPContextNode_test.onnx"); + std::string model_name_str = "EPContextNode_test.onnx"; + PathString model_name = ToPathString(model_name_str); std::string graph_name = "EPContextNode_test"; std::string sess_log_id = "EPContextNode_test"; std::vector dims = {1, 3, 2}; @@ -461,11 +474,11 @@ TEST(TensorrtExecutionProviderTest, EPContextNode) { */ InferenceSession session_object3{so, GetEnvironment()}; OrtTensorRTProviderOptionsV2 params3; - model_name = ToPathString(params.trt_ep_context_file_path); + PathString ctx_model_name = ToPathString(params.trt_ep_context_file_path); params3.trt_engine_cache_enable = 1; execution_provider = TensorrtExecutionProviderWithOptions(¶ms3); EXPECT_TRUE(session_object3.RegisterExecutionProvider(std::move(execution_provider)).IsOK()); - status = session_object3.Load(model_name); + status = session_object3.Load(ctx_model_name); ASSERT_TRUE(status.IsOK()); status = session_object3.Initialize(); ASSERT_TRUE(status.IsOK()); @@ -490,10 +503,10 @@ TEST(TensorrtExecutionProviderTest, EPContextNode) { */ InferenceSession 
session_object4{so, GetEnvironment()}; OrtTensorRTProviderOptionsV2 params4; - model_name = ORT_TSTR("./context_model_folder/EPContextNode_test_ctx.onnx"); + ctx_model_name = ToPathString("./context_model_folder/EPContextNode_test_ctx.onnx"); execution_provider = TensorrtExecutionProviderWithOptions(&params4); EXPECT_TRUE(session_object4.RegisterExecutionProvider(std::move(execution_provider)).IsOK()); - status = session_object4.Load(model_name); + status = session_object4.Load(ctx_model_name); ASSERT_TRUE(status.IsOK()); status = session_object4.Initialize(); ASSERT_TRUE(status.IsOK()); @@ -514,7 +527,6 @@ TEST(TensorrtExecutionProviderTest, EPContextNode) { params5.trt_dump_ep_context_model = 1; params5.trt_ep_context_embed_mode = 1; params5.trt_ep_context_file_path = "EP_Context_model_2.onnx"; - model_name = ORT_TSTR("EPContextNode_test.onnx"); execution_provider = TensorrtExecutionProviderWithOptions(&params5); EXPECT_TRUE(session_object5.RegisterExecutionProvider(std::move(execution_provider)).IsOK()); status = session_object5.Load(model_name); @@ -528,10 +540,10 @@ TEST(TensorrtExecutionProviderTest, EPContextNode) { InferenceSession session_object6{so, GetEnvironment()}; OrtTensorRTProviderOptionsV2 params6; params6.trt_ep_context_embed_mode = 1; - model_name = ToPathString(params5.trt_ep_context_file_path); + ctx_model_name = ToPathString(params5.trt_ep_context_file_path); execution_provider = TensorrtExecutionProviderWithOptions(&params6); EXPECT_TRUE(session_object6.RegisterExecutionProvider(std::move(execution_provider)).IsOK()); - status = session_object6.Load(model_name); + status = session_object6.Load(ctx_model_name); ASSERT_TRUE(status.IsOK()); status = session_object6.Initialize(); ASSERT_TRUE(status.IsOK()); @@ -543,6 +555,61 @@ TEST(TensorrtExecutionProviderTest, EPContextNode) { // Y: 1, 3, 3, 2, 2, 2 // Z: 1, 3, 3, 2, 2, 2 RunSession(session_object6, run_options, feeds, output_names, expected_dims_mul_m, expected_values_mul_m); + + /* + * Test case 7: Run context model with ONNX in memory + */ + auto model_bytes = ReadFileFromDisk(model_name); + std::string ctx_model_name_str = "EP_Context_model_weight_stripped.onnx"; + ctx_model_name = ToPathString(ctx_model_name_str); + InferenceSession session_object7{so, GetEnvironment()}; + OrtTensorRTProviderOptionsV2 params7; + params7.trt_dump_ep_context_model = 1; + params7.trt_ep_context_embed_mode = 1; + params7.trt_weight_stripped_engine_enable = 1; + params7.trt_ep_context_file_path = ctx_model_name_str.c_str(); + execution_provider = TensorrtExecutionProviderWithOptions(&params7); + EXPECT_TRUE(session_object7.RegisterExecutionProvider(std::move(execution_provider)).IsOK()); + status = session_object7.Load(model_bytes.data(), static_cast(model_bytes.size())); + ASSERT_TRUE(status.IsOK()); + status = session_object7.Initialize(); + std::cerr << status.ErrorMessage(); + ASSERT_TRUE(status.IsOK()); + RunSession(session_object7, run_options, feeds, output_names, expected_dims_mul_m, expected_values_mul_m); + + /* + * Test case 8: Refit weightless context model with ONNX in memory + */ + auto ctx_model_bytes = ReadFileFromDisk(ctx_model_name); + InferenceSession session_object8{so, GetEnvironment()}; + OrtTensorRTProviderOptionsV2 params8; + params8.trt_weight_stripped_engine_enable = 1; + params8.trt_onnx_bytestream = model_bytes.data(); + params8.trt_onnx_bytestream_size = model_bytes.size(); + execution_provider = TensorrtExecutionProviderWithOptions(&params8); +
EXPECT_TRUE(session_object8.RegisterExecutionProvider(std::move(execution_provider)).IsOK()); + status = session_object8.Load(ctx_model_bytes.data(), static_cast(ctx_model_bytes.size())); + std::cerr << status.ErrorMessage(); + ASSERT_TRUE(status.IsOK()); + status = session_object8.Initialize(); + std::cerr << status.ErrorMessage(); + ASSERT_TRUE(status.IsOK()); + RunSession(session_object8, run_options, feeds, output_names, expected_dims_mul_m, expected_values_mul_m); + + /* + * Test case 9: Refit weightless context model with ONNX from disk + */ + InferenceSession session_object9{so, GetEnvironment()}; + OrtTensorRTProviderOptionsV2 params9; + params9.trt_weight_stripped_engine_enable = 1; + params9.trt_onnx_model_folder_path = model_name_str.c_str(); + execution_provider = TensorrtExecutionProviderWithOptions(&params9); + EXPECT_TRUE(session_object9.RegisterExecutionProvider(std::move(execution_provider)).IsOK()); + status = session_object9.Load(ctx_model_bytes.data(), static_cast(ctx_model_bytes.size())); + ASSERT_TRUE(status.IsOK()); + status = session_object9.Initialize(); + ASSERT_TRUE(status.IsOK()); + RunSession(session_object9, run_options, feeds, output_names, expected_dims_mul_m, expected_values_mul_m); } TEST(TensorrtExecutionProviderTest, TRTPluginsCustomOpTest) { From 11bf3097360d271e8c0d6f26683a8b16477e6c42 Mon Sep 17 00:00:00 2001 From: Jing Fang <126209182+fajin-corp@users.noreply.github.com> Date: Fri, 19 Jul 2024 22:55:15 -0700 Subject: [PATCH 27/35] add transform part of the dq matmul tool chain (#21374) ### Description This is a partial change from [fajin/qdqmatmulnbitstoolchain](https://github.com/microsoft/onnxruntime/pull/21180). The original PR is blocked by Web CI failures. MatMulNBits is a heavily optimized matmul operation. Currently a MatMul can be converted to MatMulNBits to speed up model inference. However, MatMulNBits is an ORT-only op. To make the graph compatible with ONNX ops and utilize MatMulNBits at the same time, we introduce Q/DQ support for MatMulNBits. To convert MatMul ops in a model to MatMulNBits: 1. use matmul_4bits_quantizer.py to convert MatMul to DQ + MatMul using QDQ mode. 2. In the ORT session, DQ + MatMul is fused to MatMulNBits. #### Note MatMulNBits assumes the B weight is uint4. When no zp is provided, zp defaults to 8, which is different from DQ. DQ defaults zp to 0 when no zp is provided. And DQ supports int4. Therefore some conversions are introduced during the DQ + MatMul --> MatMulNBits step. #### Perf Using the QDQ format will increase the model initialization time and memory consumption. With the current implementation, model init time increased from ~4s to ~9s, and memory consumption increased from ~2.8GB to ~4.8GB. The memory increase is due to: 1. in the optimizer, after transposing the B weight, an in-memory tensor proto is created using protobuf's arena; 2. in the finalize step, when saving initializers and prepacking, the ORT arena is used to create buffers for initializers. The memory allocated by arenas cannot be fully deallocated. If ORT arena memory allocation is disabled, the memory consumption of both the QDQ format and the original format is ~2.2GB. The time increase is mainly due to multiple memory copies, but this can be further optimized. ### Motivation and Context Please see the description for details.
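To make the intended workflow concrete, here is a minimal usage sketch (an illustration added for this write-up, not code from the patch). It assumes the standard ONNX Runtime C++ API; the model file name and the accuracy-level value of "2" are placeholders, while the config key string and its 0-4 range come from the kOrtSessionOptionsQDQMatMulNBitsAccuracyLevel option introduced below.

```cpp
// Sketch: load a model whose MatMul weights were pre-converted to DQ + MatMul (QDQ mode)
// and pick the accuracy level that the fused MatMulNBits nodes should use.
// Assumptions: standard ORT C++ API; "model_qdq_int4.onnx" is a placeholder file name.
#include <onnxruntime_cxx_api.h>

int main() {
  Ort::Env env(ORT_LOGGING_LEVEL_WARNING, "qdq_matmulnbits_demo");

  Ort::SessionOptions session_options;
  // Maps to kOrtSessionOptionsQDQMatMulNBitsAccuracyLevel; valid values are 0-4,
  // and 4 is used when the entry is not set.
  session_options.AddConfigEntry("session.qdq_matmulnbits_accuracy_level", "2");

  // During session initialization, the Level 2 QDQ transformer fuses each eligible
  // DQ + MatMul pair into a MatMulNBits node carrying the configured accuracy_level.
  Ort::Session session(env, ORT_TSTR("model_qdq_int4.onnx"), session_options);
  return 0;
}
```

The chosen value is forwarded to each generated MatMulNBits node's accuracy_level attribute; refer to the MatMulNBits op schema for the meaning of the individual levels.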
--- .../core/optimizer/graph_transformer_utils.h | 7 +- .../onnxruntime_session_options_config_keys.h | 5 + .../core/optimizer/graph_transformer_utils.cc | 26 +- .../selectors_actions/qdq_actions.cc | 173 ++++++- .../selectors_actions/qdq_actions.h | 29 ++ .../qdq_selector_action_transformer.cc | 39 +- .../qdq_selector_action_transformer.h | 6 +- .../selectors_actions/qdq_selectors.cc | 85 ++++ .../selectors_actions/qdq_selectors.h | 15 + .../optimizer/selectors_actions/actions.cc | 4 +- .../optimizer/selectors_actions/actions.h | 7 +- onnxruntime/core/session/inference_session.cc | 14 +- onnxruntime/test/common/random_generator.h | 17 + .../optimizer/graph_transform_test_builder.h | 16 - .../qdq_matmulnbits_transformer_test.cc | 425 ++++++++++++++++++ onnxruntime/test/optimizer/qdq_test_utils.h | 2 +- 16 files changed, 833 insertions(+), 37 deletions(-) create mode 100644 onnxruntime/test/optimizer/qdq_matmulnbits_transformer_test.cc diff --git a/include/onnxruntime/core/optimizer/graph_transformer_utils.h b/include/onnxruntime/core/optimizer/graph_transformer_utils.h index e609745b5e03f..0bb5c7432f0a7 100644 --- a/include/onnxruntime/core/optimizer/graph_transformer_utils.h +++ b/include/onnxruntime/core/optimizer/graph_transformer_utils.h @@ -10,6 +10,7 @@ #include "core/common/inlined_containers.h" #include "core/framework/session_options.h" #include "core/optimizer/graph_transformer.h" +#include "core/platform/threadpool.h" #if !defined(ORT_MINIMAL_BUILD) #include "core/optimizer/rule_based_graph_transformer.h" @@ -49,7 +50,8 @@ InlinedVector> GenerateTransformers( TransformerLevel level, const SessionOptions& session_options, const IExecutionProvider& execution_provider /*required by constant folding*/, - const InlinedHashSet& rules_and_transformers_to_disable = {}); + const InlinedHashSet& rules_and_transformers_to_disable = {}, + concurrency::ThreadPool* intra_op_thread_pool = nullptr); #endif // !defined(ORT_MINIMAL_BUILD) @@ -78,7 +80,8 @@ InlinedVector> GenerateTransformersForMinimalB const SessionOptions& session_options, const SatApplyContextVariant& apply_context, const IExecutionProvider& cpu_execution_provider, - const InlinedHashSet& rules_and_transformers_to_disable = {}); + const InlinedHashSet& rules_and_transformers_to_disable = {}, + concurrency::ThreadPool* intra_op_thread_pool = nullptr); #endif // !defined(ORT_MINIMAL_BUILD) || defined(ORT_EXTENDED_MINIMAL_BUILD) diff --git a/include/onnxruntime/core/session/onnxruntime_session_options_config_keys.h b/include/onnxruntime/core/session/onnxruntime_session_options_config_keys.h index c32e2a77e8453..17ae649e6f174 100644 --- a/include/onnxruntime/core/session/onnxruntime_session_options_config_keys.h +++ b/include/onnxruntime/core/session/onnxruntime_session_options_config_keys.h @@ -270,3 +270,8 @@ static const char* const kOrtSessionOptionEpContextEmbedMode = "ep.context_embed // - "0": Gemm FastMath mode is not enabled. [DEFAULT] // - "1": Gemm FastMath mode is enabled. static const char* const kOrtSessionOptionsMlasGemmFastMathArm64Bfloat16 = "mlas.enable_gemm_fastmath_arm64_bfloat16"; + +// When converting DQ + MatMul -> MatMulNBits, the accuracy level of the MatMulNBits is controlled by this option. +// Refer to MatMulNBits op schema for more details. +// If not provided, default is 4. 
+static const char* const kOrtSessionOptionsQDQMatMulNBitsAccuracyLevel = "session.qdq_matmulnbits_accuracy_level"; diff --git a/onnxruntime/core/optimizer/graph_transformer_utils.cc b/onnxruntime/core/optimizer/graph_transformer_utils.cc index e6feb3e7ddbe2..7da65f18ccacb 100644 --- a/onnxruntime/core/optimizer/graph_transformer_utils.cc +++ b/onnxruntime/core/optimizer/graph_transformer_utils.cc @@ -13,6 +13,7 @@ #include "core/optimizer/qdq_transformer/selectors_actions/qdq_selector_action_transformer.h" #include "core/optimizer/selectors_actions/selector_action_transformer_apply_contexts.h" #include "core/session/onnxruntime_session_options_config_keys.h" +#include "core/platform/threadpool.h" #if !defined(ORT_MINIMAL_BUILD) @@ -187,7 +188,8 @@ InlinedVector> GenerateTransformers( TransformerLevel level, const SessionOptions& session_options, const IExecutionProvider& cpu_execution_provider, /*required by constant folding*/ - const InlinedHashSet& rules_and_transformers_to_disable) { + const InlinedHashSet& rules_and_transformers_to_disable, + concurrency::ThreadPool* intra_op_thread_pool) { InlinedVector> transformers; const bool disable_quant_qdq = session_options.config_options.GetConfigOrDefault(kOrtSessionOptionsDisableQuantQDQ, "0") == "1"; @@ -287,6 +289,10 @@ InlinedVector> GenerateTransformers( onnxruntime::kJsExecutionProvider}; const InlinedHashSet cpu_dml_eps = {onnxruntime::kCpuExecutionProvider, onnxruntime::kDmlExecutionProvider}; + const int64_t qdq_matmulnbits_accuracy_level = + ParseStringWithClassicLocale( + session_options.config_options.GetConfigOrDefault(kOrtSessionOptionsQDQMatMulNBitsAccuracyLevel, + "4")); #ifdef MLAS_TARGET_AMD64_IX86 const bool avx2_precision_mode = session_options.config_options.GetConfigOrDefault(kOrtSessionOptionsAvx2PrecisionMode, "0") == "1" && MlasPlatformU8S8Overflow(); @@ -300,7 +306,10 @@ InlinedVector> GenerateTransformers( if (!qdq_is_int8_allowed) { transformers.emplace_back(std::make_unique(avx2_precision_mode, cpu_ep)); } - transformers.emplace_back(std::make_unique(qdq_is_int8_allowed)); + transformers.emplace_back(std::make_unique(qdq_is_int8_allowed, + SatApplyContextVariant{}, + qdq_matmulnbits_accuracy_level, + intra_op_thread_pool)); } transformers.emplace_back(std::make_unique(cpu_ep)); @@ -409,7 +418,8 @@ InlinedVector> GenerateTransformersForMinimalB const SessionOptions& session_options, const SatApplyContextVariant& apply_context, const IExecutionProvider& cpu_execution_provider, - const InlinedHashSet& rules_and_transformers_to_disable) { + const InlinedHashSet& rules_and_transformers_to_disable, + concurrency::ThreadPool* intra_op_thread_pool) { InlinedVector> transformers; const bool saving = std::holds_alternative(apply_context); @@ -423,12 +433,18 @@ InlinedVector> GenerateTransformersForMinimalB const bool qdq_is_int8_allowed = session_options.config_options.GetConfigOrDefault(kOrtSessionOptionsQDQIsInt8Allowed, QDQIsInt8Allowed() ? 
"1" : "0") == "1"; - + const int64_t qdq_matmulnbits_accuracy_level = + ParseStringWithClassicLocale( + session_options.config_options.GetConfigOrDefault(kOrtSessionOptionsQDQMatMulNBitsAccuracyLevel, + "4")); // runtime optimizations only support CPU EP now const InlinedHashSet cpu_ep = {onnxruntime::kCpuExecutionProvider}; if (!disable_quant_qdq) { - transformers.emplace_back(std::make_unique(qdq_is_int8_allowed, apply_context)); + transformers.emplace_back(std::make_unique(qdq_is_int8_allowed, + apply_context, + qdq_matmulnbits_accuracy_level, + intra_op_thread_pool)); } transformers.emplace_back(std::make_unique(cpu_ep, apply_context)); diff --git a/onnxruntime/core/optimizer/qdq_transformer/selectors_actions/qdq_actions.cc b/onnxruntime/core/optimizer/qdq_transformer/selectors_actions/qdq_actions.cc index 3497ea4c85523..74fecb0427e14 100644 --- a/onnxruntime/core/optimizer/qdq_transformer/selectors_actions/qdq_actions.cc +++ b/onnxruntime/core/optimizer/qdq_transformer/selectors_actions/qdq_actions.cc @@ -2,10 +2,12 @@ // Licensed under the MIT License. #include "core/optimizer/qdq_transformer/selectors_actions/qdq_actions.h" - #include "core/optimizer/qdq_transformer/qdq_util.h" +#include "core/optimizer/initializer.h" #include "core/graph/node_attr_utils.h" #include "core/framework/tensorprotoutils.h" +#include "core/mlas/inc/mlas_q4.h" + namespace onnxruntime { namespace QDQ { @@ -273,6 +275,175 @@ Status MatMulReplaceWithQLinear::Run(Graph& graph, const NodesToOptimize& select } } +DQMatMulToMatMulNBitsAction::DQMatMulToMatMulNBitsAction(int64_t accuracy_level, + concurrency::ThreadPool* intra_op_thread_pool) + : accuracy_level_{accuracy_level}, + domain_{kMSDomain}, + op_type_{"MatMulNBits"}, + value_moves_{[]() { + NTO::NodeLocation target{NTO::NodeType::kTarget, 0}; + return std::vector{ + MoveAndAppend(target, ArgType::kInput, 0, ArgType::kInput), + MoveAll(target, ArgType::kOutput)}; + }()}, + intra_op_thread_pool_{intra_op_thread_pool} { + ORT_ENFORCE(accuracy_level_ >= 0 && accuracy_level_ <= 4, "MatMulNBits accuracy level must be between 0 and 4"); +} + +NodeAttributes +DQMatMulToMatMulNBitsAction::ExtraAttributes(const RuntimeState& runtime_state) const { + NodeAttributes extra_attributes; + + const auto* dq_node = runtime_state.selected_nodes.Input(0); + auto& attrs = dq_node->GetAttributes(); + const auto* weight_shape = dq_node->InputDefs()[0]->Shape(); + + utils::SetNodeAttribute(utils::MakeAttribute("K", weight_shape->dim(0).dim_value()), extra_attributes); + utils::SetNodeAttribute(utils::MakeAttribute("N", weight_shape->dim(1).dim_value()), extra_attributes); + utils::SetNodeAttribute(utils::MakeAttribute("accuracy_level", accuracy_level_), extra_attributes); + // currently only 4bits is supported. In the future, derive bits from DQ's weight type. + utils::SetNodeAttribute(utils::MakeAttribute("bits", static_cast(4)), extra_attributes); + utils::SetNodeAttribute(utils::MakeAttribute("block_size", attrs.at("block_size").i()), extra_attributes); + + return extra_attributes; +} + +Status DQMatMulToMatMulNBitsAction::ProcessNewNode(Graph& graph, + const NodesToOptimize& selected_nodes, + Node& replacement_node) const { + const auto* dq_node = selected_nodes.Input(0); + const auto* weight_arg = dq_node->InputDefs()[0]; + const auto* scale_arg = dq_node->InputDefs()[1]; + const auto* zp_arg = dq_node->InputDefs().size() > 2 ? 
dq_node->InputDefs()[2] : nullptr; + const auto& attrs = dq_node->GetAttributes(); + + const ONNX_NAMESPACE::TensorProto* weight_tensor_proto = nullptr; + const ONNX_NAMESPACE::TensorProto* scale_tensor_proto = nullptr; + const ONNX_NAMESPACE::TensorProto* zp_tensor_proto = nullptr; + graph.GetInitializedTensor(weight_arg->Name(), weight_tensor_proto); + graph.GetInitializedTensor(scale_arg->Name(), scale_tensor_proto); + if (zp_arg) { + graph.GetInitializedTensor(zp_arg->Name(), zp_tensor_proto); + } + + auto K = weight_arg->Shape()->dim(0).dim_value(); + auto N = weight_arg->Shape()->dim(1).dim_value(); + auto block_size = attrs.at("block_size").i(); + auto quant_num = (K + block_size - 1) / block_size; + auto blob_bytes = (block_size + 1) / 2; + + // Unfortunately iterating the source data is complicated, the data maybe in + // external file, a raw buffer, or a repeated field depending on the data + // type. UnpackTensor() already contains some of these logic and is closest + // to what we need. But it does not handle external data. + Initializer weight_src(*weight_tensor_proto, graph.ModelPath()); + Initializer scale_src(*scale_tensor_proto, graph.ModelPath()); + std::optional zp_src; + Initializer weight_dst(ONNX_NAMESPACE::TensorProto_DataType_UINT8, + graph.GenerateNodeArgName(weight_arg->Name() + "_T"), + std::vector{N, quant_num, blob_bytes}); + Initializer scale_dst(static_cast(scale_src.data_type()), + graph.GenerateNodeArgName(scale_arg->Name() + "_T"), + std::vector{N * quant_num}); + std::optional zp_dst; + + if (zp_tensor_proto) { + zp_src.emplace(*zp_tensor_proto, graph.ModelPath()); + zp_dst.emplace(ONNX_NAMESPACE::TensorProto_DataType_UINT8, + graph.GenerateNodeArgName(zp_arg->Name() + "_T"), + std::vector{N * ((quant_num + 1) / 2)}); + } else if (weight_src.data_type() == ONNX_NAMESPACE::TensorProto_DataType_UINT4) { + zp_dst.emplace(ONNX_NAMESPACE::TensorProto_DataType_UINT8, + graph.GenerateNodeArgName("fused_DQ_MatMul_zero_point_T"), + std::vector{N * ((quant_num + 1) / 2)}); + } + + if (scale_src.data_type() == ONNX_NAMESPACE::TensorProto_DataType_FLOAT) { + if (weight_src.data_type() == ONNX_NAMESPACE::TensorProto_DataType_INT4) { + MlasQDQTransposeBlockwiseQuantized( + weight_src.DataAsByteSpan().data(), + scale_src.data(), + zp_src ? zp_src->DataAsByteSpan().data() : nullptr, + weight_dst.data(), + scale_dst.data(), + zp_dst ? zp_dst->data() : nullptr, + true, + static_cast(K), + static_cast(N), + static_cast(block_size), + intra_op_thread_pool_); + } else { + MlasQDQTransposeBlockwiseQuantized( + weight_src.DataAsByteSpan().data(), + scale_src.data(), + zp_src ? zp_src->DataAsByteSpan().data() : nullptr, + weight_dst.data(), + scale_dst.data(), + zp_dst ? zp_dst->data() : nullptr, + true, + static_cast(K), + static_cast(N), + static_cast(block_size), + intra_op_thread_pool_); + } + } else { + if (weight_src.data_type() == ONNX_NAMESPACE::TensorProto_DataType_INT4) { + MlasQDQTransposeBlockwiseQuantized( + weight_src.DataAsByteSpan().data(), + scale_src.data(), + zp_src ? zp_src->DataAsByteSpan().data() : nullptr, + weight_dst.data(), + scale_dst.data(), + zp_dst ? zp_dst->data() : nullptr, + true, + static_cast(K), + static_cast(N), + static_cast(block_size), + intra_op_thread_pool_); + + } else { + MlasQDQTransposeBlockwiseQuantized( + weight_src.DataAsByteSpan().data(), + scale_src.data(), + zp_src ? zp_src->DataAsByteSpan().data() : nullptr, + weight_dst.data(), + scale_dst.data(), + zp_dst ? 
zp_dst->data() : nullptr, + true, + static_cast(K), + static_cast(N), + static_cast(block_size), + intra_op_thread_pool_); + } + } + + ONNX_NAMESPACE::TensorProto weight_T_tp; + ONNX_NAMESPACE::TensorProto scale_T_tp; + std::optional zp_T_tp; + + // TODO(fajin): external_data to memory location to avoid arena allocation + // https://github.com/microsoft/onnxruntime/pull/12465 + weight_dst.ToProto(weight_T_tp); + scale_dst.ToProto(scale_T_tp); + if (zp_dst) { + zp_T_tp.emplace(); + zp_dst->ToProto(zp_T_tp.value()); + } + + auto& input_defs = replacement_node.MutableInputDefs(); + input_defs.push_back(&graph_utils::AddInitializer(graph, weight_T_tp)); + replacement_node.MutableInputArgsCount().push_back(1); + input_defs.push_back(&graph_utils::AddInitializer(graph, scale_T_tp)); + replacement_node.MutableInputArgsCount().push_back(1); + + if (zp_T_tp) { + input_defs.push_back(&graph_utils::AddInitializer(graph, zp_T_tp.value())); + replacement_node.MutableInputArgsCount().push_back(1); + } + + return Status::OK(); +} + static std::vector GetGemmMoveInfo(bool does_q_node_exist) { NTO::NodeLocation dq_A{NTO::NodeType::kInput, 0}; NTO::NodeLocation dq_B{NTO::NodeType::kInput, 1}; diff --git a/onnxruntime/core/optimizer/qdq_transformer/selectors_actions/qdq_actions.h b/onnxruntime/core/optimizer/qdq_transformer/selectors_actions/qdq_actions.h index 8179a030508a5..47821619db65a 100644 --- a/onnxruntime/core/optimizer/qdq_transformer/selectors_actions/qdq_actions.h +++ b/onnxruntime/core/optimizer/qdq_transformer/selectors_actions/qdq_actions.h @@ -3,7 +3,12 @@ #pragma once +#include +#include +#include + #include "core/optimizer/selectors_actions/actions.h" +#include "core/platform/threadpool.h" namespace onnxruntime { @@ -76,6 +81,30 @@ struct MatMulReplaceWithQLinear : public Action { BinaryReplaceWithQLinear qlinear_matmul_replacer_; }; +// used together with DQMatMulNodeGroupSelector, which does the sanity check +struct DQMatMulToMatMulNBitsAction : public ReplaceWithNew { + DQMatMulToMatMulNBitsAction(int64_t accuracy_level, + concurrency::ThreadPool* intra_op_thread_pool); + + private: + std::string OpType(const RuntimeState&) const override { return op_type_; } + + std::string Domain(const RuntimeState&) const override { return domain_; } + + NodeAttributes ExtraAttributes(const RuntimeState&) const override; + + std::vector ValueMoves(const RuntimeState&) const override { return value_moves_; } + + // transpose initializers, and add to the MatMulNBits inputs + Status ProcessNewNode(Graph&, const NodesToOptimize&, Node&) const override; + + const int64_t accuracy_level_; + const std::string domain_; + const std::string op_type_; + const std::vector value_moves_; + concurrency::ThreadPool* intra_op_thread_pool_; +}; + struct GemmReplaceWithQuant : public Action { GemmReplaceWithQuant(); diff --git a/onnxruntime/core/optimizer/qdq_transformer/selectors_actions/qdq_selector_action_transformer.cc b/onnxruntime/core/optimizer/qdq_transformer/selectors_actions/qdq_selector_action_transformer.cc index 80ead8f8c68d6..17e66a3953b97 100644 --- a/onnxruntime/core/optimizer/qdq_transformer/selectors_actions/qdq_selector_action_transformer.cc +++ b/onnxruntime/core/optimizer/qdq_transformer/selectors_actions/qdq_selector_action_transformer.cc @@ -228,6 +228,30 @@ void MatMulQDQRules(SelectorActionRegistry& qdq_selector_action_registry, bool i #endif } +void DQMatMulToMatMulNBitsRules(SelectorActionRegistry& qdq_selector_action_registry, + int64_t qdq_matmulnbits_accuracy_level, + 
concurrency::ThreadPool* intra_op_thread_pool) { + // 2 nodes. DQ -> MatMul. DQ is the second input to MatMul. + // DQ's weight is int4/uint4. DQ's scale is float/float16. + // DQ is block-quantized along axis 0, with block_size >= 16 and as 2's power. + const std::string action_name{"DQMatMulToMatMulNBits"}; + + std::unique_ptr action = + std::make_unique(qdq_matmulnbits_accuracy_level, + intra_op_thread_pool); + +#if !defined(ORT_MINIMAL_BUILD) + std::unique_ptr selector = std::make_unique(); + qdq_selector_action_registry.RegisterSelectorAndAction(action_name, + {{"MatMul", {}}}, + std::move(selector), + std::move(action)); + +#else + qdq_selector_action_registry.RegisterAction(action_name, std::move(action)); +#endif +} + void GemmQDQRules(SelectorActionRegistry& qdq_selector_action_registry) { // 3 to 5 nodes. 0=DQ A, 1=DQ B, 2=DQ C(optional), 3=Gemm, 4=Q Y(optional) // Replace with QGemm @@ -271,7 +295,9 @@ void WhereQDQRules(SelectorActionRegistry& qdq_selector_action_registry) { #endif } -SelectorActionRegistry CreateSelectorActionRegistry(bool is_int8_allowed) { +SelectorActionRegistry CreateSelectorActionRegistry(bool is_int8_allowed, + int64_t qdq_matmulnbits_accuracy_level, + concurrency::ThreadPool* intra_op_thread_pool) { SelectorActionRegistry qdq_selector_action_registry; SplitQDQRules(qdq_selector_action_registry); DropQDQNodesRules(qdq_selector_action_registry); @@ -283,17 +309,22 @@ SelectorActionRegistry CreateSelectorActionRegistry(bool is_int8_allowed) { MatMulQDQRules(qdq_selector_action_registry, is_int8_allowed); GemmQDQRules(qdq_selector_action_registry); WhereQDQRules(qdq_selector_action_registry); + DQMatMulToMatMulNBitsRules(qdq_selector_action_registry, + qdq_matmulnbits_accuracy_level, + intra_op_thread_pool); return qdq_selector_action_registry; } } // namespace -QDQSelectorActionTransformer::QDQSelectorActionTransformer( - bool is_int8_allowed, const SatApplyContextVariant& apply_context) +QDQSelectorActionTransformer::QDQSelectorActionTransformer(bool is_int8_allowed, + const SatApplyContextVariant& apply_context, + int64_t qdq_matmulnbits_accuracy_level, + concurrency::ThreadPool* intra_op_thread_pool) : SelectorActionTransformer{ "QDQSelectorActionTransformer", - CreateSelectorActionRegistry(is_int8_allowed), + CreateSelectorActionRegistry(is_int8_allowed, qdq_matmulnbits_accuracy_level, intra_op_thread_pool), apply_context, // this transformer is only compatible with the CPU and DML EP {kCpuExecutionProvider, kDmlExecutionProvider}} { diff --git a/onnxruntime/core/optimizer/qdq_transformer/selectors_actions/qdq_selector_action_transformer.h b/onnxruntime/core/optimizer/qdq_transformer/selectors_actions/qdq_selector_action_transformer.h index 1780923f3f273..ba636f76d1900 100644 --- a/onnxruntime/core/optimizer/qdq_transformer/selectors_actions/qdq_selector_action_transformer.h +++ b/onnxruntime/core/optimizer/qdq_transformer/selectors_actions/qdq_selector_action_transformer.h @@ -5,6 +5,7 @@ #include "core/optimizer/selectors_actions/selector_action_transformer.h" #include "core/mlas/inc/mlas.h" +#include "core/platform/threadpool.h" namespace onnxruntime { @@ -21,7 +22,10 @@ Transformer that fuses QDQ and fp32 ops into quantized ops. 
*/ class QDQSelectorActionTransformer : public SelectorActionTransformer { public: - QDQSelectorActionTransformer(bool is_int8_allowed, const SatApplyContextVariant& apply_context = {}); + QDQSelectorActionTransformer(bool is_int8_allowed, + const SatApplyContextVariant& apply_context = {}, + int64_t qdq_matmulnbits_accuracy_level = 4, + concurrency::ThreadPool* intra_op_thread_pool = nullptr); }; } // namespace onnxruntime diff --git a/onnxruntime/core/optimizer/qdq_transformer/selectors_actions/qdq_selectors.cc b/onnxruntime/core/optimizer/qdq_transformer/selectors_actions/qdq_selectors.cc index 09705f61c82ce..6e93445c7c5c7 100644 --- a/onnxruntime/core/optimizer/qdq_transformer/selectors_actions/qdq_selectors.cc +++ b/onnxruntime/core/optimizer/qdq_transformer/selectors_actions/qdq_selectors.cc @@ -414,6 +414,91 @@ bool MatMulNodeGroupSelector::Check(const GraphViewer& graph_viewer, } } +bool DQMatMulNodeGroupSelector::Check(const GraphViewer& graph_viewer, + const Node& node, + const std::vector& dq_nodes, + const std::vector& q_nodes) const { + // Should not have any Q nodes + if (!q_nodes.empty()) { + return false; + } + + const auto& graph = graph_viewer.GetGraph(); + + // MatMul has only 1 DQ input and the DQ must have 1 output edge and not be a graph output + if (dq_nodes.size() != 1 || !optimizer_utils::CheckOutputEdges(graph, *dq_nodes[0], 1)) { + return false; + } + + // DQ must be MatMul's the second input + if (node.InputDefs()[1] != dq_nodes[0]->OutputDefs()[0]) { + return false; + } + + // DQ weight/zero points types are int4/uint4, scales/output types are float or float16 + const auto* weight_arg = dq_nodes[0]->InputDefs()[0]; + const auto* scale_arg = dq_nodes[0]->InputDefs()[1]; + const auto* zero_point_arg = dq_nodes[0]->InputDefs().size() == 3 ? dq_nodes[0]->InputDefs()[2] : nullptr; + int32_t dt_weight = weight_arg->TypeAsProto()->tensor_type().elem_type(); + int32_t dt_scales = scale_arg->TypeAsProto()->tensor_type().elem_type(); + if (dt_scales != ONNX_NAMESPACE::TensorProto_DataType::TensorProto_DataType_FLOAT && + dt_scales != ONNX_NAMESPACE::TensorProto_DataType::TensorProto_DataType_FLOAT16) { + return false; + } + + if (!Is4BitIntType(dt_weight)) { + return false; + } + + // DQ is blockwise quantized along axis 0, and block_size must be 2's power and >= 16 + const auto& dq_attrs = dq_nodes[0]->GetAttributes(); + if (const auto a_iter = dq_attrs.find("axis"); + a_iter == dq_attrs.end() || a_iter->second.i() != 0) { + return false; + } + + const auto a_iter = dq_attrs.find("block_size"); + if (a_iter == dq_attrs.end()) { + return false; + } + + auto block_size = a_iter->second.i(); + if (block_size < 16 || ((block_size - 1) & block_size)) { + return false; + } + + // weight, scale and zero points (if exists) must be constants + const auto* weight_tensor_proto = graph.GetConstantInitializer(weight_arg->Name(), true); + const auto* scale_tensor_proto = graph.GetConstantInitializer(scale_arg->Name(), true); + const auto* zp_tensor_proto = zero_point_arg ? 
graph.GetConstantInitializer(zero_point_arg->Name(), true) : nullptr; + + if (!weight_tensor_proto || !scale_tensor_proto) { + return false; + } + + if (zero_point_arg && !zp_tensor_proto) { + return false; + } + + // weight, scale and zero points (if exists) must have the rank 2 + if (weight_tensor_proto->dims_size() != 2 || + scale_tensor_proto->dims_size() != 2 || + (zp_tensor_proto && zp_tensor_proto->dims_size() != 2)) { + return false; + } + + // check weight, scale and zero points (if exists) shapes + if ((weight_tensor_proto->dims()[0] + block_size - 1) / block_size != scale_tensor_proto->dims()[0] || + weight_tensor_proto->dims()[1] != scale_tensor_proto->dims()[1] || + (zp_tensor_proto && + (zp_tensor_proto->dims()[0] != scale_tensor_proto->dims()[0] || + zp_tensor_proto->dims()[1] != scale_tensor_proto->dims()[1]))) { + return false; + } + + return true; +} + bool GemmNodeGroupSelector::Check(const GraphViewer& graph_viewer, const Node& node, const std::vector& dq_nodes, diff --git a/onnxruntime/core/optimizer/qdq_transformer/selectors_actions/qdq_selectors.h b/onnxruntime/core/optimizer/qdq_transformer/selectors_actions/qdq_selectors.h index 1a2a620acb480..491a15b62cb03 100644 --- a/onnxruntime/core/optimizer/qdq_transformer/selectors_actions/qdq_selectors.h +++ b/onnxruntime/core/optimizer/qdq_transformer/selectors_actions/qdq_selectors.h @@ -204,6 +204,14 @@ class MatMulNodeGroupSelector : public NodeGroupSelector { bool allow_4bit_; }; +// Convert "1 DQ node for input B -> MatMul" to "MatMulNBits" +class DQMatMulNodeGroupSelector : public NodeGroupSelector { + private: + bool Check(const GraphViewer& graph_viewer, const Node& node, + const std::vector& dq_nodes, + const std::vector& q_nodes) const override; +}; + // Input: DQ nodes for A, B and optional C // Output: optional Q node for Y class GemmNodeGroupSelector : public NodeGroupSelector { @@ -358,6 +366,13 @@ class MatMulSelector : public BaseSelector { allow_16bit, allow_4bit)) {} }; +// Convert "1 DQ node for input B -> MatMul" to "MatMulNBits" +class DQMatMulToMatMulNBitsSelector : public BaseSelector { + public: + explicit DQMatMulToMatMulNBitsSelector(gsl::span compatible_providers = {}) + : BaseSelector(std::make_unique(), compatible_providers) {} +}; + // Input: DQ nodes for A, B and optional C // Output: optional Q node for Y class GemmSelector : public BaseSelector { diff --git a/onnxruntime/core/optimizer/selectors_actions/actions.cc b/onnxruntime/core/optimizer/selectors_actions/actions.cc index c8d5acbf66b78..bb4033afedc49 100644 --- a/onnxruntime/core/optimizer/selectors_actions/actions.cc +++ b/onnxruntime/core/optimizer/selectors_actions/actions.cc @@ -102,12 +102,14 @@ static Status CreateReplacementNode(Graph& graph, Status ReplaceWithNew::Run(Graph& graph, const NodesToOptimize& selected_nodes) const { const RuntimeState runtime_state{graph, selected_nodes}; + Node* replacement{}; ORT_RETURN_IF_ERROR(CreateReplacementNode(graph, selected_nodes, OpType(runtime_state), Domain(runtime_state), ExtraAttributes(runtime_state), ValueMoves(runtime_state), - /* only_update_dest_definitions */ false, nullptr)); + /* only_update_dest_definitions */ false, &replacement)); + ORT_RETURN_IF_ERROR(ProcessNewNode(graph, selected_nodes, *replacement)); return node_remover_.Run(graph, selected_nodes); } diff --git a/onnxruntime/core/optimizer/selectors_actions/actions.h b/onnxruntime/core/optimizer/selectors_actions/actions.h index 9384bfa7027cd..465ae38565b15 100644 --- 
a/onnxruntime/core/optimizer/selectors_actions/actions.h +++ b/onnxruntime/core/optimizer/selectors_actions/actions.h @@ -158,6 +158,12 @@ struct ReplaceWithNew : public Action { // specifies how the inputs and outputs for the replaced nodes are moved to the new node virtual std::vector ValueMoves(const RuntimeState&) const = 0; + // For the changes that cannot be done by simply moving node args around, use this method to make + // additional changes to the new node and the graph. e.g., DQMatMulToMatMulNBitsAction transposes + // the second weight of MatMul ops and create new node args. + // Note: This method is only used in Run(), but not in RunForSave(). + virtual Status ProcessNewNode(Graph&, const NodesToOptimize&, Node&) const { return Status::OK(); } + RemoveNodes node_remover_; }; @@ -187,5 +193,4 @@ struct ReplaceWithNewFixed : public ReplaceWithNew { const NodeAttributes extra_attrs_; const std::vector value_moves_; }; - } // namespace onnxruntime diff --git a/onnxruntime/core/session/inference_session.cc b/onnxruntime/core/session/inference_session.cc index f0eed91d70440..3fd6e84e0e5ce 100644 --- a/onnxruntime/core/session/inference_session.cc +++ b/onnxruntime/core/session/inference_session.cc @@ -1609,7 +1609,8 @@ Status PartitionOrtFormatModel(onnxruntime::Graph& graph, #if !defined(ORT_MINIMAL_BUILD) || defined(ORT_EXTENDED_MINIMAL_BUILD) Status ApplyOrtFormatModelRuntimeOptimizations( onnxruntime::Graph& graph, const logging::Logger& logger, const SessionOptions& session_options, - const InlinedHashSet& optimizers_to_disable, const IExecutionProvider& cpu_ep) { + const InlinedHashSet& optimizers_to_disable, const IExecutionProvider& cpu_ep, + concurrency::ThreadPool* intra_op_thread_pool) { bool modified = false; for (int level = static_cast(TransformerLevel::Level2); @@ -1617,7 +1618,7 @@ Status ApplyOrtFormatModelRuntimeOptimizations( ++level) { const auto transformers = optimizer_utils::GenerateTransformersForMinimalBuild( static_cast(level), session_options, SatRuntimeOptimizationLoadContext{}, cpu_ep, - optimizers_to_disable); + optimizers_to_disable, intra_op_thread_pool); for (const auto& transformer : transformers) { ORT_RETURN_IF_ERROR(transformer->Apply(graph, modified, logger)); @@ -2005,7 +2006,8 @@ common::Status InferenceSession::Initialize() { #if !defined(ORT_MINIMAL_BUILD) || defined(ORT_EXTENDED_MINIMAL_BUILD) const auto& cpu_ep = *execution_providers_.Get(onnxruntime::kCpuExecutionProvider); ORT_RETURN_IF_ERROR_SESSIONID_( - ApplyOrtFormatModelRuntimeOptimizations(graph, *session_logger_, session_options_, optimizers_to_disable_, cpu_ep)); + ApplyOrtFormatModelRuntimeOptimizations(graph, *session_logger_, session_options_, optimizers_to_disable_, + cpu_ep, GetIntraOpThreadPoolToUse())); #endif // !defined(ORT_MINIMAL_BUILD) || defined(ORT_EXTENDED_MINIMAL_BUILD) } @@ -3167,7 +3169,8 @@ common::Status InferenceSession::AddPredefinedTransformers( if (use_full_build_optimizations) { return optimizer_utils::GenerateTransformers(level, session_options_, cpu_ep, - optimizers_to_disable_); + optimizers_to_disable_, + GetIntraOpThreadPoolToUse()); } else { const auto sat_context = minimal_build_optimization_handling == @@ -3176,7 +3179,8 @@ common::Status InferenceSession::AddPredefinedTransformers( record_runtime_optimization_produced_op_schema_fn}} : SatApplyContextVariant{SatDirectApplicationContext{}}; return optimizer_utils::GenerateTransformersForMinimalBuild(level, session_options_, sat_context, cpu_ep, - optimizers_to_disable_); + optimizers_to_disable_, + 
GetIntraOpThreadPoolToUse()); } }(); diff --git a/onnxruntime/test/common/random_generator.h b/onnxruntime/test/common/random_generator.h index 9ab4a82463d51..9bc50ce88ef16 100644 --- a/onnxruntime/test/common/random_generator.h +++ b/onnxruntime/test/common/random_generator.h @@ -12,6 +12,7 @@ #include "core/common/common.h" #include "core/common/optional.h" #include "core/common/type_utils.h" +#include "core/framework/int4.h" #include "test/util/include/test_random_seed.h" namespace onnxruntime { @@ -108,6 +109,22 @@ class RandomValueGenerator { return val; } + template + typename std::enable_if< + std::is_same_v || std::is_same_v, + std::vector>::type + Uniform(gsl::span dims, TInt4 min, TInt4 max) { + using UnpackedType = typename TInt4::UnpackedType; + std::vector data_int8 = Uniform(dims, min.GetElem(0), max.GetElem(0)); + std::vector data(TInt4::CalcNumInt4Pairs(data_int8.size())); + for (size_t i = 0; i < data_int8.size(); i++) { + size_t r = i >> 1; + size_t c = i & 0x1; + data[r].SetElem(c, data_int8[i]); + } + return data; + } + // Gaussian distribution for float template typename std::enable_if< diff --git a/onnxruntime/test/optimizer/graph_transform_test_builder.h b/onnxruntime/test/optimizer/graph_transform_test_builder.h index 6214094a26c4f..b9af675afe74d 100644 --- a/onnxruntime/test/optimizer/graph_transform_test_builder.h +++ b/onnxruntime/test/optimizer/graph_transform_test_builder.h @@ -117,22 +117,6 @@ class ModelTestBuilder { return MakeInput(shape, data); } - template - typename std::enable_if< - std::is_same_v || std::is_same_v, - NodeArg*>::type - MakeInputInt4(const std::vector& shape, typename TInt4::UnpackedType min, typename TInt4::UnpackedType max) { - using UnpackedType = typename TInt4::UnpackedType; - std::vector data_int8 = rand_gen_.Uniform(shape, min, max); - std::vector data(TInt4::CalcNumInt4Pairs(data_int8.size())); - for (size_t i = 0; i < data_int8.size(); i++) { - size_t r = i >> 1; - size_t c = i & 0x1; - data[r].SetElem(c, data_int8[i]); - } - return MakeInput(shape, data); - } - template NodeArg* MakeInput(const std::optional>& shape, std::optional input_name = std::nullopt) { diff --git a/onnxruntime/test/optimizer/qdq_matmulnbits_transformer_test.cc b/onnxruntime/test/optimizer/qdq_matmulnbits_transformer_test.cc new file mode 100644 index 0000000000000..3d117794104fa --- /dev/null +++ b/onnxruntime/test/optimizer/qdq_matmulnbits_transformer_test.cc @@ -0,0 +1,425 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. 
+ +#include + +#include "core/common/span_utils.h" +#include "core/framework/int4.h" +#include "core/graph/node_attr_utils.h" +#include "core/optimizer/qdq_transformer/selectors_actions/qdq_selectors.h" +#include "core/optimizer/qdq_transformer/selectors_actions/qdq_selector_action_transformer.h" +#include "core/optimizer/qdq_transformer/selectors_actions/shared/utils.h" +#include "core/session/onnxruntime_session_options_config_keys.h" + +#include "test/compare_ortvalue.h" +#include "test/test_environment.h" +#include "test/framework/test_utils.h" +#include "test/optimizer/qdq_test_utils.h" +#include "test/optimizer/graph_transform_test_builder.h" +#include "test/util/include/asserts.h" +#include "test/util/include/inference_session_wrapper.h" + +#include "gtest/gtest.h" + +#if defined(_MSC_VER) +#pragma warning(disable : 4127) +#endif // #if defined(_MSC_VER) + +struct QDQOpKeys { + const char* quantize_linear; + const char* dequantize_linear; +}; + +constexpr QDQOpKeys GetQDQOpKeys(bool use_contrib_qdq) { + if (use_contrib_qdq) { + return {"com.microsoft.QuantizeLinear", "com.microsoft.DequantizeLinear"}; + } + return {"QuantizeLinear", "DequantizeLinear"}; +} + +namespace onnxruntime { +namespace test { + +#if !defined(DISABLE_CONTRIB_OPS) + +// Input1 Input2 +// | | +// \ DQ +// \ / +// MatMul +// | +// output +template +typename std::enable_if || std::is_same_v, void>::type +RunDQMatMulNotConverted_NonConstDQ(const std::vector& input1_shape, + const std::vector& input2_shape, + const int64_t axis, + const int64_t block_size, + int64_t accuracy_level) { + auto build_test_case = [&](ModelTestBuilder& builder) { + auto* input1_arg = builder.MakeInput(input1_shape, -100.0f, 100.0f); + auto* input2_arg = builder.MakeInput(input2_shape, T(T::min_val, 0), T(T::max_val, 0)); + auto* output_arg = builder.MakeOutput(); + + // add DQ + auto* dq_output = builder.MakeIntermediate(); + NodeAttributes attrs; + utils::SetNodeAttribute(utils::MakeAttribute("axis", axis), attrs); + utils::SetNodeAttribute(utils::MakeAttribute("block_size", block_size), attrs); + + auto scale_shape = std::vector{input2_shape}; + scale_shape[axis] = (scale_shape[axis] + block_size - 1) / block_size; + auto* scale_arg = builder.MakeInitializer(scale_shape, 8.0f, 12.0f); + if constexpr (use_zp) { + auto* zp_arg = builder.MakeInitializer(scale_shape, T(0, 0), T(2, 0)); + builder.AddNode("DequantizeLinear", {input2_arg, scale_arg, zp_arg}, {dq_output}, "", &attrs); + } else { + builder.AddNode("DequantizeLinear", {input2_arg, scale_arg}, {dq_output}, "", &attrs); + } + + builder.AddNode("MatMul", {input1_arg, dq_output}, {output_arg}); + }; + + auto check_graph = [&](InferenceSessionWrapper& session) { + auto op_to_count = CountOpsInGraph(session.GetGraph()); + const QDQOpKeys qdq_keys = GetQDQOpKeys(false); + EXPECT_EQ(op_to_count["MatMul"], 1); + EXPECT_EQ(op_to_count[qdq_keys.quantize_linear], 0); + EXPECT_EQ(op_to_count[qdq_keys.dequantize_linear], 1); + }; + + std::function add_session_options_fn{}; + if (accuracy_level >= 0) { + add_session_options_fn = [accuracy_level](SessionOptions& sess_opts) { + std::ignore = sess_opts.config_options.AddConfigEntry(kOrtSessionOptionsQDQMatMulNBitsAccuracyLevel, + std::to_string(accuracy_level).c_str()); + }; + } + + TransformerTester(build_test_case, + check_graph, + TransformerLevel::Level1, + TransformerLevel::Level2, + 21 /*opset_version*/, + 1e-5 /*per_sample_tolerance*/, + 1e-5 /*relative_per_sample_tolerance*/, + nullptr, + add_session_options_fn); +} + 
+TEST(QDQTransformerTests, DQMatMulNotConvertedToMatMulNBits_NonConstDQ) { + // DQ contrib op schema is not updated to support blocked quantization + RunDQMatMulNotConverted_NonConstDQ({12, 37}, {37, 12}, 0, 16, 0); + RunDQMatMulNotConverted_NonConstDQ({12, 37}, {37, 12}, 0, 16, 0); + RunDQMatMulNotConverted_NonConstDQ({12, 37}, {37, 12}, 0, 16, 0); + RunDQMatMulNotConverted_NonConstDQ({12, 37}, {37, 12}, 0, 16, 0); + RunDQMatMulNotConverted_NonConstDQ({12, 37}, {37, 12}, 0, 16, 1); + RunDQMatMulNotConverted_NonConstDQ({12, 37}, {37, 12}, 0, 16, 1); + RunDQMatMulNotConverted_NonConstDQ({12, 37}, {37, 12}, 0, 16, 1); + RunDQMatMulNotConverted_NonConstDQ({12, 37}, {37, 12}, 0, 16, 1); + RunDQMatMulNotConverted_NonConstDQ({12, 37}, {37, 12}, 0, 16, 4); + RunDQMatMulNotConverted_NonConstDQ({12, 37}, {37, 12}, 0, 16, 4); + RunDQMatMulNotConverted_NonConstDQ({12, 37}, {37, 12}, 0, 16, 4); + RunDQMatMulNotConverted_NonConstDQ({12, 37}, {37, 12}, 0, 16, 4); + RunDQMatMulNotConverted_NonConstDQ({12, 37}, {37, 12}, 0, 16, -1); + RunDQMatMulNotConverted_NonConstDQ({12, 37}, {37, 12}, 0, 16, -1); + RunDQMatMulNotConverted_NonConstDQ({12, 37}, {37, 12}, 0, 16, -1); + RunDQMatMulNotConverted_NonConstDQ({12, 37}, {37, 12}, 0, 16, -1); +} + +// Input2 +// | +// DQ / +// \ / +// MatMul +// | +// output +template +typename std::enable_if || std::is_same_v, void>::type +RunDQMatMulNotConverted_FirstDQInput(const std::vector& weight_shape, + const std::vector& input2_shape, + const int64_t axis, + const int64_t block_size, + int64_t accuracy_level) { + auto build_test_case = [&](ModelTestBuilder& builder) { + auto* weight_arg = builder.MakeInitializer(weight_shape, T(T::min_val, 0), T(T::max_val, 0)); + auto* input2_arg = builder.MakeInput(input2_shape, -100.0f, 100.0f); + auto* output_arg = builder.MakeOutput(); + + // add DQ + auto* dq_output = builder.MakeIntermediate(); + NodeAttributes attrs; + utils::SetNodeAttribute(utils::MakeAttribute("axis", axis), attrs); + utils::SetNodeAttribute(utils::MakeAttribute("block_size", block_size), attrs); + + auto scale_shape = std::vector{weight_shape}; + scale_shape[axis] = (scale_shape[axis] + block_size - 1) / block_size; + auto* scale_arg = builder.MakeInitializer(scale_shape, 8.0f, 12.0f); + if constexpr (use_zp) { + auto* zp_arg = builder.MakeInitializer(scale_shape, T(0, 0), T(2, 0)); + builder.AddNode("DequantizeLinear", {weight_arg, scale_arg, zp_arg}, {dq_output}, "", &attrs); + } else { + builder.AddNode("DequantizeLinear", {weight_arg, scale_arg}, {dq_output}, "", &attrs); + } + + builder.AddNode("MatMul", {dq_output, input2_arg}, {output_arg}); + }; + + auto check_graph = [&](InferenceSessionWrapper& session) { + auto op_to_count = CountOpsInGraph(session.GetGraph()); + const QDQOpKeys qdq_keys = GetQDQOpKeys(false); + EXPECT_EQ(op_to_count["MatMul"], 1); + EXPECT_EQ(op_to_count[qdq_keys.quantize_linear], 0); + EXPECT_EQ(op_to_count[qdq_keys.dequantize_linear], 1); + }; + + std::function add_session_options_fn{}; + if (accuracy_level >= 0) { + add_session_options_fn = [accuracy_level](SessionOptions& sess_opts) { + std::ignore = sess_opts.config_options.AddConfigEntry(kOrtSessionOptionsQDQMatMulNBitsAccuracyLevel, + std::to_string(accuracy_level).c_str()); + }; + } + + TransformerTester(build_test_case, + check_graph, + TransformerLevel::Level1, + TransformerLevel::Level2, + 21 /*opset_version*/, + 1e-5 /*per_sample_tolerance*/, + 1e-5 /*relative_per_sample_tolerance*/, + nullptr, + add_session_options_fn); +} + +TEST(QDQTransformerTests, 
DQMatMulNotConvertedToMatMulNBits_FirstDQInput) { + // DQ contrib op schema is not updated to support blocked quantization + RunDQMatMulNotConverted_FirstDQInput({12, 37}, {37, 12}, 0, 16, 0); + RunDQMatMulNotConverted_FirstDQInput({12, 37}, {37, 12}, 0, 16, 0); + RunDQMatMulNotConverted_FirstDQInput({12, 37}, {37, 12}, 0, 16, 0); + RunDQMatMulNotConverted_FirstDQInput({12, 37}, {37, 12}, 0, 16, 0); + RunDQMatMulNotConverted_FirstDQInput({12, 37}, {37, 12}, 0, 16, 1); + RunDQMatMulNotConverted_FirstDQInput({12, 37}, {37, 12}, 0, 16, 1); + RunDQMatMulNotConverted_FirstDQInput({12, 37}, {37, 12}, 0, 16, 1); + RunDQMatMulNotConverted_FirstDQInput({12, 37}, {37, 12}, 0, 16, 1); + RunDQMatMulNotConverted_FirstDQInput({12, 37}, {37, 12}, 0, 16, 4); + RunDQMatMulNotConverted_FirstDQInput({12, 37}, {37, 12}, 0, 16, 4); + RunDQMatMulNotConverted_FirstDQInput({12, 37}, {37, 12}, 0, 16, 4); + RunDQMatMulNotConverted_FirstDQInput({12, 37}, {37, 12}, 0, 16, 4); + RunDQMatMulNotConverted_FirstDQInput({12, 37}, {37, 12}, 0, 16, -1); + RunDQMatMulNotConverted_FirstDQInput({12, 37}, {37, 12}, 0, 16, -1); + RunDQMatMulNotConverted_FirstDQInput({12, 37}, {37, 12}, 0, 16, -1); + RunDQMatMulNotConverted_FirstDQInput({12, 37}, {37, 12}, 0, 16, -1); +} + +// Input1 +// | +// \ DQ +// \ / +// MatMul +// | +// output +template +void RunDQMatMulNotConverted_TypeShapeMismatch(const std::vector& input1_shape, + const std::vector& weight_shape, + const int64_t axis, + const int64_t block_size, + int64_t accuracy_level) { + auto build_test_case = [&](ModelTestBuilder& builder) { + auto* input_arg = builder.MakeInput(input1_shape, -100.0f, 100.0f); + auto* output_arg = builder.MakeOutput(); + NodeArg* weight_arg = nullptr; + + // add DQ + if constexpr (std::is_same_v || std::is_same_v) { + weight_arg = builder.MakeInitializer(weight_shape, T(T::min_val, 0), T(T::max_val, 0)); + } else { + weight_arg = builder.MakeInitializer(weight_shape, + std::numeric_limits::min(), + std::numeric_limits::max()); + } + + auto* dq_output = builder.MakeIntermediate(); + NodeAttributes attrs; + utils::SetNodeAttribute(utils::MakeAttribute("axis", axis), attrs); + utils::SetNodeAttribute(utils::MakeAttribute("block_size", block_size), attrs); + + auto scale_shape = std::vector{weight_shape}; + scale_shape[axis] = (scale_shape[axis] + block_size - 1) / block_size; + auto* scale_arg = builder.MakeInitializer(scale_shape, 8.0f, 12.0f); + if constexpr (use_zp) { + NodeArg* zp_arg; + if constexpr (std::is_same_v || std::is_same_v) { + zp_arg = builder.MakeInitializer(scale_shape, T(0, 0), T(2, 0)); + } else { + zp_arg = builder.MakeInitializer(scale_shape, 0, 2); + } + + builder.AddNode("DequantizeLinear", {weight_arg, scale_arg, zp_arg}, {dq_output}, "", &attrs); + } else { + builder.AddNode("DequantizeLinear", {weight_arg, scale_arg}, {dq_output}, "", &attrs); + } + + builder.AddNode("MatMul", {input_arg, dq_output}, {output_arg}); + }; + + auto check_graph = [&](InferenceSessionWrapper& session) { + auto op_to_count = CountOpsInGraph(session.GetGraph()); + const QDQOpKeys qdq_keys = GetQDQOpKeys(false); + EXPECT_EQ(op_to_count["MatMul"], 1); + EXPECT_EQ(op_to_count[qdq_keys.quantize_linear], 0); + EXPECT_EQ(op_to_count[qdq_keys.dequantize_linear], 1); + }; + + std::function add_session_options_fn{}; + if (accuracy_level >= 0) { + add_session_options_fn = [accuracy_level](SessionOptions& sess_opts) { + std::ignore = sess_opts.config_options.AddConfigEntry(kOrtSessionOptionsQDQMatMulNBitsAccuracyLevel, + 
std::to_string(accuracy_level).c_str()); + }; + } + + TransformerTester(build_test_case, + check_graph, + TransformerLevel::Level1, + TransformerLevel::Level2, + 21 /*opset_version*/, + 1e-5 /*per_sample_tolerance*/, + 1e-5 /*relative_per_sample_tolerance*/, + nullptr, + add_session_options_fn); +} + +TEST(QDQTransformerTests, DQMatMulNotConvertedToMatMulNBits_TypeMismatch) { + // DQ contrib op schema is not updated to support blocked quantization + RunDQMatMulNotConverted_TypeShapeMismatch({12, 37}, {37, 12}, 0, 16, 0); + RunDQMatMulNotConverted_TypeShapeMismatch({12, 37}, {37, 12}, 0, 16, 0); + RunDQMatMulNotConverted_TypeShapeMismatch({12, 37}, {37, 12}, 0, 16, 0); + RunDQMatMulNotConverted_TypeShapeMismatch({12, 37}, {37, 12}, 0, 16, 0); + RunDQMatMulNotConverted_TypeShapeMismatch({12, 37}, {37, 12}, 0, 16, 0); + RunDQMatMulNotConverted_TypeShapeMismatch({12, 37}, {37, 12}, 0, 16, 0); + RunDQMatMulNotConverted_TypeShapeMismatch({12, 37}, {37, 12}, 0, 16, 0); + RunDQMatMulNotConverted_TypeShapeMismatch({12, 37}, {37, 12}, 0, 16, 0); + RunDQMatMulNotConverted_TypeShapeMismatch({12, 37}, {37, 12}, 0, 16, 0); +} + +TEST(QDQTransformerTests, DQMatMulNotConvertedToMatMulNBits_ShapeMismatch) { + // DQ contrib op schema is not updated to support blocked quantization + // block size too small + RunDQMatMulNotConverted_TypeShapeMismatch({12, 37}, {37, 12}, 0, 8, 0); + RunDQMatMulNotConverted_TypeShapeMismatch({12, 37}, {37, 12}, 0, 8, 0); + RunDQMatMulNotConverted_TypeShapeMismatch({12, 37}, {37, 12}, 0, 8, 0); + RunDQMatMulNotConverted_TypeShapeMismatch({12, 37}, {37, 12}, 0, 8, 0); + // block size not 2's power + RunDQMatMulNotConverted_TypeShapeMismatch({12, 37}, {37, 12}, 0, 17, 0); + RunDQMatMulNotConverted_TypeShapeMismatch({12, 37}, {37, 12}, 0, 17, 0); + RunDQMatMulNotConverted_TypeShapeMismatch({12, 37}, {37, 12}, 0, 17, 0); + RunDQMatMulNotConverted_TypeShapeMismatch({12, 37}, {37, 12}, 0, 17, 0); + // not axis 0 + RunDQMatMulNotConverted_TypeShapeMismatch({12, 37}, {37, 37}, 1, 16, 0); + RunDQMatMulNotConverted_TypeShapeMismatch({12, 37}, {37, 37}, 1, 16, 0); + RunDQMatMulNotConverted_TypeShapeMismatch({12, 37}, {37, 37}, 1, 16, 0); + RunDQMatMulNotConverted_TypeShapeMismatch({12, 37}, {37, 37}, 1, 16, 0); + // not rank 2 + RunDQMatMulNotConverted_TypeShapeMismatch({2, 12, 37}, {2, 37, 12}, 0, 16, 0); + RunDQMatMulNotConverted_TypeShapeMismatch({2, 12, 37}, {2, 37, 12}, 0, 16, 0); + RunDQMatMulNotConverted_TypeShapeMismatch({2, 12, 37}, {2, 37, 12}, 0, 16, 0); + RunDQMatMulNotConverted_TypeShapeMismatch({2, 12, 37}, {2, 37, 12}, 0, 16, 0); +} + +// Input1 +// | DQ +// \ / +// MatMul +// | DQ +// \ / +// MatMul +// | +// output +template +typename std::enable_if || std::is_same_v, void>::type +RunDQMatMulConverted(const std::vector& input1_shape, + const std::vector& weight1_shape, + const std::vector& weight2_shape, + const int64_t axis, + const int64_t block_size, + int64_t accuracy_level) { + auto build_test_case = [&](ModelTestBuilder& builder) { + auto* input_arg = builder.MakeInput(input1_shape, -100.0f, 100.0f); + auto* output_arg = builder.MakeOutput(); + + // add DQ + NodeAttributes attrs; + utils::SetNodeAttribute(utils::MakeAttribute("axis", axis), attrs); + utils::SetNodeAttribute(utils::MakeAttribute("block_size", block_size), attrs); + auto scale1_shape = std::vector{weight1_shape}; + auto scale2_shape = std::vector{weight2_shape}; + scale1_shape[axis] = (scale1_shape[axis] + block_size - 1) / block_size; + scale2_shape[axis] = (scale2_shape[axis] + block_size - 1) / 
block_size; + + auto* weight1_arg = builder.MakeInitializer(weight1_shape, T(T::min_val, 0), T(T::max_val, 0)); + auto* weight2_arg = builder.MakeInitializer(weight2_shape, T(T::min_val, 0), T(T::max_val, 0)); + auto* dq1_output = builder.MakeIntermediate(); + auto* dq2_output = builder.MakeIntermediate(); + auto* matmul1_output = builder.MakeIntermediate(); + + auto* scales1_arg = builder.MakeInitializer(scale1_shape, 8.0f, 12.0f); + auto* scales2_arg = builder.MakeInitializer(scale2_shape, 8.0f, 12.0f); + if constexpr (use_zp) { + auto* zp1_arg = builder.MakeInitializer(scale1_shape, T(0, 0), T(2, 0)); + auto* zp2_arg = builder.MakeInitializer(scale2_shape, T(0, 0), T(2, 0)); + builder.AddNode("DequantizeLinear", {weight1_arg, scales1_arg, zp1_arg}, {dq1_output}, "", &attrs); + builder.AddNode("DequantizeLinear", {weight2_arg, scales2_arg, zp2_arg}, {dq2_output}, "", &attrs); + } else { + builder.AddNode("DequantizeLinear", {weight1_arg, scales1_arg}, {dq1_output}, "", &attrs); + builder.AddNode("DequantizeLinear", {weight2_arg, scales2_arg}, {dq2_output}, "", &attrs); + } + + builder.AddNode("MatMul", {input_arg, dq1_output}, {matmul1_output}); + builder.AddNode("MatMul", {matmul1_output, dq2_output}, {output_arg}); + }; + + auto check_graph = [&](InferenceSessionWrapper& session) { + auto op_to_count = CountOpsInGraph(session.GetGraph()); + const QDQOpKeys qdq_keys = GetQDQOpKeys(false); + EXPECT_EQ(op_to_count["MatMul"], 0); + EXPECT_EQ(op_to_count["com.microsoft.MatMulNBits"], 2); + EXPECT_EQ(op_to_count[qdq_keys.quantize_linear], 0); + EXPECT_EQ(op_to_count[qdq_keys.dequantize_linear], 0); + }; + + std::function add_session_options_fn{}; + if (accuracy_level >= 0) { + add_session_options_fn = [accuracy_level](SessionOptions& sess_opts) { + std::ignore = sess_opts.config_options.AddConfigEntry(kOrtSessionOptionsQDQMatMulNBitsAccuracyLevel, + std::to_string(accuracy_level).c_str()); + }; + } + + TransformerTester(build_test_case, + check_graph, + TransformerLevel::Level1, + TransformerLevel::Level2, + 21 /*opset_version*/, + 1e-5 /*per_sample_tolerance*/, + 1e-5 /*relative_per_sample_tolerance*/, + nullptr, + add_session_options_fn); +} + +TEST(QDQTransformerTests, DQMatMulConvertedToMatMulNBits) { + // DQ contrib op schema is not updated to support blocked quantization + RunDQMatMulConverted({12, 12}, {12, 37}, {37, 12}, 0, 16, 0); + RunDQMatMulConverted({12, 12}, {12, 37}, {37, 12}, 0, 16, 0); + RunDQMatMulConverted({12, 12}, {12, 37}, {37, 12}, 0, 16, 0); + RunDQMatMulConverted({12, 12}, {12, 37}, {37, 12}, 0, 16, 0); + RunDQMatMulConverted({12, 12}, {12, 37}, {37, 12}, 0, 16, 1); + RunDQMatMulConverted({12, 12}, {12, 37}, {37, 12}, 0, 16, 1); + RunDQMatMulConverted({12, 12}, {12, 37}, {37, 12}, 0, 16, 1); + RunDQMatMulConverted({12, 12}, {12, 37}, {37, 12}, 0, 16, 1); +} + +#endif // !defined(DISABLE_CONTRIB_OPS) + +} // namespace test +} // namespace onnxruntime diff --git a/onnxruntime/test/optimizer/qdq_test_utils.h b/onnxruntime/test/optimizer/qdq_test_utils.h index 862408f31f004..52ac2a2541a79 100644 --- a/onnxruntime/test/optimizer/qdq_test_utils.h +++ b/onnxruntime/test/optimizer/qdq_test_utils.h @@ -517,7 +517,7 @@ GetQDQTestCaseFn BuildQDQSplitTestCase(const std::vector& input_shape, NodeArg* input_arg = nullptr; if constexpr (std::is_same_v || std::is_same_v) { - input_arg = builder.MakeInputInt4(input_shape, InputType::min_val, InputType::max_val); + input_arg = builder.MakeInput(input_shape, InputType(InputType::min_val, 0), InputType(InputType::max_val, 0)); dq_zp = 
InputType(static_cast(InputType::max_val / 2)); q_zp = OutputType(static_cast(OutputType::max_val / 2)); } else { From a6c5e2cd20dd890f416806e0afbb3b5968030f4d Mon Sep 17 00:00:00 2001 From: Tianlei Wu Date: Mon, 22 Jul 2024 10:41:08 -0700 Subject: [PATCH 28/35] [CUDA] FusedMHARunnerFP16v2 thread-safe (#21420) ### Description - [x] Rewrite FusedMHARunnerFP16v2 to make it thread-safe. - [x] Add multi-threading tests Previously, the kernel parameters params is stored as a member of mha runner, which means that different threads might change the params at the same time and impacts the other threads. For example, if batch_size and seq_len was changed by another thread to larger values in setup(...), buffer overrun might happen in run(...) because a kernel could read/write memory out of range of allocated buffers. In new implementation, I change the api and remove mutable member variables to make it thread safe. Below is summary of change: Before: ``` class FusedMHARunnerFP16v2::mhaImpl { void setup(int seq_len, int batch_size) { // change scalar params } void run(input, output) { // change params for input and output pointers // launch kernel using params } Fused_multihead_attention_params_v2 params; // mutable, not thread-safe } ``` After: ``` class FusedMHARunnerFP16v2::FmhaImpl { void setup(int seq_len, int batch_size, Fused_multihead_attention_params_v2& params) { // change params } void run(params, input, output) { // change params with input and output pointers // launch kernel using params } } ``` ### Motivation and Context https://github.com/microsoft/onnxruntime/issues/18854 https://github.com/microsoft/onnxruntime/issues/21413 --- .../contrib_ops/cuda/bert/attention.cc | 12 +- .../contrib_ops/cuda/bert/attention_impl.cu | 10 +- .../cuda/bert/multihead_attention.cc | 8 +- .../contrib_ops/cuda/bert/packed_attention.cc | 14 +- .../cuda/bert/packed_attention_impl.cu | 7 +- .../bert/packed_multihead_attention_impl.cu | 6 +- .../mha_runner.cu | 230 ++++++------- .../mha_runner.h | 119 +++---- .../test/python/transformers/benchmark_mha.py | 49 ++- .../test/python/transformers/test_mha.py | 313 +++++++++++++++++- 10 files changed, 534 insertions(+), 234 deletions(-) diff --git a/onnxruntime/contrib_ops/cuda/bert/attention.cc b/onnxruntime/contrib_ops/cuda/bert/attention.cc index cacd65313ebcc..3b7f980ba1881 100644 --- a/onnxruntime/contrib_ops/cuda/bert/attention.cc +++ b/onnxruntime/contrib_ops/cuda/bert/attention.cc @@ -149,8 +149,8 @@ Status Attention::ComputeInternal(OpKernelContext* context) const { nullptr == relative_position_bias && parameters.past_sequence_length == 0 && parameters.hidden_size == parameters.v_hidden_size && - FusedMHARunnerFP16v2::is_supported(sm, parameters.head_size, sequence_length, - enable_trt_flash_attention_, true); + FusedMHARunnerFP16v2::IsSupported(sm, parameters.head_size, sequence_length, + enable_trt_flash_attention_, true); if (use_causal_fused_runner) { // Here we assume that num_heads, head_size and is_unidirectional does not change for an Attention node. 
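A minimal sketch of the thread-safe calling pattern described in the commit message above, using simplified, hypothetical names (Params, Runner::Setup, Runner::Run) that stand in for the real Fused_multihead_attention_params_v2 and FusedMHARunnerFP16v2 types. The point it illustrates: per-call kernel parameters live on each caller's stack and are passed by reference, so the runner object keeps no mutable per-call state that concurrent threads could overwrite.

```
#include <cstddef>

// Hypothetical, simplified stand-in for the per-call kernel parameter struct.
struct Params {
  int batch_size = 0;
  int sequence_length = 0;
  const void* input = nullptr;
  void* output = nullptr;
};

class Runner {
 public:
  // const methods: the runner holds no per-call state, so two threads with
  // different batch sizes or sequence lengths cannot interfere with each other.
  void Setup(Params& p, int batch_size, int sequence_length) const {
    p.batch_size = batch_size;
    p.sequence_length = sequence_length;
  }
  void Run(Params& p, const void* input, void* output) const {
    p.input = input;
    p.output = output;
    // launch the kernel with p here; each thread owns its own Params instance
  }
};

// Usage (per thread):
//   Params params;
//   runner.Setup(params, batch_size, seq_len);
//   runner.Run(params, input, output);
```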
if (nullptr == fused_fp16_runner_.get()) { @@ -171,8 +171,8 @@ Status Attention::ComputeInternal(OpKernelContext* context) const { nullptr == present && nullptr == relative_position_bias && parameters.hidden_size == parameters.v_hidden_size && - FusedMHARunnerFP16v2::is_supported(sm, parameters.head_size, sequence_length, - enable_trt_flash_attention_, false); + FusedMHARunnerFP16v2::IsSupported(sm, parameters.head_size, sequence_length, + enable_trt_flash_attention_, false); if (use_fused_runner) { // Here we assume that num_heads, head_size and is_unidirectional does not change for an Attention node. @@ -184,8 +184,8 @@ Status Attention::ComputeInternal(OpKernelContext* context) const { } // In case some kernel not loaded due to shared memory limit, we need to double check here. - const int S = fused_fp16_runner_->getSFromMaxSeqLen(sequence_length); - if (fused_fp16_runner_->isValid(S)) { + const int normalized_seq_len = fused_fp16_runner_->NormalizeSequenceLength(sequence_length); + if (fused_fp16_runner_->IsValid(normalized_seq_len)) { fused_runner = fused_fp16_runner_.get(); } } diff --git a/onnxruntime/contrib_ops/cuda/bert/attention_impl.cu b/onnxruntime/contrib_ops/cuda/bert/attention_impl.cu index 150079cdf157a..997493acd9cb7 100644 --- a/onnxruntime/contrib_ops/cuda/bert/attention_impl.cu +++ b/onnxruntime/contrib_ops/cuda/bert/attention_impl.cu @@ -245,12 +245,10 @@ Status FusedTrtSelfAttention( FusedMHARunnerFP16v2* fused_fp16_runner = reinterpret_cast(data.fused_runner); - const int S = causal ? sequence_length : fused_fp16_runner->getSFromMaxSeqLen(sequence_length); + const int s = causal ? sequence_length : fused_fp16_runner->NormalizeSequenceLength(sequence_length); // B = 2 * batch_size when there is padding in input, and B = batch_size when padding is removed. - const int B = (nullptr == data.mask_index ? batch_size : 2 * batch_size); - - fused_fp16_runner->setup(S, B); + const int b = (nullptr == data.mask_index ? 
batch_size : 2 * batch_size); if (!causal) { assert(data.qkv_format == AttentionQkvFormat::QKV_BSN3H); @@ -261,12 +259,12 @@ Status FusedTrtSelfAttention( packed_qkv = data.query; } - fused_fp16_runner->run(packed_qkv, sequence_offset, data.output, stream); + fused_fp16_runner->Run(b, s, packed_qkv, sequence_offset, data.output, stream); DUMP_TENSOR("fused output", data.output, batch_size, sequence_length, parameters.num_heads, parameters.v_head_size); } else { assert(data.qkv_format == AttentionQkvFormat::Q_K_V_BNSH_QKV_BS3NH); - fused_fp16_runner->run(data.gemm_buffer, sequence_offset, data.output, stream); + fused_fp16_runner->Run(b, s, data.gemm_buffer, sequence_offset, data.output, stream); DUMP_TENSOR("fused causal output", data.output, batch_size, sequence_length, parameters.num_heads, parameters.v_head_size); } diff --git a/onnxruntime/contrib_ops/cuda/bert/multihead_attention.cc b/onnxruntime/contrib_ops/cuda/bert/multihead_attention.cc index b96140f3897f9..663bd020ddac7 100644 --- a/onnxruntime/contrib_ops/cuda/bert/multihead_attention.cc +++ b/onnxruntime/contrib_ops/cuda/bert/multihead_attention.cc @@ -193,8 +193,8 @@ Status MultiHeadAttention::ComputeInternal(OpKernelContext* context) const { (nullptr == key_padding_mask || is_mask_1d_seq_len) && parameters.hidden_size == parameters.v_hidden_size && parameters.sequence_length == parameters.kv_sequence_length && - FusedMHARunnerFP16v2::is_supported(sm, parameters.head_size, sequence_length, - enable_trt_flash_attention_, false); + FusedMHARunnerFP16v2::IsSupported(sm, parameters.head_size, sequence_length, + enable_trt_flash_attention_, false); if (use_fused_runner) { // Here we assume that num_heads and head_size does not change for a MultiHeadAttention node. if (nullptr == fused_fp16_runner_.get()) { @@ -206,8 +206,8 @@ Status MultiHeadAttention::ComputeInternal(OpKernelContext* context) const { } // In case some kernel not loaded due to shared memory limit, we need to double check here. - const int S = fused_fp16_runner_->getSFromMaxSeqLen(sequence_length); - if (fused_fp16_runner_->isValid(S)) { + const int normalized_seq_len = fused_fp16_runner_->NormalizeSequenceLength(sequence_length); + if (fused_fp16_runner_->IsValid(normalized_seq_len)) { fused_runner = fused_fp16_runner_.get(); } } diff --git a/onnxruntime/contrib_ops/cuda/bert/packed_attention.cc b/onnxruntime/contrib_ops/cuda/bert/packed_attention.cc index a1149ddbf99f5..d1c6993d48e62 100644 --- a/onnxruntime/contrib_ops/cuda/bert/packed_attention.cc +++ b/onnxruntime/contrib_ops/cuda/bert/packed_attention.cc @@ -55,11 +55,11 @@ MHARunner* TrtFusedAttention::GetFusedRunner(const cudaDeviceProp& device_pro // Check whether we can use fused kernel int sm = device_prop.major * 10 + device_prop.minor; - bool is_fMHA_supported = FusedMHARunnerFP16v2::is_supported(sm, - parameters.head_size, - parameters.sequence_length, - enable_trt_flash_attention_, - false /*causal*/); + bool is_fMHA_supported = FusedMHARunnerFP16v2::IsSupported(sm, + parameters.head_size, + parameters.sequence_length, + enable_trt_flash_attention_, + false /*causal*/); if (!is_fMHA_supported) { return fused_runner; @@ -72,8 +72,8 @@ MHARunner* TrtFusedAttention::GetFusedRunner(const cudaDeviceProp& device_pro } // In case some kernel not loaded due to shared memory limit, we need to double check here. 
- const int S = fused_fp16_runner_->getSFromMaxSeqLen(parameters.sequence_length); - if (fused_fp16_runner_->isValid(S)) { + const int normalized_seq_len = fused_fp16_runner_->NormalizeSequenceLength(parameters.sequence_length); + if (fused_fp16_runner_->IsValid(normalized_seq_len)) { fused_runner = fused_fp16_runner_.get(); } diff --git a/onnxruntime/contrib_ops/cuda/bert/packed_attention_impl.cu b/onnxruntime/contrib_ops/cuda/bert/packed_attention_impl.cu index db9f30c25c013..ac2cb5165a94c 100644 --- a/onnxruntime/contrib_ops/cuda/bert/packed_attention_impl.cu +++ b/onnxruntime/contrib_ops/cuda/bert/packed_attention_impl.cu @@ -459,10 +459,9 @@ Status FusedScaledDotProductAttention( parameters.token_count, stream); FusedMHARunnerFP16v2* fused_fp16_runner = reinterpret_cast(fused_runner); - const int S = fused_fp16_runner->getSFromMaxSeqLen(sequence_length); - fused_fp16_runner->setup(S, batch_size); - - fused_fp16_runner->run(data.workspace, data.cumulative_sequence_length, data.output, stream); + const int normalized_seq_len = fused_fp16_runner->NormalizeSequenceLength(sequence_length); + fused_fp16_runner->Run(batch_size, normalized_seq_len, + data.workspace, data.cumulative_sequence_length, data.output, stream); return Status::OK(); } diff --git a/onnxruntime/contrib_ops/cuda/bert/packed_multihead_attention_impl.cu b/onnxruntime/contrib_ops/cuda/bert/packed_multihead_attention_impl.cu index 3e168189be3d5..b4ca0194b08bc 100644 --- a/onnxruntime/contrib_ops/cuda/bert/packed_multihead_attention_impl.cu +++ b/onnxruntime/contrib_ops/cuda/bert/packed_multihead_attention_impl.cu @@ -575,10 +575,8 @@ Status FusedAttentionTrt( } FusedMHARunnerFP16v2* fused_fp16_runner = reinterpret_cast(fused_runner); - const int S = fused_fp16_runner->getSFromMaxSeqLen(sequence_length); - fused_fp16_runner->setup(S, batch_size); - - fused_fp16_runner->run(qkv, data.cumulative_sequence_length, data.output, stream); + const int normalized_seq_len = fused_fp16_runner->NormalizeSequenceLength(sequence_length); + fused_fp16_runner->Run(batch_size, normalized_seq_len, qkv, data.cumulative_sequence_length, data.output, stream); return Status::OK(); } diff --git a/onnxruntime/contrib_ops/cuda/bert/tensorrt_fused_multihead_attention/mha_runner.cu b/onnxruntime/contrib_ops/cuda/bert/tensorrt_fused_multihead_attention/mha_runner.cu index 4a4e3eeecf642..8af28e874729a 100644 --- a/onnxruntime/contrib_ops/cuda/bert/tensorrt_fused_multihead_attention/mha_runner.cu +++ b/onnxruntime/contrib_ops/cuda/bert/tensorrt_fused_multihead_attention/mha_runner.cu @@ -14,6 +14,10 @@ * limitations under the License. */ +// Modifications: Update interface and implmentation to be thread-safe +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. 
+ #include "contrib_ops/cuda/bert/tensorrt_fused_multihead_attention/mha_runner.h" #include "contrib_ops/cuda/bert/tensorrt_fused_multihead_attention/fused_multihead_attention_v2.h" #include "contrib_ops/cuda/bert/tensorrt_fused_multihead_attention/flash_attention/fmha_flash_attention.h" @@ -34,28 +38,28 @@ void set_alpha_fp16(uint32_t& alpha, float norm) { alpha = temp.u32; } -class FusedMHARunnerFP16v2::mhaImpl { +class FusedMHARunnerFP16v2::FmhaImpl { public: - mhaImpl(FusedMHARunnerFP16v2* interface) - : interface(interface), - sm(interface->mSm), - xmmaKernel(getXMMAKernelsV2(DATA_TYPE_FP16, sm)) { + FmhaImpl(FusedMHARunnerFP16v2* interface, int sm) + : interface_(interface), + sm_(sm), + xmma_kernel_(getXMMAKernelsV2(DATA_TYPE_FP16, sm)) { ORT_ENFORCE((sm == kSM_70 || sm == kSM_75 || sm == kSM_80 || sm == kSM_86 || sm == kSM_89), "Unsupported architecture"); - flash_attention_kernel = nullptr; - if (interface->mEnableFlashAttention) { - flash_attention_kernel = get_flash_attention_kernels(DATA_TYPE_FP16, sm); + flash_kernel_ = nullptr; + if (interface_->enable_flash_attention_) { + flash_kernel_ = get_flash_attention_kernels(DATA_TYPE_FP16, sm); } - - params.clear(); } - ~mhaImpl() {} + ~FmhaImpl() {} - void setup(const int seq_len, const int B) { - // For bert and vit, use flash attention when sequence length is larger than the threshold. - use_flash_attention = is_flash_attention(seq_len); + void Setup(Fused_multihead_attention_params_v2& params, + int sequence_length, // normalized sequence length + int batch_size, + bool& use_flash_attention) const { + use_flash_attention = UseFlashAttention(sequence_length); params.force_unroll = use_flash_attention; @@ -67,27 +71,27 @@ class FusedMHARunnerFP16v2::mhaImpl { warps_m = 4; warps_n = 1; } else { - if (sm == 70) { - if (seq_len == 64 || seq_len == 96) { + if (sm_ == 70) { + if (sequence_length == 64 || sequence_length == 96) { warps_m = 2; warps_n = 2; - } else if (seq_len == 128) { + } else if (sequence_length == 128) { warps_m = 1; warps_n = 4; - } else if (seq_len == 256 || seq_len == 384) { + } else if (sequence_length == 256 || sequence_length == 384) { warps_m = 1; warps_n = 8; } else { ORT_ENFORCE(false, "Unsupported sequence length"); } } else { - if (seq_len == 32 || seq_len == 64 || seq_len == 96 || seq_len == 128) { + if (sequence_length == 32 || sequence_length == 64 || sequence_length == 96 || sequence_length == 128) { warps_m = 2; warps_n = 2; - } else if (seq_len == 192 || seq_len == 256) { + } else if (sequence_length == 192 || sequence_length == 256) { warps_m = 1; warps_n = 4; - } else if (seq_len == 384) { + } else if (sequence_length == 384) { warps_m = 1; warps_n = 8; } else { @@ -97,11 +101,11 @@ class FusedMHARunnerFP16v2::mhaImpl { } // The number of threads per CTA. - threads_per_cta = warps_m * warps_n * warps_k * 32; + size_t threads_per_cta = warps_m * warps_n * warps_k * 32; // The number of xmmas in the M dimension. We use one uint32_t per XMMA in the M dimension. 
- xmmas_m = (seq_len + 16 * warps_m - 1) / (16 * warps_m); + size_t xmmas_m = (sequence_length + 16 * warps_m - 1) / (16 * warps_m); - const float scale_bmm1 = interface->mScale; + const float scale_bmm1 = interface_->scale_; const float scale_softmax = 1.f; // Seems to be only required for int8 const float scale_bmm2 = 1.f; @@ -109,20 +113,21 @@ class FusedMHARunnerFP16v2::mhaImpl { set_alpha_fp16(params.scale_softmax, scale_softmax); set_alpha_fp16(params.scale_bmm2, scale_bmm2); - params.b = B; - params.h = interface->mNumHeads; - params.s = seq_len; - params.d = interface->mHeadSize; + params.b = batch_size; + params.h = interface_->num_heads_; + params.s = sequence_length; + params.d = interface_->head_size_; - params.qkv_stride_in_bytes = 3 * interface->mNumHeads * interface->mHeadSize * sizeof(half); + params.qkv_stride_in_bytes = 3 * interface_->num_heads_ * interface_->head_size_ * sizeof(half); params.packed_mask_stride_in_bytes = xmmas_m * threads_per_cta * sizeof(uint32_t); - params.o_stride_in_bytes = interface->mNumHeads * interface->mHeadSize * sizeof(half); - - has_causal_mask = false; + params.o_stride_in_bytes = interface_->num_heads_ * interface_->head_size_ * sizeof(half); } - void setup_causal_masked_fmha(const int seq_len, const int B) { - const float scale_bmm1 = interface->mScale; + void SetupCausal(Fused_multihead_attention_params_v2& params, + int sequence_length, // normalized sequence length + int batch_size, + bool& use_flash_attention) const { + const float scale_bmm1 = interface_->scale_; const float scale_softmax = 1.f; // Seems to be only required for int8 const float scale_bmm2 = 1.f; @@ -130,16 +135,17 @@ class FusedMHARunnerFP16v2::mhaImpl { set_alpha_fp16(params.scale_softmax, scale_softmax); set_alpha_fp16(params.scale_bmm2, scale_bmm2); - params.b = B; - params.h = interface->mNumHeads; - params.s = seq_len; - params.d = interface->mHeadSize; + params.b = batch_size; + params.h = interface_->num_heads_; + params.s = sequence_length; + params.d = interface_->head_size_; + + params.qkv_stride_in_bytes = 3 * interface_->num_heads_ * interface_->head_size_ * sizeof(half); + params.o_stride_in_bytes = interface_->num_heads_ * interface_->head_size_ * sizeof(half); - params.qkv_stride_in_bytes = 3 * interface->mNumHeads * interface->mHeadSize * sizeof(half); - params.o_stride_in_bytes = interface->mNumHeads * interface->mHeadSize * sizeof(half); + use_flash_attention = interface_->enable_flash_attention_; - // fallback to original fmha_v2 when head_size <= 64 and seq_len <- 128 - use_flash_attention = interface->mEnableFlashAttention; + // fallback to original fmha_v2 when head_size <= 64 and sequence_length <= 128 if (params.d <= 64 && params.s <= 128) { use_flash_attention = false; // get max sequence length @@ -152,97 +158,87 @@ class FusedMHARunnerFP16v2::mhaImpl { // set flags params.force_unroll = use_flash_attention; - has_causal_mask = true; } - void run(const void* input, const void* cu_seqlens, void* output, cudaStream_t stream) { + void Run(Fused_multihead_attention_params_v2& params, + const void* input, + const void* cu_seqlens, + void* output, + cudaStream_t stream, + bool use_flash_attention, + bool has_causal_mask) const { params.qkv_ptr = const_cast(input); params.o_ptr = output; params.cu_seqlens = static_cast(const_cast(cu_seqlens)); - if (use_flash_attention && flash_attention_kernel != nullptr && !has_causal_mask) { - flash_attention_kernel->run(params, stream); + if (use_flash_attention && flash_kernel_ != nullptr && !has_causal_mask) 
{ + flash_kernel_->run(params, stream); } else { - xmmaKernel->run(params, stream, use_flash_attention, has_causal_mask); + xmma_kernel_->run(params, stream, use_flash_attention, has_causal_mask); } CUDA_CALL_THROW(cudaPeekAtLastError()); } - bool isValid(int s) const { - if (is_flash_attention(s)) { - return (flash_attention_kernel != nullptr) && flash_attention_kernel->isValid(s); + bool IsValid(int sequence_length) const { + if (UseFlashAttention(sequence_length)) { + return (flash_kernel_ != nullptr) && flash_kernel_->isValid(sequence_length); } - return xmmaKernel->isValid(s); + return xmma_kernel_->isValid(sequence_length); } - int getSFromMaxSeqLen(const int max_seq_len) const { - if (is_flash_attention(max_seq_len)) { + int NormalizeSequenceLength(int max_seq_len) const { + if (UseFlashAttention(max_seq_len)) { return max_seq_len; } - int seq_len = max_seq_len; + int sequence_length = max_seq_len; if (max_seq_len <= 32) { - seq_len = (sm == 70) ? 64 : 32; + sequence_length = (sm_ == 70) ? 64 : 32; } else if (max_seq_len <= 64) { - seq_len = 64; + sequence_length = 64; } else if (max_seq_len <= 96) { - seq_len = 96; + sequence_length = 96; } else if (max_seq_len <= 128) { - seq_len = 128; + sequence_length = 128; } else if (max_seq_len <= 192) { - seq_len = (sm == 70) ? 256 : 192; + sequence_length = (sm_ == 70) ? 256 : 192; } else if (max_seq_len <= 256) { - seq_len = 256; + sequence_length = 256; } else if (max_seq_len <= 384) { - seq_len = 384; + sequence_length = 384; } - return seq_len; + return sequence_length; } protected: - bool is_flash_attention(const int seq_len) const { - ORT_ENFORCE(interface->mHasCausalMask == false); - return interface->mEnableFlashAttention && seq_len >= kMinSequenceLengthFlashAttention; + bool UseFlashAttention(int sequence_length) const { + ORT_ENFORCE(interface_->is_causal_ == false); + return interface_->enable_flash_attention_ && sequence_length >= kMinSequenceLengthFlashAttention; } private: - FusedMHARunnerFP16v2* interface; - Fused_multihead_attention_params_v2 params; - int sm; - const FusedMultiHeadAttentionXMMAKernelV2* xmmaKernel; - const FusedMultiHeadFlashAttentionKernel* flash_attention_kernel; - size_t xmmas_m; - size_t threads_per_cta; - bool use_flash_attention = false; - bool has_causal_mask = false; + FusedMHARunnerFP16v2* interface_; + int sm_; + const FusedMultiHeadAttentionXMMAKernelV2* xmma_kernel_; + const FusedMultiHeadFlashAttentionKernel* flash_kernel_; }; -FusedMHARunnerFP16v2::FusedMHARunnerFP16v2(const int numHeads, - const int headSize, - const int sm, - bool causal_mask, +FusedMHARunnerFP16v2::FusedMHARunnerFP16v2(int num_heads, + int head_size, + int sm, + bool causal, bool enable_flash_attention, - const float scale) - : MHARunner(numHeads, headSize, 2, causal_mask, scale), - mSm(sm), - mEnableFlashAttention(enable_flash_attention), - pimpl(new mhaImpl(this)) { + float scale) + : MHARunner(num_heads, head_size, causal, scale), + enable_flash_attention_(enable_flash_attention), + impl_(new FmhaImpl(this, sm)) { } -void FusedMHARunnerFP16v2::setup(const int seq_len, const int B) { - MHARunner::setup(seq_len, B); - if (mHasCausalMask) { - pimpl->setup_causal_masked_fmha(seq_len, B); - } else { - pimpl->setup(seq_len, B); - } -} - -bool FusedMHARunnerFP16v2::is_supported(int sm, int head_size, int sequence_length, - bool enable_flash_attention, bool causal) { +bool FusedMHARunnerFP16v2::IsSupported(int sm, int head_size, int sequence_length, + bool enable_flash_attention, bool causal) { if (causal) { if (!(sm == kSM_70 
|| sm == kSM_75 || sm == kSM_80 || sm == kSM_86 || sm == kSM_89)) { return false; @@ -284,34 +280,44 @@ bool FusedMHARunnerFP16v2::is_supported(int sm, int head_size, int sequence_leng return sequence_length <= max_sequence_length; } -size_t FusedMHARunnerFP16v2::getWorkspaceSize() const { - return 0; -} +void FusedMHARunnerFP16v2::Run(int batch_size, + int normalized_sequence_length, + const void* input, + const void* cu_seqlens, + void* output, + cudaStream_t stream) const { + Fused_multihead_attention_params_v2 params; + bool use_flash_attention = false; + if (is_causal_) { + impl_->SetupCausal(params, normalized_sequence_length, batch_size, use_flash_attention); + } else { + impl_->Setup(params, normalized_sequence_length, batch_size, use_flash_attention); + } -void FusedMHARunnerFP16v2::run(const void* input, const void* cu_seqlens, void* output, cudaStream_t stream) { - pimpl->run(input, cu_seqlens, output, stream); + impl_->Run(params, input, cu_seqlens, output, stream, use_flash_attention, is_causal_); } -bool FusedMHARunnerFP16v2::isValid(int s) const { - return pimpl->isValid(s); +bool FusedMHARunnerFP16v2::IsValid(int normalized_sequence_length) const { + return impl_->IsValid(normalized_sequence_length); } -int FusedMHARunnerFP16v2::getSFromMaxSeqLen(const int max_seq_len) const { - return pimpl->getSFromMaxSeqLen(max_seq_len); +int FusedMHARunnerFP16v2::NormalizeSequenceLength(int max_seq_len) const { + return impl_->NormalizeSequenceLength(max_seq_len); } -std::unique_ptr FusedMHARunnerFP16v2::Create(const int numHeads, - const int headSize, - const int sm, - bool causal_mask, - bool enable_flash_attention, - const float scale) { +std::unique_ptr FusedMHARunnerFP16v2::Create(int num_heads, + int head_size, + int sm, + bool causal, + bool enable_flash_attention, + const float scale) { #ifdef _MSC_VER - return std::make_unique(numHeads, headSize, sm, causal_mask, enable_flash_attention, scale); + return std::make_unique(num_heads, head_size, sm, causal, enable_flash_attention, scale); #else - // Linux build has error using make_unique: invalid application of ‘sizeof’ to incomplete type ‘onnxruntime::contrib::cuda::FusedMHARunnerFP16v2::mhaImpl + // Linux build has error using make_unique: invalid application of ‘sizeof’ to + // incomplete type ‘onnxruntime::contrib::cuda::FusedMHARunnerFP16v2::FmhaImpl std::unique_ptr runner; - runner.reset(new FusedMHARunnerFP16v2(numHeads, headSize, sm, causal_mask, enable_flash_attention, scale)); + runner.reset(new FusedMHARunnerFP16v2(num_heads, head_size, sm, causal, enable_flash_attention, scale)); return runner; #endif } diff --git a/onnxruntime/contrib_ops/cuda/bert/tensorrt_fused_multihead_attention/mha_runner.h b/onnxruntime/contrib_ops/cuda/bert/tensorrt_fused_multihead_attention/mha_runner.h index f7c1dc85361df..82914b07e524c 100644 --- a/onnxruntime/contrib_ops/cuda/bert/tensorrt_fused_multihead_attention/mha_runner.h +++ b/onnxruntime/contrib_ops/cuda/bert/tensorrt_fused_multihead_attention/mha_runner.h @@ -14,6 +14,10 @@ * limitations under the License. */ +// Modifications: Update interface and implmentation to be thread-safe +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. 
+ #pragma once #include @@ -25,103 +29,70 @@ namespace cuda { constexpr int kMinSequenceLengthFlashAttention = 385; -// Multi-Head Attention runner class MHARunner { public: - MHARunner(const int numHeads, const int headSize, const int wordSize, bool causal_mask, const float scale) - : mS(0), - mB(0), - mOmatSize(0), - mNumMats(0), - mNumHeads(numHeads), - mHeadSize(headSize), - mWordSize(wordSize), - mLdQKV(0), - mStrideQKV(0), - mLdOut(0), - mStrideOut(0), - mScale(scale == 0.0f ? 1.f / sqrtf(static_cast(headSize)) - : scale), - mHasCausalMask(causal_mask) { + MHARunner(int num_heads, int head_size, bool causal, float scale) + : num_heads_(num_heads), + head_size_(head_size), + scale_(scale == 0.0f ? 1.f / sqrtf(static_cast(head_size)) : scale), + is_causal_(causal) { } virtual ~MHARunner() = default; - virtual void setup(const int S, const int B) { - ORT_ENFORCE(S > 0); - ORT_ENFORCE(B > 0); - - mB = B; - mS = S; - - mLdQKV = 3 * B * mNumHeads * mHeadSize; - mStrideQKV = 3 * mHeadSize; - - mLdOut = B * mNumHeads * mHeadSize; - mStrideOut = mHeadSize; - mOmatSize = S * S; - mNumMats = B * mNumHeads; - } - - virtual void run(const void* input, const void* cu_seqlens, void* output, cudaStream_t stream) = 0; - - virtual size_t getWorkspaceSize() const = 0; + virtual int NormalizeSequenceLength(int max_seq_len) const = 0; - virtual bool isValid(int s) const = 0; + virtual bool IsValid(int normalized_sequence_length) const = 0; - virtual int getSFromMaxSeqLen(const int max_seq_len) const = 0; + virtual void Run(int batch_size, + int normalized_sequence_length, + const void* input, + const void* cu_seqlens, + void* output, + cudaStream_t stream) const = 0; protected: - int mS; - int mB; - int mOmatSize; - int mNumMats; - int mNumHeads; - int mHeadSize; - int mWordSize; - int mLdQKV; - int mStrideQKV; - int mLdOut; - int mStrideOut; - - float mScale; - bool mHasCausalMask; + int num_heads_; + int head_size_; + float scale_; + bool is_causal_; }; class FusedMHARunnerFP16v2 : public MHARunner { public: - FusedMHARunnerFP16v2(const int numHeads, - const int headSize, - const int sm, - bool causal_mask, + FusedMHARunnerFP16v2(int num_heads, + int head_size, + int sm, + bool causal, bool enable_flash_attention, - const float scale); - ~FusedMHARunnerFP16v2() = default; // for pimpl - - virtual void setup(const int S, const int B) override; + float scale); - static bool is_supported(int sm, int head_size, int sequence_length, bool enable_flash_attention, bool causal); + ~FusedMHARunnerFP16v2() = default; // for impl_ - void run(const void* input, const void* cu_seqlens, void* output, cudaStream_t stream) override; + static bool IsSupported(int sm, int head_size, int sequence_length, bool enable_flash_attention, bool causal); - size_t getWorkspaceSize() const override; + static std::unique_ptr Create(int num_heads, + int head_size, + int sm, + bool causal, + bool enable_flash_attention, + float scale); - bool isValid(int s) const override; + bool IsValid(int normalized_sequence_length) const override; - int getSFromMaxSeqLen(const int max_seq_len) const override; + int NormalizeSequenceLength(int max_seq_len) const override; - static std::unique_ptr Create(const int numHeads, - const int headSize, - const int sm, - bool causal_mask, - bool enable_flash_attention, - const float scale); + void Run(int batch_size, + int normalized_sequence_length, + const void* input, + const void* cu_seqlens, + void* output, + cudaStream_t stream) const override; private: - int mSm; - bool mEnableFlashAttention; - class 
mhaImpl; - std::unique_ptr pimpl; + bool enable_flash_attention_; + class FmhaImpl; + std::unique_ptr impl_; }; } // namespace cuda diff --git a/onnxruntime/test/python/transformers/benchmark_mha.py b/onnxruntime/test/python/transformers/benchmark_mha.py index 797461bae2efd..111c417479d20 100644 --- a/onnxruntime/test/python/transformers/benchmark_mha.py +++ b/onnxruntime/test/python/transformers/benchmark_mha.py @@ -156,6 +156,49 @@ def shape_dict(self, input_format=None): ) return shapes + def symbolic_shape_dict(self, input_format=None): + input_format = input_format or self.input_format + if input_format == InputFormats.Q_K_V_BSNH_BNSH_BNSH: + # cross attention does not have past state + return { + "query": ("batch_size", "sequence_length", self.num_heads * self.head_size), + "key": ("batch_size", self.num_heads, "sequence_length", self.head_size), + "value": ("batch_size", self.num_heads, "sequence_length", self.head_size), + "output": ("batch_size", "sequence_length", self.num_heads * self.head_size), + } + + if self.use_kv_cache: + shapes = { + "past_key": ("batch_size", self.num_heads, "past_buffer_length", self.head_size), + "past_value": ("batch_size", self.num_heads, "past_buffer_length", self.head_size), + "output": ("batch_size", "sequence_length", self.num_heads * self.head_size), + "present_key": ("batch_size", self.num_heads, "present_buffer_length", self.head_size), + "present_value": ("batch_size", self.num_heads, "present_buffer_length", self.head_size), + } + else: + shapes = { + "output": ("batch_size", "sequence_length", self.num_heads * self.head_size), + } + + if input_format == InputFormats.QKV_BSN3H: + shapes.update({"query": ("batch_size", "sequence_length", self.num_heads, 3, self.head_size)}) + elif input_format == InputFormats.Q_KV_BSNH_BSN2H: + shapes.update( + { + "query": ("batch_size", "sequence_length", self.num_heads * self.head_size), + "key": ("batch_size", "sequence_length", self.num_heads, 2, self.head_size), + } + ) + else: # input_format == InputFormats.Q_K_V_BSNH_BSNH_BSNH + shapes.update( + { + "query": ("batch_size", "sequence_length", self.num_heads * self.head_size), + "key": ("batch_size", "sequence_length", self.num_heads * self.head_size), + "value": ("batch_size", "sequence_length", self.num_heads * self.head_size), + } + ) + return shapes + def random_inputs(self, seed: int = 123): device = self.device dtype = self.dtype @@ -215,7 +258,7 @@ def random_inputs(self, seed: int = 123): def get_input_output_names(self): if self.input_format == InputFormats.Q_K_V_BSNH_BNSH_BNSH: - return ["query", "key"], ["output"] + return ["query", "key", "value"], ["output"] if self.input_format == InputFormats.QKV_BSN3H: inputs, outputs = ["query"], ["output"] @@ -235,7 +278,7 @@ def fill_optional_mha_inputs(input_names): return input_names[:-2] + [""] * (len(inputs) - len(input_names)) + input_names[-2:] -def create_multi_head_attention_onnx_model(config: MultiHeadAttentionConfig): +def create_multi_head_attention_onnx_model(config: MultiHeadAttentionConfig, use_symbolic_shape=False): input_names, output_names = config.get_input_output_names() float_type = TensorProto.FLOAT16 if config.dtype == torch.float16 else TensorProto.FLOAT @@ -252,7 +295,7 @@ def create_multi_head_attention_onnx_model(config: MultiHeadAttentionConfig): ), ] - shape_dict = config.shape_dict() + shape_dict = config.symbolic_shape_dict() if use_symbolic_shape else config.shape_dict() inputs = [ helper.make_tensor_value_info(input_name, float_type, list(shape_dict[input_name])) for 
input_name in input_names diff --git a/onnxruntime/test/python/transformers/test_mha.py b/onnxruntime/test/python/transformers/test_mha.py index 5335e7115ad78..ff473cc2ced92 100644 --- a/onnxruntime/test/python/transformers/test_mha.py +++ b/onnxruntime/test/python/transformers/test_mha.py @@ -7,17 +7,39 @@ Test MultiHeadAttention operator for CUDA and CPU. """ +import concurrent.futures import itertools import unittest -from typing import Optional +from enum import IntEnum +from typing import Dict, List, Optional import numpy import torch -from benchmark_mha import InputFormats, MultiHeadAttentionConfig, OrtMultiHeadAttention +from benchmark_mha import ( + InputFormats, + MultiHeadAttentionConfig, + OrtMultiHeadAttention, + create_multi_head_attention_onnx_model, +) from einops import rearrange from parameterized import parameterized import onnxruntime +from onnxruntime import InferenceSession + + +class SdpaKernel(IntEnum): + """Bit flags for sdpa_kernel CUDA provider option""" + + DEFAULT = 0 + FLASH_ATTENTION = 1 + EFFICIENT_ATTENTION = 2 + TRT_FUSED_ATTENTION = 4 + CUDNN_FLASH_ATTENTION = 8 + MATH = 16 + TRT_FLASH_ATTENTION = 32 + TRT_CROSS_ATTENTION = 64 + TRT_CAUSAL_ATTENTION = 128 def attention_reference( @@ -105,9 +127,16 @@ def mha_with_past_reference( def get_provider_support_info(provider: str, use_kv_cache: bool): if provider == "CUDAExecutionProvider": - formats = [InputFormats.Q_K_V_BSNH_BSNH_BSNH, InputFormats.Q_KV_BSNH_BSN2H, InputFormats.QKV_BSN3H] if not use_kv_cache: - formats.append(InputFormats.Q_K_V_BSNH_BSNH_BSNH) + formats = [ + InputFormats.Q_K_V_BSNH_BSNH_BSNH, + InputFormats.Q_KV_BSNH_BSN2H, + InputFormats.QKV_BSN3H, + InputFormats.Q_K_V_BSNH_BNSH_BNSH, + ] + else: + formats = [InputFormats.Q_K_V_BSNH_BSNH_BSNH] + device_id = torch.cuda.current_device() device = torch.device("cuda", device_id) dtype = torch.float16 @@ -121,15 +150,16 @@ def get_provider_support_info(provider: str, use_kv_cache: bool): return device, dtype, formats -def has_cuda_support(): +def get_compute_capability(): if torch.cuda.is_available() and "CUDAExecutionProvider" in onnxruntime.get_available_providers(): - major, _ = torch.cuda.get_device_capability() - return major >= 6 - return False + major, minor = torch.cuda.get_device_capability() + sm = major * 10 + minor + return sm + return 0 def no_kv_cache_test_cases(provider: str, comprehensive: bool): - if provider == "CUDAExecutionProvider" and not has_cuda_support(): + if provider == "CUDAExecutionProvider" and get_compute_capability() < 60: return yield @@ -192,7 +222,7 @@ def no_kv_cache_test_cases(provider: str, comprehensive: bool): def kv_cache_test_cases(provider: str, comprehensive: bool): - if provider == "CUDAExecutionProvider" and not has_cuda_support(): + if provider == "CUDAExecutionProvider" and get_compute_capability() < 60: return yield @@ -262,6 +292,92 @@ def mha_test_cases(provider: str, comprehensive: bool): ) +def no_kv_cache_multi_thread_test_cases(provider: str, comprehensive: bool): + if provider == "CUDAExecutionProvider" and get_compute_capability() < 60: + return + yield + + batch_sizes = [1, 2] + sequence_lengths = [1, 16, 127, 128, 255, 256, 383, 384, 400] if comprehensive else [1, 64, 128, 256] + heads = [4] + head_sizes = [8, 16, 32, 40, 64, 80, 96, 128, 160, 192, 224, 256] if comprehensive else [32, 64] + + device, dtype, formats = get_provider_support_info(provider, False) + + for format in formats: + for causal in [True, False]: + for num_heads in heads: + for head_size in head_sizes: + configs = [] # 
list of configurations to run in parallel + for batch_size in batch_sizes: + for sequence_length in sequence_lengths: + config = MultiHeadAttentionConfig( + batch_size=batch_size, + sequence_length=sequence_length, + num_heads=num_heads, + head_size=head_size, + causal=causal, + past_sequence_length=0, + kv_sequence_length=sequence_length, + max_cache_sequence_length=None, + provider=provider, + device=device, + dtype=dtype, + use_kv_cache=False, + share_past_present_buffer=False, + input_format=format, + ) + configs.append(config) + yield configs + + +def kv_cache_multi_thread_test_cases(provider: str, comprehensive: bool): + if provider == "CUDAExecutionProvider" and get_compute_capability() < 60: + return + yield + + batch_sizes = [1, 2] + sequence_lengths = [1, 32, 127, 128, 383, 384, 400] if comprehensive else [1, 32, 127, 128] + heads = [4] + head_sizes = [8, 16, 32, 40, 64, 80, 96, 128, 160, 192, 224, 256] if comprehensive else [32, 64] + + sequence_length = 1 + device, dtype, formats = get_provider_support_info(provider, True) + + for format in formats: + for causal in [True, False]: + for num_heads in heads: + for head_size in head_sizes: + configs = [] + for batch_size in batch_sizes: + for past_sequence_length in sequence_lengths: + config = MultiHeadAttentionConfig( + batch_size=batch_size, + sequence_length=sequence_length, + num_heads=num_heads, + head_size=head_size, + causal=causal, + past_sequence_length=past_sequence_length, + kv_sequence_length=sequence_length, + max_cache_sequence_length=None, + provider=provider, + device=device, + dtype=dtype, + use_kv_cache=True, + share_past_present_buffer=False, + input_format=format, + ) + configs.append(config) + yield configs + + +def multi_thread_test_cases(provider: str, comprehensive: bool): + return itertools.chain( + no_kv_cache_multi_thread_test_cases(provider, comprehensive), + kv_cache_multi_thread_test_cases(provider, comprehensive), + ) + + def causal_mask(seqlen_q, seqlen_k, query_padding_mask=None, key_padding_mask=None, device=None): row_idx = rearrange(torch.arange(seqlen_q, device=device, dtype=torch.long), "s -> s 1") col_idx = torch.arange(seqlen_k, device=device, dtype=torch.long) @@ -346,20 +462,189 @@ def parity_check_mha( ) +def parity_check_mha_multi_threading( + test_inputs: List[Dict], + rtol: float = 1e-3, + atol: float = 1e-3, + sdpa_kernel: int = SdpaKernel.DEFAULT, + max_threads: int = 5, + verbose: bool = False, +): + # Use the first config to create a session, which is shared by all configs to run in parallel. + config = test_inputs[0]["config"] + # For now, MHA CUDA kernel does not support causal so skip such test cases. + if config.causal and config.provider == "CUDAExecutionProvider": + return None + # Some kernel does not support certain input format. 
+ if sdpa_kernel not in [ + SdpaKernel.DEFAULT, + SdpaKernel.FLASH_ATTENTION, + SdpaKernel.EFFICIENT_ATTENTION, + ] and config.input_format in [InputFormats.Q_KV_BSNH_BSN2H]: + return None + if verbose: + print(f"create a shared session with {vars(config)}") + onnx_model_str = create_multi_head_attention_onnx_model(config, use_symbolic_shape=True) + if config.provider == "CUDAExecutionProvider": + provider_options = {"arena_extend_strategy": "kSameAsRequested", "sdpa_kernel": int(sdpa_kernel)} + providers = [(config.provider, provider_options), "CPUExecutionProvider"] + else: + providers = ["CPUExecutionProvider"] + ort_session = InferenceSession(onnx_model_str, providers=providers) + + def convert_to_ort_inputs(feed_dict): + ort_inputs = {} + + for k, v in feed_dict.items(): + if isinstance(v, numpy.ndarray): + ort_inputs[k] = v + else: + ort_inputs[k] = v.detach().cpu().numpy() + return ort_inputs + + def check_parity_with_config(i: int): + config = test_inputs[i]["config"] + if verbose: + print(f"Thread {i} with {vars(config)}") + + ort_inputs = test_inputs[i]["ort_inputs"] + + if verbose: + print(f"Thread {i} ort inputs: {ort_inputs}") + ort_outputs = ort_session.run(None, convert_to_ort_inputs(ort_inputs)) + out = numpy.reshape( + ort_outputs[0], (config.batch_size, config.sequence_length, config.num_heads, config.head_size) + ) + + # Create reference inputs + config.input_format = InputFormats.Q_K_V_BSNH_BSNH_BSNH + ref_inputs = test_inputs[i]["ref_inputs"] + if verbose: + print(f"Thread {i} ref inputs: {ref_inputs}") + q = ( + ref_inputs["query"] + .reshape((config.batch_size, config.sequence_length, config.num_heads, config.head_size)) + .transpose(1, 2) + ) + k = ( + ref_inputs["key"] + .reshape((config.batch_size, config.kv_sequence_length, config.num_heads, config.head_size)) + .transpose(1, 2) + ) + v = ( + ref_inputs["value"] + .reshape((config.batch_size, config.kv_sequence_length, config.num_heads, config.head_size)) + .transpose(1, 2) + ) + + mask = None + if config.causal: + mask = causal_mask(config.sequence_length, config.total_sequence_length, device=config.device) + + k_cache = None + v_cache = None + if config.use_kv_cache: + past_k = ref_inputs["past_key"] + past_v = ref_inputs["past_value"] + out_ref, k_cache, v_cache = mha_with_past_reference(config, past_k, past_v, q, k, v, mask=mask) + else: + out_ref = attention_reference(config.head_size, q, k, v, mask=mask) + + try: + numpy.testing.assert_allclose( + out, + out_ref.detach().cpu().numpy(), + rtol=rtol, + atol=atol, + equal_nan=True, + err_msg=f"output not close: {config=}", + ) + + if config.use_kv_cache: + present_key = ort_outputs[1] + numpy.testing.assert_allclose( + k_cache.detach().cpu().numpy(), + present_key, + rtol=rtol, + atol=atol, + equal_nan=True, + err_msg=f"present_key not close: {config=}", + ) + + present_value = ort_outputs[2] + numpy.testing.assert_allclose( + v_cache.detach().cpu().numpy(), + present_value, + rtol=rtol, + atol=atol, + equal_nan=True, + err_msg=f"present_value not close: {config=}", + ) + except AssertionError as e: + print(f"Failed with {vars(config)}: {e}") + return e + + if verbose: + print(f"Passed: {vars(config)}") + return None + + num_threads = min(max_threads, len(test_inputs)) + + with concurrent.futures.ThreadPoolExecutor(max_workers=num_threads) as executor: + future_tasks = [executor.submit(check_parity_with_config, i) for i in range(num_threads)] + for future in concurrent.futures.as_completed(future_tasks): + result = future.result() + if result is not None: + 
return result + + return None + + # Do not run too many tests in CI pipeline. Change it to True to run all combinations in dev machine. comprehensive_mode = False class TestMultiHeadAttention(unittest.TestCase): - # TODO: enable tests on CUDAExecutionProvider after fixing the issue. - # @parameterized.expand(mha_test_cases("CUDAExecutionProvider", comprehensive_mode), skip_on_empty=True) - # def test_mha_cuda(self, config): - # parity_check_mha(config) + @parameterized.expand(mha_test_cases("CUDAExecutionProvider", comprehensive_mode), skip_on_empty=True) + def test_mha_cuda(self, config): + parity_check_mha(config) @parameterized.expand(mha_test_cases("CPUExecutionProvider", comprehensive_mode), skip_on_empty=True) def test_mha_cpu(self, config): parity_check_mha(config) + def run_mha_cuda_multi_threading(self, spda_kernel): + for configs in multi_thread_test_cases("CUDAExecutionProvider", comprehensive_mode): + test_inputs = [] + for config in configs: + ort_inputs = config.random_inputs() + + # Create reference inputs + old_format = config.input_format + config.input_format = InputFormats.Q_K_V_BSNH_BSNH_BSNH + ref_inputs = config.random_inputs() + config.input_format = old_format + test_inputs.append({"config": config, "ort_inputs": ort_inputs, "ref_inputs": ref_inputs}) + + exception = parity_check_mha_multi_threading(test_inputs, sdpa_kernel=spda_kernel, max_threads=len(configs)) + assert exception is None, f"{spda_kernel=}, {vars(configs[0])}, {exception}" + + def test_mha_cuda_multi_threading(self): + self.run_mha_cuda_multi_threading(SdpaKernel.DEFAULT) + + def test_mha_cuda_multi_threading_efficient(self): + self.run_mha_cuda_multi_threading(SdpaKernel.EFFICIENT_ATTENTION) + + def test_mha_cuda_multi_threading_trt(self): + sm = get_compute_capability() + if sm in [75, 80, 86, 89]: + self.run_mha_cuda_multi_threading( + SdpaKernel.TRT_FUSED_ATTENTION + | SdpaKernel.TRT_FLASH_ATTENTION + | SdpaKernel.TRT_CROSS_ATTENTION + | SdpaKernel.TRT_CAUSAL_ATTENTION + ) + if __name__ == "__main__": with torch.no_grad(): From 17e9ea62352b71e3d432a66b70e666ade128cac6 Mon Sep 17 00:00:00 2001 From: Wanming Lin Date: Tue, 23 Jul 2024 02:56:09 +0800 Subject: [PATCH 29/35] [WebNN EP] Add outputDataType option for the ArgMax/ArgMin ops (#21385) ### Description WebNN spec introduces a new option: `outputDataType` to `argMax` and `argMin` ops, it's default value is `int32`, we should explicitly set it to `int64` for WebNN EP. Spec CR: "Add outputDataType to argmin/argmax" https://github.com/webmachinelearning/webnn/pull/730 --- .../providers/webnn/builders/impl/argmax_min_op_builder.cc | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/onnxruntime/core/providers/webnn/builders/impl/argmax_min_op_builder.cc b/onnxruntime/core/providers/webnn/builders/impl/argmax_min_op_builder.cc index f8b77b6350a76..1330a3e354871 100644 --- a/onnxruntime/core/providers/webnn/builders/impl/argmax_min_op_builder.cc +++ b/onnxruntime/core/providers/webnn/builders/impl/argmax_min_op_builder.cc @@ -50,6 +50,11 @@ Status ArgMaxMinOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, options.set("axes", axes); options.set("keepDimensions", keep_dims == 1); options.set("selectLastIndex", select_last_index == 1); + // TODO: use WebNN's opSupportLimits API to check the backend's supported output data types. + // If the backend doesn't support int64 output, we should use default int32 output data type + // then do a type casting (int32 -> int64) for the output. 
Refer to the CoreML EP for how to + // support int64 output. + options.set("outputDataType", "int64"); emscripten::val output = emscripten::val::object(); const auto& op_type = node.OpType(); From 4e75605eec985e579ee1e8db9a2bb2fd441a4837 Mon Sep 17 00:00:00 2001 From: Jian Chen Date: Mon, 22 Jul 2024 12:39:10 -0700 Subject: [PATCH 30/35] Replace inline pip install with pip install from requirements*.txt (#21106) ### Description Replace inline pip install with pip install from requirements*.txt ### Motivation and Context so that CG can recognize ### Dependency - [x] https://github.com/microsoft/onnxruntime/pull/21085 --- tools/ci_build/build.py | 27 +++++++++---------- .../orttraining-pai-ci-pipeline.yml | 2 +- .../requirements/pybind/requirements.txt | 8 ++++++ .../transformers-test/requirements.txt} | 0 4 files changed, 21 insertions(+), 16 deletions(-) create mode 100644 tools/ci_build/requirements/pybind/requirements.txt rename tools/ci_build/{requirements-transformers-test.txt => requirements/transformers-test/requirements.txt} (100%) diff --git a/tools/ci_build/build.py b/tools/ci_build/build.py index 75fbf5d0851ae..54f7b6c3a8fa7 100644 --- a/tools/ci_build/build.py +++ b/tools/ci_build/build.py @@ -261,9 +261,6 @@ def convert_arg_line_to_args(self, arg_line): "--wheel_name_suffix", help="Suffix to append to created wheel names. This value is currently only used for nightly builds.", ) - parser.add_argument( - "--numpy_version", help="Installs a specific version of numpy before building the python binding." - ) parser.add_argument("--skip-keras-test", action="store_true", help="Skip tests with Keras if keras is installed") # C-Sharp bindings @@ -868,16 +865,6 @@ def update_submodules(source_dir): run_subprocess(["git", "submodule", "update", "--init", "--recursive"], cwd=source_dir) -def install_python_deps(numpy_version=""): - dep_packages = ["setuptools", "wheel", "pytest"] - dep_packages.append(f"numpy=={numpy_version}" if numpy_version else "numpy>=1.16.6") - dep_packages.append("sympy>=1.10") - dep_packages.append("packaging") - dep_packages.append("cerberus") - dep_packages.append("psutil") - run_subprocess([sys.executable, "-m", "pip", "install", *dep_packages]) - - def setup_test_data(source_onnx_model_dir, dest_model_dir_name, build_dir, configs): # create the symlink/shortcut of onnx models dir under build_dir # currently, there're 2 sources of onnx models, one is build in OS image, another is @@ -2146,7 +2133,14 @@ def run_onnxruntime_tests(args, source_dir, ctest_path, build_dir, configs): numpy_init_version = numpy.__version__ pb_init_version = google.protobuf.__version__ run_subprocess( - [sys.executable, "-m", "pip", "install", "-r", "requirements-transformers-test.txt"], + [ + sys.executable, + "-m", + "pip", + "install", + "-r", + "requirements/transformers-test/requirements.txt", + ], cwd=SCRIPT_DIR, ) run_subprocess([sys.executable, "-m", "pytest", "transformers"], cwd=cwd) @@ -2818,7 +2812,10 @@ def main(): run_subprocess([emsdk_file, "activate", emsdk_version], cwd=emsdk_dir) if args.enable_pybind and is_windows(): - install_python_deps(args.numpy_version) + run_subprocess( + [sys.executable, "-m", "pip", "install", "-r", "requirements/pybind/requirements.txt"], + cwd=SCRIPT_DIR, + ) if args.use_rocm and args.rocm_version is None: args.rocm_version = "" diff --git a/tools/ci_build/github/azure-pipelines/orttraining-pai-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/orttraining-pai-ci-pipeline.yml index 0e1afdcc5b8ca..2c520a25cb39e 100644 --- 
a/tools/ci_build/github/azure-pipelines/orttraining-pai-ci-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/orttraining-pai-ci-pipeline.yml @@ -276,7 +276,7 @@ jobs: onnxruntimetrainingrocm-cibuild-rocm$(RocmVersion)-test \ /bin/bash -c " set -ex; \ - pip install -r /onnxruntime_src/tools/ci_build/requirements-transformers-test.txt; \ + pip install -r /onnxruntime_src/tools/ci_build/requirements/transformers-test/requirements.txt; \ pytest /onnxruntime_src/onnxruntime/test/python/transformers/test_flash_attn_rocm.py -v -n 4 --reruns 1" workingDirectory: $(Build.SourcesDirectory) displayName: 'Run tranformers tests' diff --git a/tools/ci_build/requirements/pybind/requirements.txt b/tools/ci_build/requirements/pybind/requirements.txt new file mode 100644 index 0000000000000..8f00a25627c21 --- /dev/null +++ b/tools/ci_build/requirements/pybind/requirements.txt @@ -0,0 +1,8 @@ +setuptools +wheel +pytest +numpy>=1.19.0 +sympy>=1.10 +packaging +cerberus +psutil diff --git a/tools/ci_build/requirements-transformers-test.txt b/tools/ci_build/requirements/transformers-test/requirements.txt similarity index 100% rename from tools/ci_build/requirements-transformers-test.txt rename to tools/ci_build/requirements/transformers-test/requirements.txt From 5b9369e93c704f55fd6a4e49934f41dccffe55a5 Mon Sep 17 00:00:00 2001 From: mindest <30493312+mindest@users.noreply.github.com> Date: Tue, 23 Jul 2024 04:37:32 +0800 Subject: [PATCH 31/35] Fix typos according to reviewdog report. (#21335) ### Description Fix typos based on reviewdog report but with some exceptions/corrections. --- .gitattributes | 2 +- ThirdPartyNotices.txt | 2 +- cmake/onnxruntime.cmake | 2 +- .../composable_kernel/Fix_Clang_Build.patch | 2 +- .../default/partials/title.tmpl.partial | 2 +- dockerfiles/README.md | 4 +- .../onnx-inference-byoc-gpu-cpu-aks.ipynb | 4 +- .../platform/EigenNonBlockingThreadPool.h | 2 +- .../core/providers/cuda/cuda_context.h | 12 +- .../core/providers/cuda/cuda_resource.h | 2 +- .../core/providers/rocm/rocm_context.h | 9 +- .../core/providers/rocm/rocm_resource.h | 2 +- .../core/session/onnxruntime_c_api.h | 19 ++-- .../core/session/onnxruntime_lite_custom_op.h | 2 +- java/build.gradle | 2 +- .../main/java/ai/onnxruntime/OnnxRuntime.java | 2 +- .../onnxruntime/providers/package-info.java | 2 +- java/src/test/java/sample/ScoreMNIST.java | 2 +- .../backends/webgl/glsl-coordinate-lib.ts | 2 +- js/web/lib/onnxjs/backends/webgl/ops/pack.ts | 2 +- .../cpu/attnlstm/deep_cpu_attn_lstm.h | 2 +- .../cpu/transformers/sampling_cpu_helper.h | 2 +- onnxruntime/core/codegen/common/common.cc | 2 +- onnxruntime/core/codegen/mti/common.h | 2 +- .../passes/scheduler/schedule_utils.cc | 4 +- .../codegen/passes/scheduler/schedule_utils.h | 4 +- .../passes/scheduler/tvm_schedule_builder.cc | 2 +- .../passes/weight_layout/weight_layout.h | 2 +- onnxruntime/core/common/logging/logging.cc | 2 +- onnxruntime/core/common/status.cc | 2 +- .../core/framework/allocation_planner.cc | 4 +- .../core/framework/allocation_planner.h | 6 +- .../framework/device_stream_collection.cc | 3 +- onnxruntime/core/framework/execution_frame.h | 2 +- .../partial_graph_execution_state.cc | 2 +- .../framework/sequential_execution_plan.h | 6 +- .../core/framework/sequential_executor.cc | 2 +- onnxruntime/core/framework/session_options.h | 2 +- onnxruntime/core/framework/session_state.cc | 4 +- onnxruntime/core/framework/sparse_tensor.cc | 4 +- onnxruntime/core/framework/tensorprotoutils.h | 6 +- onnxruntime/core/framework/utils.h | 2 +- 
.../mickey/gemm/warp/quantb_meta_loader.h | 8 +- onnxruntime/core/mlas/lib/convolve.cpp | 4 +- .../core/optimizer/attention_fusion_helper.h | 4 +- .../free_dim_override_transformer.cc | 4 +- .../core/optimizer/insert_cast_transformer.cc | 6 +- .../onnx_transpose_optimization.cc | 4 +- .../core/providers/acl/nn/batch_norm.cc | 2 +- onnxruntime/core/providers/acl/nn/pool.cc | 2 +- .../providers/armnn/activation/activations.cc | 2 +- onnxruntime/core/providers/armnn/math/gemm.h | 2 +- .../core/providers/armnn/nn/batch_norm.cc | 2 +- onnxruntime/core/providers/armnn/nn/conv.cc | 2 +- onnxruntime/core/providers/armnn/nn/pool.cc | 4 +- .../math/einsum_utils/einsum_auxiliary_ops.cc | 4 +- .../einsum_typed_compute_processor.cc | 2 +- .../cpu/object_detection/roialign.cc | 2 +- .../providers/cpu/sequence/sequence_ops.cc | 2 +- .../core/providers/cpu/tensor/unique.cc | 2 +- .../core/providers/cuda/cuda_allocator.cc | 2 +- .../core/providers/cuda/cuda_stream_handle.cc | 2 +- .../cuda/math/softmax_blockwise_impl.cuh | 2 +- onnxruntime/core/providers/cuda/nn/conv.cc | 4 +- .../cuda/object_detection/roialign_impl.cu | 106 +++++++++--------- .../providers/cuda/reduction/reduction_ops.cc | 2 +- .../core/providers/cuda/tensor/resize_impl.cu | 6 +- .../providers/cuda/tensor/transpose_impl.cu | 4 +- .../src/Operators/DmlOperatorFusedMatMul.cpp | 2 +- .../providers/dnnl/dnnl_execution_provider.cc | 2 +- .../providers/dnnl/dnnl_node_capability.cc | 6 +- .../core/providers/dnnl/subgraph/dnnl_conv.h | 8 +- .../providers/dnnl/subgraph/dnnl_convgrad.cc | 2 +- .../providers/dnnl/subgraph/dnnl_convgrad.h | 4 +- .../dnnl/subgraph/dnnl_dequantizelinear.cc | 2 +- .../providers/dnnl/subgraph/dnnl_matmul.cc | 8 +- .../providers/dnnl/subgraph/dnnl_reduce.cc | 8 +- .../dnnl/subgraph/dnnl_subgraph_primitive.h | 2 +- .../providers/dnnl/subgraph/dnnl_transpose.cc | 5 +- .../providers/migraphx/migraphx_allocator.cc | 2 +- .../migraphx/migraphx_stream_handle.cc | 2 +- .../nnapi_lib/nnapi_implementation.cc | 2 +- .../rknpu/rknpu_execution_provider.cc | 4 +- onnxruntime/core/providers/rocm/nn/conv.cc | 2 +- .../providers/rocm/reduction/reduction_ops.cc | 2 +- .../core/providers/rocm/rocm_allocator.cc | 2 +- .../core/providers/rocm/rocm_stream_handle.cc | 2 +- .../webnn/webnn_execution_provider.cc | 2 +- onnxruntime/core/session/IOBinding.cc | 2 +- onnxruntime/core/session/inference_session.cc | 2 +- onnxruntime/core/session/inference_session.h | 2 +- onnxruntime/core/util/qmath.h | 8 +- .../python/onnxruntime_pybind_schema.cc | 2 +- .../onnxruntime_pybind_sparse_tensor.cc | 4 +- .../python/onnxruntime_pybind_state.cc | 18 +-- .../python/tools/quantization/calibrate.py | 4 +- .../tools/quantization/operators/direct_q8.py | 2 +- .../python/tools/quantization/quant_utils.py | 2 +- .../python/tools/transformers/README.md | 2 +- .../python/tools/transformers/benchmark.py | 2 +- .../transformers/large_model_exporter.py | 2 +- .../models/gpt2/benchmark_gpt2.py | 2 +- .../models/gpt2/convert_to_onnx.py | 2 +- .../transformers/models/gpt2/gpt2_tester.py | 2 +- .../PyTorch_Bert-Squad_OnnxRuntime_GPU.ipynb | 2 +- .../tools/transformers/onnx_model_phi.py | 2 +- .../tools/transformers/onnx_model_tnlr.py | 2 +- .../python/tools/transformers/optimizer.py | 2 +- .../tools/transformers/shape_optimizer.py | 2 +- .../contrib_ops/attention_lstm_op_test.cc | 16 +-- .../test/framework/allocation_planner_test.cc | 2 +- .../test/framework/inference_session_test.cc | 2 +- onnxruntime/test/framework/tunable_op_test.cc | 2 +- 
.../test/fuzzing/include/BetaDistribution.h | 4 +- onnxruntime/test/fuzzing/src/test.cpp | 2 +- onnxruntime/test/ir/graph_test.cc | 4 +- .../test/ir/schema_registry_manager_test.cc | 4 +- .../test/mlas/unittest/test_fgemm_fixture.h | 2 +- .../test/mlas/unittest/test_halfgemm.cpp | 2 +- .../test/mlas/unittest/test_pool2d_fixture.h | 2 +- .../test/mlas/unittest/test_pool3d_fixture.h | 2 +- .../test/mlas/unittest/test_qgemm_fixture.h | 2 +- .../test/mlas/unittest/test_sbgemm.cpp | 4 +- .../mlas/unittest/test_symm_qgemm_fixture.h | 2 +- .../optimizer/transpose_optimizer_test.cc | 4 +- onnxruntime/test/perftest/ReadMe.txt | 2 +- onnxruntime/test/perftest/ort_test_session.cc | 2 +- .../platform/android/cxa_demangle_test.cc | 2 +- .../providers/cpu/controlflow/scan_test.cc | 2 +- .../cpu/rnn/deep_cpu_lstm_op_test.cc | 4 +- .../cuda/test_cases/allocator_cuda_test.cc | 2 +- .../matmul_post_op_transform_test.cc | 6 +- .../internal_testing_partitioning_tests.cc | 2 +- .../test/providers/qnn/qnn_ep_context_test.cc | 4 +- .../test/python/onnxruntime_test_python.py | 2 +- .../python/onnxruntime_test_python_mlops.py | 2 +- .../onnxruntime_test_python_sparse_matmul.py | 2 +- .../test_quantize_static_resnet.py | 2 +- .../python/transformers/test_generation.py | 2 +- onnxruntime/test/shared_lib/test_inference.cc | 6 +- .../model_creation_for_testing.ipynb | 2 +- .../self_attention_megatron_basic_test.py | 2 +- ...transpose_optimizer_shared_initializers.py | 2 +- onnxruntime/wasm/api.h | 2 +- .../core/framework/adasum/adasum_interface.h | 2 +- .../core/framework/ortmodule_graph_builder.cc | 2 +- .../orttraining/core/framework/pipeline.cc | 2 +- .../core/graph/mixed_precision_transformer.cc | 2 +- .../core/graph/optimizer_graph_builder.h | 2 +- .../core/graph/pipeline_transformer.cc | 4 +- .../core/graph/training_op_defs.cc | 22 ++-- .../core/optimizer/graph_transformer_config.h | 2 +- .../core/session/training_session.cc | 4 +- orttraining/orttraining/models/bert/main.cc | 16 +-- orttraining/orttraining/models/mnist/main.cc | 16 +-- .../models/runner/training_runner.cc | 2 +- .../python/training/onnxblock/blocks.py | 4 +- .../python/training/ortmodule/__init__.py | 2 +- .../cuda/fused_ops/fused_ops_frontend.cpp | 6 +- .../test/distributed/partition_utils.h | 6 +- .../orttraining/test/graph/bert_toy_fetches.h | 2 +- .../python/orttraining_test_ortmodule_api.py | 2 +- .../test/python/qat_poc_example/qat.py | 2 +- .../training_ops/cuda/cross_entropy_test.cc | 2 +- .../orttraining/training_api/optimizer.cc | 6 +- .../orttraining/training_api/optimizer.h | 2 +- .../cpu/activation/activations_grad.cc | 2 +- .../training_ops/cuda/math/div_grad_impl.cu | 2 +- .../training_ops/cuda/optimizer/lamb.cc | 2 +- .../training_ops/cuda/optimizer/lamb_impl.cu | 2 +- .../tools/scripts/layer_norm_transform.py | 2 +- orttraining/tools/scripts/model_transform.py | 2 +- .../tools/scripts/opset12_model_transform.py | 2 +- rust/onnxruntime-sys/examples/c_api_sample.rs | 4 +- .../src/tensor/ort_output_tensor.rs | 2 +- tools/ci_build/build.py | 4 +- .../azure-pipelines/web-ci-pipeline.yml | 2 +- tools/python/util/android/android.py | 2 +- winml/adapter/winml_adapter_session.cpp | 2 +- ...rosoft.AI.MachineLearning.Experimental.idl | 2 +- winml/api/Windows.AI.MachineLearning.idl | 2 +- winml/lib/Api/LearningModelBinding.cpp | 2 +- winml/lib/Api/impl/NumericData.h | 2 +- .../test/api/LearningModelSessionAPITest.cpp | 2 +- winml/test/common/googleTestMacros.h | 2 +- winml/test/common/taefTestMacros.h | 2 +- winml/test/common/test.h | 6 +- 
winml/test/image/imagetests.cpp | 4 +- winml/test/model/model_tests.cpp | 2 +- 189 files changed, 380 insertions(+), 360 deletions(-) diff --git a/.gitattributes b/.gitattributes index 41eae6dac52f5..8bfd419922d6b 100644 --- a/.gitattributes +++ b/.gitattributes @@ -1,4 +1,4 @@ -# This sets the default behaviour, overriding core.autocrlf +# This sets the default behavior, overriding core.autocrlf * text=auto # All source files should have unix line-endings in the repository, diff --git a/ThirdPartyNotices.txt b/ThirdPartyNotices.txt index 8ec770da22159..6a11f414361bd 100644 --- a/ThirdPartyNotices.txt +++ b/ThirdPartyNotices.txt @@ -4820,7 +4820,7 @@ SOFTWARE. ---------------------------------------------------------------------------- -This is the MIT/Expat Licence. For more information see: +This is the MIT/Expat License. For more information see: 1. http://www.opensource.org/licenses/mit-license.php diff --git a/cmake/onnxruntime.cmake b/cmake/onnxruntime.cmake index 21ae0947f3788..0e89c2f14d34b 100644 --- a/cmake/onnxruntime.cmake +++ b/cmake/onnxruntime.cmake @@ -150,7 +150,7 @@ endif() if(CMAKE_SYSTEM_NAME STREQUAL "Android" AND onnxruntime_MINIMAL_BUILD) # target onnxruntime is a shared library, the dummy __cxa_demangle is only attach to it to avoid - # affecting downstream ort library users with the behaviour of dummy __cxa_demangle. So the dummy + # affecting downstream ort library users with the behavior of dummy __cxa_demangle. So the dummy # __cxa_demangle must not expose to libonnxruntime_common.a. It works as when the linker is # creating the DSO, our dummy __cxa_demangle always comes before libc++abi.a so the # __cxa_demangle in libc++abi.a is discarded, thus, huge binary size reduction. diff --git a/cmake/patches/composable_kernel/Fix_Clang_Build.patch b/cmake/patches/composable_kernel/Fix_Clang_Build.patch index 0352f8eb9bb34..73ece647d82c7 100644 --- a/cmake/patches/composable_kernel/Fix_Clang_Build.patch +++ b/cmake/patches/composable_kernel/Fix_Clang_Build.patch @@ -44,7 +44,7 @@ index c23746e7f..bc326c8b5 100644 find_package(HIP REQUIRED) # Override HIP version in config.h, if necessary. @@ -269,12 +248,6 @@ if( DEFINED CK_OVERRIDE_HIP_VERSION_PATCH ) - message(STATUS "CK_HIP_VERSION_PATCH overriden with ${CK_OVERRIDE_HIP_VERSION_PATCH}") + message(STATUS "CK_HIP_VERSION_PATCH overridden with ${CK_OVERRIDE_HIP_VERSION_PATCH}") endif() message(STATUS "Build with HIP ${HIP_VERSION}") -link_libraries(hip::device) diff --git a/csharp/ApiDocs/_exported_templates/default/partials/title.tmpl.partial b/csharp/ApiDocs/_exported_templates/default/partials/title.tmpl.partial index 38c62fe55f603..fd589fd74877c 100644 --- a/csharp/ApiDocs/_exported_templates/default/partials/title.tmpl.partial +++ b/csharp/ApiDocs/_exported_templates/default/partials/title.tmpl.partial @@ -39,7 +39,7 @@ Event {{name.0.value}} Operator {{name.0.value}} {{/inOperator}} {{#inEii}} -Explict Interface Implementation {{name.0.value}} +Explicit Interface Implementation {{name.0.value}} {{/inEii}} {{#inVariable}} Variable {{name.0.value}} diff --git a/dockerfiles/README.md b/dockerfiles/README.md index a2e99d66d4654..008587a01082b 100644 --- a/dockerfiles/README.md +++ b/dockerfiles/README.md @@ -32,7 +32,7 @@ docker run -it onnxruntime-source ``` -The docker file supports both x86_64 and ARM64(aarch64). You may use docker's "--platform" parameter to explictly specify which CPU architecture you want to build. For example: +The docker file supports both x86_64 and ARM64(aarch64). 
You may use docker's "--platform" parameter to explicitly specify which CPU architecture you want to build. For example: ```bash docker build --platform linux/arm64/v8 -f Dockerfile.source @@ -274,7 +274,7 @@ Note: You may add --use_tensorrt and --tensorrt_home options if you wish to use Note: Resulting Docker image will have ONNX Runtime installed in /usr, and ONNX Runtime wheel copied to /onnxruntime directory. Nothing else from ONNX Runtime source tree will be copied/installed to the image. -Note: When running the container you built in Docker, please either use 'nvidia-docker' command instead of 'docker', or use Docker command-line options to make sure NVIDIA runtime will be used and appropiate files mounted from host. Otherwise, CUDA libraries won't be found. You can also [set NVIDIA runtime as default in Docker](https://github.com/dusty-nv/jetson-containers#docker-default-runtime). +Note: When running the container you built in Docker, please either use 'nvidia-docker' command instead of 'docker', or use Docker command-line options to make sure NVIDIA runtime will be used and appropriate files mounted from host. Otherwise, CUDA libraries won't be found. You can also [set NVIDIA runtime as default in Docker](https://github.com/dusty-nv/jetson-containers#docker-default-runtime). ## MIGraphX **Ubuntu 20.04, ROCm6.0, MIGraphX** diff --git a/docs/python/notebooks/onnx-inference-byoc-gpu-cpu-aks.ipynb b/docs/python/notebooks/onnx-inference-byoc-gpu-cpu-aks.ipynb index be34a812c77db..c1278b63a84d3 100644 --- a/docs/python/notebooks/onnx-inference-byoc-gpu-cpu-aks.ipynb +++ b/docs/python/notebooks/onnx-inference-byoc-gpu-cpu-aks.ipynb @@ -64,7 +64,7 @@ "If you are using an Azure Machine Learning Notebook VM, you are all set. Otherwise, please follow the [Azure ML configuration notebook](https://github.com/Azure/MachineLearningNotebooks/blob/master/configuration.ipynb) to set up your environment.\n", "\n", "### Install additional packages needed for this Notebook\n", - "You need to install the popular plotting library matplotlib, the image manipulation library opencv, and the onnx library in the conda environment where Azure Maching Learning SDK is installed.\n", + "You need to install the popular plotting library matplotlib, the image manipulation library opencv, and the onnx library in the conda environment where Azure Machine Learning SDK is installed.\n", "\n", "```\n", "(myenv) $ pip install matplotlib onnx opencv-python\n", @@ -79,7 +79,7 @@ "source": [ "## 1. Obtain a model from the ONNX Model Zoo\n", "\n", - "For more information on the Facial Emotion Recognition (FER+) model, you can explore the notebook explaning how to deploy [FER+ with ONNX Runtime on an ACI Instance](onnx-inference-facial-expression-recognition-deploy.ipynb)." + "For more information on the Facial Emotion Recognition (FER+) model, you can explore the notebook explaining how to deploy [FER+ with ONNX Runtime on an ACI Instance](onnx-inference-facial-expression-recognition-deploy.ipynb)." 
] }, { diff --git a/include/onnxruntime/core/platform/EigenNonBlockingThreadPool.h b/include/onnxruntime/core/platform/EigenNonBlockingThreadPool.h index f9b694efb936f..e33007102e198 100644 --- a/include/onnxruntime/core/platform/EigenNonBlockingThreadPool.h +++ b/include/onnxruntime/core/platform/EigenNonBlockingThreadPool.h @@ -1129,7 +1129,7 @@ class ThreadPoolTempl : public onnxruntime::concurrency::ExtendedThreadPoolInter // // Ensure that the ThreadPoolParallelSection has sufficient workers to // execute a loop with degree of parallelism n. We track the number - // of workers already avaiable to the parallel section, prior to + // of workers already available to the parallel section, prior to // submitting tasks to the work queues to make up the total. // // Each worker will call in to worker_fn(idx) with a per-worker thread diff --git a/include/onnxruntime/core/providers/cuda/cuda_context.h b/include/onnxruntime/core/providers/cuda/cuda_context.h index 9ada01673d4d9..462b31bb433a5 100644 --- a/include/onnxruntime/core/providers/cuda/cuda_context.h +++ b/include/onnxruntime/core/providers/cuda/cuda_context.h @@ -53,7 +53,8 @@ struct CudaContext : public CustomOpContext { cudnn_conv_use_max_workspace = FetchResource(kernel_ctx, CudaResource::cudnn_conv_use_max_workspace_t); cudnn_conv1d_pad_to_nc1d = FetchResource(kernel_ctx, CudaResource::cudnn_conv1d_pad_to_nc1d_t); - enable_skip_layer_norm_strict_mode = FetchResource(kernel_ctx, CudaResource::enable_skip_layer_norm_strict_mode_t); + enable_skip_layer_norm_strict_mode = FetchResource( + kernel_ctx, CudaResource::enable_skip_layer_norm_strict_mode_t); prefer_nhwc = FetchResource(kernel_ctx, CudaResource::prefer_nhwc_t); use_tf32 = FetchResource(kernel_ctx, CudaResource::use_tf32_t); } @@ -61,13 +62,16 @@ struct CudaContext : public CustomOpContext { template T FetchResource(const OrtKernelContext& kernel_ctx, CudaResource resource_type) { if constexpr (sizeof(T) > sizeof(void*)) { - ORT_CXX_API_THROW("void* is not large enough to hold resource type: " + std::to_string(resource_type), OrtErrorCode::ORT_INVALID_ARGUMENT); + ORT_CXX_API_THROW("void* is not large enough to hold resource type: " + std::to_string(resource_type), + OrtErrorCode::ORT_INVALID_ARGUMENT); } const auto& ort_api = Ort::GetApi(); void* resource = {}; - OrtStatus* status = ort_api.KernelContext_GetResource(&kernel_ctx, ORT_CUDA_RESOUCE_VERSION, resource_type, &resource); + OrtStatus* status = ort_api.KernelContext_GetResource( + &kernel_ctx, ORT_CUDA_RESOURCE_VERSION, resource_type, &resource); if (status) { - ORT_CXX_API_THROW("Failed to fetch cuda ep resource, resouce type: " + std::to_string(resource_type), OrtErrorCode::ORT_RUNTIME_EXCEPTION); + ORT_CXX_API_THROW("Failed to fetch cuda ep resource, resource type: " + std::to_string(resource_type), + OrtErrorCode::ORT_RUNTIME_EXCEPTION); } T t = {}; memcpy(&t, &resource, sizeof(T)); diff --git a/include/onnxruntime/core/providers/cuda/cuda_resource.h b/include/onnxruntime/core/providers/cuda/cuda_resource.h index 00e7dec5727d1..555023c442c01 100644 --- a/include/onnxruntime/core/providers/cuda/cuda_resource.h +++ b/include/onnxruntime/core/providers/cuda/cuda_resource.h @@ -3,7 +3,7 @@ #include "core/providers/resource.h" -#define ORT_CUDA_RESOUCE_VERSION 3 +#define ORT_CUDA_RESOURCE_VERSION 3 enum CudaResource : int { cuda_stream_t = cuda_resource_offset, // 10000 diff --git a/include/onnxruntime/core/providers/rocm/rocm_context.h b/include/onnxruntime/core/providers/rocm/rocm_context.h index 
5f04289a8c6e0..f187e0cbb3a89 100644 --- a/include/onnxruntime/core/providers/rocm/rocm_context.h +++ b/include/onnxruntime/core/providers/rocm/rocm_context.h @@ -23,21 +23,24 @@ struct RocmContext : public CustomOpContext { void* resource = {}; OrtStatus* status = nullptr; - status = ort_api.KernelContext_GetResource(&kernel_ctx, ORT_ROCM_RESOUCE_VERSION, RocmResource::hip_stream_t, &resource); + status = ort_api.KernelContext_GetResource( + &kernel_ctx, ORT_ROCM_RESOURCE_VERSION, RocmResource::hip_stream_t, &resource); if (status) { ORT_CXX_API_THROW("failed to fetch hip stream", OrtErrorCode::ORT_RUNTIME_EXCEPTION); } hip_stream = reinterpret_cast(resource); resource = {}; - status = ort_api.KernelContext_GetResource(&kernel_ctx, ORT_ROCM_RESOUCE_VERSION, RocmResource::miopen_handle_t, &resource); + status = ort_api.KernelContext_GetResource( + &kernel_ctx, ORT_ROCM_RESOURCE_VERSION, RocmResource::miopen_handle_t, &resource); if (status) { ORT_CXX_API_THROW("failed to fetch miopen handle", OrtErrorCode::ORT_RUNTIME_EXCEPTION); } miopen_handle = reinterpret_cast(resource); resource = {}; - status = ort_api.KernelContext_GetResource(&kernel_ctx, ORT_ROCM_RESOUCE_VERSION, RocmResource::rocblas_handle_t, &resource); + status = ort_api.KernelContext_GetResource( + &kernel_ctx, ORT_ROCM_RESOURCE_VERSION, RocmResource::rocblas_handle_t, &resource); if (status) { ORT_CXX_API_THROW("failed to fetch rocblas handle", OrtErrorCode::ORT_RUNTIME_EXCEPTION); } diff --git a/include/onnxruntime/core/providers/rocm/rocm_resource.h b/include/onnxruntime/core/providers/rocm/rocm_resource.h index 53f26c13e93e0..772447a1809d8 100644 --- a/include/onnxruntime/core/providers/rocm/rocm_resource.h +++ b/include/onnxruntime/core/providers/rocm/rocm_resource.h @@ -3,7 +3,7 @@ #include "core/providers/resource.h" -#define ORT_ROCM_RESOUCE_VERSION 1 +#define ORT_ROCM_RESOURCE_VERSION 1 enum RocmResource : int { hip_stream_t = rocm_resource_offset, diff --git a/include/onnxruntime/core/session/onnxruntime_c_api.h b/include/onnxruntime/core/session/onnxruntime_c_api.h index 5c61963a2f39c..5aafdd149e889 100644 --- a/include/onnxruntime/core/session/onnxruntime_c_api.h +++ b/include/onnxruntime/core/session/onnxruntime_c_api.h @@ -473,13 +473,13 @@ typedef struct OrtCUDAProviderOptions { /** \brief Enable TunableOp for using. * Set it to 1/0 to enable/disable TunableOp. Otherwise, it is disabled by default. - * This option can be overriden by environment variable ORT_CUDA_TUNABLE_OP_ENABLE. + * This option can be overridden by environment variable ORT_CUDA_TUNABLE_OP_ENABLE. */ int tunable_op_enable; /** \brief Enable TunableOp for tuning. * Set it to 1/0 to enable/disable TunableOp tuning. Otherwise, it is disabled by default. - * This option can be overriden by environment variable ORT_CUDA_TUNABLE_OP_TUNING_ENABLE. + * This option can be overridden by environment variable ORT_CUDA_TUNABLE_OP_TUNING_ENABLE. */ int tunable_op_tuning_enable; @@ -562,13 +562,13 @@ typedef struct OrtROCMProviderOptions { /** \brief Enable TunableOp for using. * Set it to 1/0 to enable/disable TunableOp. Otherwise, it is disabled by default. - * This option can be overriden by environment variable ORT_ROCM_TUNABLE_OP_ENABLE. + * This option can be overridden by environment variable ORT_ROCM_TUNABLE_OP_ENABLE. */ int tunable_op_enable; /** \brief Enable TunableOp for tuning. * Set it to 1/0 to enable/disable TunableOp tuning. Otherwise, it is disabled by default. 
- * This option can be overriden by environment variable ORT_ROCM_TUNABLE_OP_TUNING_ENABLE. + * This option can be overridden by environment variable ORT_ROCM_TUNABLE_OP_TUNING_ENABLE. */ int tunable_op_tuning_enable; @@ -2798,7 +2798,7 @@ struct OrtApi { * "initial_growth_chunk_size_bytes": (Possible) Size of the second allocation in the arena. * Only relevant if arena strategy is `kNextPowerOfTwo`. Use -1 to allow ORT to choose the default. * "max_power_of_two_extend_bytes": The maximum enxtend size if arena strategy is `kNextPowerOfTwo`. - * It is not an allocation limit, it is only a limit for extention when requested byte is less than the limit. + * It is not an allocation limit, it is only a limit for extension when requested byte is less than the limit. * When requested bytes is more than the limit, allocator will still return as requested. * Use -1 to allow ORT to choose the default 1GB for max_power_of_two_extend_bytes. * Ultimately, the allocation size is determined by the allocation memory request. @@ -4467,13 +4467,14 @@ struct OrtApi { * E.g. a cuda stream or a cublas handle * * \param context - Kernel context - * \param resouce_version - Version of the resource + * \param resource_version - Version of the resource * \param resource_id - Type of resource * \param resource - A pointer to returned resource * * \since Version 1.16. */ - ORT_API2_STATUS(KernelContext_GetResource, _In_ const OrtKernelContext* context, _In_ int resouce_version, _In_ int resource_id, _Outptr_ void** resource); + ORT_API2_STATUS(KernelContext_GetResource, _In_ const OrtKernelContext* context, _In_ int resource_version, + _In_ int resource_id, _Outptr_ void** resource); /** \brief Set user logging function * @@ -4528,10 +4529,10 @@ struct OrtApi { ORT_API2_STATUS(ShapeInferContext_GetAttribute, _In_ const OrtShapeInferContext* context, _In_ const char* attr_name, _Outptr_ const OrtOpAttr** attr); /** - * Set type and shape info of an ouput + * Set type and shape info of an output * * \param[in] context - * \param[in] index The index of the ouput + * \param[in] index The index of the output * \param[out] info Type shape info of the output * * \since Version 1.17. diff --git a/include/onnxruntime/core/session/onnxruntime_lite_custom_op.h b/include/onnxruntime/core/session/onnxruntime_lite_custom_op.h index ee60f25da115e..57a64380faeb0 100644 --- a/include/onnxruntime/core/session/onnxruntime_lite_custom_op.h +++ b/include/onnxruntime/core/session/onnxruntime_lite_custom_op.h @@ -403,7 +403,7 @@ using Variadic = TensorArray; Note: OrtLiteCustomOp inherits from OrtCustomOp to bridge tween a custom func/struct and ort core. The lifetime of an OrtLiteCustomOp instance is managed by customer code, not ort, so: -1. DO NOT cast OrtLiteCustomOp to OrtCustomOp and release since there is no virtual destructor in the hierachy. +1. DO NOT cast OrtLiteCustomOp to OrtCustomOp and release since there is no virtual destructor in the hierarchy. 2. OrtLiteCustomFunc and OrtLiteCustomStruct, as two sub-structs, can be released in form of OrtLiteCustomOp since all members are kept in the OrtLiteCustomOp, hence memory could still be recycled properly. Further, OrtCustomOp is a c struct bearing no v-table, so offspring structs are by design to be of zero virtual functions to maintain cast safety. 
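As an aside on the resource-fetching API documented in the onnxruntime_c_api.h hunk above: the sketch below (not part of any patch in this series) shows how a custom-op kernel might query the CUDA stream through `KernelContext_GetResource`, using the renamed `ORT_CUDA_RESOURCE_VERSION` macro. The include paths and the fallback behavior on error are assumptions for illustration.

```
// Illustrative sketch only: fetch the CUDA stream registered for this kernel invocation,
// mirroring what CudaContext::Init does in the cuda_context.h hunk above.
// Include paths are assumptions; adjust to wherever these headers are installed.
#include "onnxruntime_c_api.h"
#include "core/providers/cuda/cuda_resource.h"

void* TryFetchCudaStream(const OrtKernelContext* kernel_ctx) {
  const OrtApi* api = OrtGetApiBase()->GetApi(ORT_API_VERSION);
  void* resource = nullptr;
  // Note the renamed macro: ORT_CUDA_RESOURCE_VERSION (formerly ORT_CUDA_RESOUCE_VERSION).
  OrtStatus* status = api->KernelContext_GetResource(
      kernel_ctx, ORT_CUDA_RESOURCE_VERSION, CudaResource::cuda_stream_t, &resource);
  if (status != nullptr) {
    api->ReleaseStatus(status);  // no CUDA stream available (e.g. running on the CPU EP)
    return nullptr;
  }
  return resource;  // reinterpret_cast to cudaStream_t at the call site
}
```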
diff --git a/java/build.gradle b/java/build.gradle index 3219b082994ff..8b4d5429b0f70 100644 --- a/java/build.gradle +++ b/java/build.gradle @@ -54,7 +54,7 @@ java { targetCompatibility = JavaVersion.VERSION_1_8 } -// This jar tasks serves as a CMAKE signalling +// This jar tasks serves as a CMAKE signaling // mechanism. The jar will be overwritten by allJar task jar { } diff --git a/java/src/main/java/ai/onnxruntime/OnnxRuntime.java b/java/src/main/java/ai/onnxruntime/OnnxRuntime.java index f552badd4f83e..b80debdde47c4 100644 --- a/java/src/main/java/ai/onnxruntime/OnnxRuntime.java +++ b/java/src/main/java/ai/onnxruntime/OnnxRuntime.java @@ -438,7 +438,7 @@ private static String mapLibraryName(String library) { /** * Extracts the providers array from the C API, converts it into an EnumSet. * - *
<p>
Throws IllegalArgumentException if a provider isn't recognised (note this exception should + *
<p>
Throws IllegalArgumentException if a provider isn't recognized (note this exception should * only happen during development of ONNX Runtime, if it happens at any other point, file an issue * on GitHub). * diff --git a/java/src/main/java/ai/onnxruntime/providers/package-info.java b/java/src/main/java/ai/onnxruntime/providers/package-info.java index 1f1e70a589f3a..33c24c6139f52 100644 --- a/java/src/main/java/ai/onnxruntime/providers/package-info.java +++ b/java/src/main/java/ai/onnxruntime/providers/package-info.java @@ -3,5 +3,5 @@ * Licensed under the MIT License. */ -/** Classes for controlling the behaviour of ONNX Runtime Execution Providers. */ +/** Classes for controlling the behavior of ONNX Runtime Execution Providers. */ package ai.onnxruntime.providers; diff --git a/java/src/test/java/sample/ScoreMNIST.java b/java/src/test/java/sample/ScoreMNIST.java index 6ecbc5cd56d10..efc7ef9fd6e47 100644 --- a/java/src/test/java/sample/ScoreMNIST.java +++ b/java/src/test/java/sample/ScoreMNIST.java @@ -242,7 +242,7 @@ public static void writeDataSKL(float[][] data, int[] indices, float[] values) { /** * Find the maximum probability and return it's index. * - * @param probabilities The probabilites. + * @param probabilities The probabilities. * @return The index of the max. */ public static int pred(float[] probabilities) { diff --git a/js/web/lib/onnxjs/backends/webgl/glsl-coordinate-lib.ts b/js/web/lib/onnxjs/backends/webgl/glsl-coordinate-lib.ts index 1f2b27c7bdea8..717233182ed8a 100644 --- a/js/web/lib/onnxjs/backends/webgl/glsl-coordinate-lib.ts +++ b/js/web/lib/onnxjs/backends/webgl/glsl-coordinate-lib.ts @@ -1234,7 +1234,7 @@ export class CoordsGlslLib extends GlslLib { } /** - * This is the main function to map from the given texture coordiantes (s,t) + * This is the main function to map from the given texture coordinates (s,t) * to logical indices for the output * There will only be one single variation of this * Also see coordsToOffset and offsetToIndices for input-specific versions diff --git a/js/web/lib/onnxjs/backends/webgl/ops/pack.ts b/js/web/lib/onnxjs/backends/webgl/ops/pack.ts index 42a275a96fb8a..37ef8c8fe2435 100644 --- a/js/web/lib/onnxjs/backends/webgl/ops/pack.ts +++ b/js/web/lib/onnxjs/backends/webgl/ops/pack.ts @@ -85,7 +85,7 @@ function getOutOfBoundsCondition(rank: number, shape: readonly number[], dims: s } /** - * code snippet to sample input texture with output coordiantes + * code snippet to sample input texture with output coordinates */ function getOutput(shape: readonly number[], dims: string[]): string { const rank = shape.length; diff --git a/onnxruntime/contrib_ops/cpu/attnlstm/deep_cpu_attn_lstm.h b/onnxruntime/contrib_ops/cpu/attnlstm/deep_cpu_attn_lstm.h index 326b2d8dc4925..bce8fd118e957 100644 --- a/onnxruntime/contrib_ops/cpu/attnlstm/deep_cpu_attn_lstm.h +++ b/onnxruntime/contrib_ops/cpu/attnlstm/deep_cpu_attn_lstm.h @@ -19,7 +19,7 @@ using onnxruntime::rnn::detail::Direction; using onnxruntime::rnn::detail::MakeDirection; // The class represents DeepCPU implementation of a long short term memory (LSTM) plus a Bahdanau Attention wraper. -// The equivilent python usage could be checked int the corresponding op test directory, attention_lstm_data_gen.py. +// The equivalent python usage could be checked int the corresponding op test directory, attention_lstm_data_gen.py. // Also please note that detail implementation re-used lot of code from current ONNXRuntime LSTM operator, refactor // is needed in future if this is become part of ONNX. 
class DeepCpuAttnLstmOp final : public OpKernel { diff --git a/onnxruntime/contrib_ops/cpu/transformers/sampling_cpu_helper.h b/onnxruntime/contrib_ops/cpu/transformers/sampling_cpu_helper.h index 413ef596cd118..2f41746c1d4e7 100644 --- a/onnxruntime/contrib_ops/cpu/transformers/sampling_cpu_helper.h +++ b/onnxruntime/contrib_ops/cpu/transformers/sampling_cpu_helper.h @@ -152,7 +152,7 @@ Status Sample(AllocatorPtr& allocator, 1, generator, *sampled_idx)); - // TODO: update presense_mask() + // TODO: update presence_mask() #ifdef DEBUG_GENERATION dumper->Print("sampled_idx", *sampled_idx); #endif diff --git a/onnxruntime/core/codegen/common/common.cc b/onnxruntime/core/codegen/common/common.cc index c2ae4ddba584e..818b919e99ef2 100644 --- a/onnxruntime/core/codegen/common/common.cc +++ b/onnxruntime/core/codegen/common/common.cc @@ -159,7 +159,7 @@ std::unique_ptr ToCapacity(const onnxruntime::GraphViewer& gr ORT_THROW_IF_ERROR(node.ForEachWithIndex(node.ImplicitInputDefs(), process_input_fn)); // Handle outouts - // two cases are considerd as outputs + // two cases are considered as outputs // 1. Output NodeArg is not used by any Node // 2. Output NodeArg is used by at least one Node out of this subgraph. // Note a NodeArg can be used by Nodes in and out of the subgraph at the same time. diff --git a/onnxruntime/core/codegen/mti/common.h b/onnxruntime/core/codegen/mti/common.h index 87bce55715ee1..d71e740b9284a 100644 --- a/onnxruntime/core/codegen/mti/common.h +++ b/onnxruntime/core/codegen/mti/common.h @@ -8,7 +8,7 @@ #define MTI_ASSERT(condition) \ if (!(condition)) { \ - std::string error_msg = "Not satsified: " #condition \ + std::string error_msg = "Not satisfied: " #condition \ ": line " + \ std::to_string(__LINE__) + \ " in file " + std::string(__FILE__) + "\n"; \ diff --git a/onnxruntime/core/codegen/passes/scheduler/schedule_utils.cc b/onnxruntime/core/codegen/passes/scheduler/schedule_utils.cc index 3595229bbe132..76c2ad509c401 100644 --- a/onnxruntime/core/codegen/passes/scheduler/schedule_utils.cc +++ b/onnxruntime/core/codegen/passes/scheduler/schedule_utils.cc @@ -74,7 +74,7 @@ bool ShouldTryVectorization( // Check the schedule of tensor // If it is not scheduled, try to vectorize it. // Note TryVectorization has to use with compute_root. -// Therefore, there is a safty check of tensor's schedule +// Therefore, there is a safety check of tensor's schedule bool TryVectorization( const tvm::Tensor& tensor, int64_t natural_vector_size, @@ -124,7 +124,7 @@ bool TryVectorization( // Check the schedule of tensor // If it is not scheduled, try to add compute_inline on it. // Note TryInlineSchedule cannot be used with compute_root. -// Therefore, there is a safty check of tensor's schedule. +// Therefore, there is a safety check of tensor's schedule. bool TryInlineSchedule( const tvm::Tensor& tensor, ScheduleContext& ctx) { diff --git a/onnxruntime/core/codegen/passes/scheduler/schedule_utils.h b/onnxruntime/core/codegen/passes/scheduler/schedule_utils.h index 757366b551cf8..4a0781f94d385 100644 --- a/onnxruntime/core/codegen/passes/scheduler/schedule_utils.h +++ b/onnxruntime/core/codegen/passes/scheduler/schedule_utils.h @@ -34,7 +34,7 @@ bool ShouldTryVectorization( // Check the schedule of tensor // If it is not scheduled, try to vectorize it. // Note TryVectorization has to use with compute_root. 
-// Therefore, there is a safty check of tensor's schedule +// Therefore, there is a safety check of tensor's schedule bool TryVectorization( const tvm::Tensor& tensor, int64_t natural_vector_size, @@ -43,7 +43,7 @@ bool TryVectorization( // Check the schedule of tensor // If it is not scheduled, try to add compute_inline on it. // Note TryInlineSchedule cannot be used with compute_root. -// Therefore, there is a safty check of tensor's schedule. +// Therefore, there is a safety check of tensor's schedule. bool TryInlineSchedule( const tvm::Tensor& tensor, ScheduleContext& ctx); diff --git a/onnxruntime/core/codegen/passes/scheduler/tvm_schedule_builder.cc b/onnxruntime/core/codegen/passes/scheduler/tvm_schedule_builder.cc index 6f0ffa14e8abb..2c8250198fa5f 100644 --- a/onnxruntime/core/codegen/passes/scheduler/tvm_schedule_builder.cc +++ b/onnxruntime/core/codegen/passes/scheduler/tvm_schedule_builder.cc @@ -39,7 +39,7 @@ void TVMScheduleBuilder::DumpAllSchedulers() const { d->ForEach([&stream](const std::string& key, Scheduler* op) { stream << "Key " << key - << ", Creater " << op->Name() << std::endl; + << ", Creator " << op->Name() << std::endl; }); ++count; diff --git a/onnxruntime/core/codegen/passes/weight_layout/weight_layout.h b/onnxruntime/core/codegen/passes/weight_layout/weight_layout.h index af61641a74937..1b45a38e7e24e 100644 --- a/onnxruntime/core/codegen/passes/weight_layout/weight_layout.h +++ b/onnxruntime/core/codegen/passes/weight_layout/weight_layout.h @@ -13,7 +13,7 @@ namespace tvm_codegen { using CoordTransFunc = std::function(const tvm::Array&)>; -// WeightLayout is data layout trasnformer for weight/initializer +// WeightLayout is data layout transformer for weight/initializer class WeightLayout { public: // Static function to return unique string as a key diff --git a/onnxruntime/core/common/logging/logging.cc b/onnxruntime/core/common/logging/logging.cc index ad6f666a2d989..a086c90ea4b14 100644 --- a/onnxruntime/core/common/logging/logging.cc +++ b/onnxruntime/core/common/logging/logging.cc @@ -56,7 +56,7 @@ LoggingManager* LoggingManager::GetDefaultInstance() { return static_cast(DefaultLoggerManagerInstance().load()); } -// GSL_SUPRESS(i.22) is broken. Ignore the warnings for the static local variables that are trivial +// GSL_SUPPRESS(i.22) is broken. Ignore the warnings for the static local variables that are trivial // and should not have any destruction order issues via pragmas instead. // https://developercommunity.visualstudio.com/content/problem/249706/gslsuppress-does-not-work-for-i22-c-core-guideline.html #ifdef _MSC_VER diff --git a/onnxruntime/core/common/status.cc b/onnxruntime/core/common/status.cc index 4ffc7adaac88d..e824a66eaed58 100644 --- a/onnxruntime/core/common/status.cc +++ b/onnxruntime/core/common/status.cc @@ -70,7 +70,7 @@ std::string Status::ToString() const { return result; } -// GSL_SUPRESS(i.22) is broken. Ignore the warnings for the static local variables that are trivial +// GSL_SUPPRESS(i.22) is broken. Ignore the warnings for the static local variables that are trivial // and should not have any destruction order issues via pragmas instead. 
// https://developercommunity.visualstudio.com/content/problem/249706/gslsuppress-does-not-work-for-i22-c-core-guideline.html #ifdef _MSC_VER diff --git a/onnxruntime/core/framework/allocation_planner.cc b/onnxruntime/core/framework/allocation_planner.cc index 7747058f0d0aa..5dca4cf6c165b 100644 --- a/onnxruntime/core/framework/allocation_planner.cc +++ b/onnxruntime/core/framework/allocation_planner.cc @@ -1073,7 +1073,7 @@ class PlannerImpl { #ifdef ORT_ENABLE_STREAM // assume we already have a baseline reuse plan (no memory reuse at all) - // this funciton will optimize the plan by building a reuse plan with stream safety. + // this function will optimize the plan by building a reuse plan with stream safety. Status OptimizeReusePlanForMultiStream() { InlinedHashMap dependent_counter; for (const auto& it : dependence_graph_) { @@ -2012,7 +2012,7 @@ class PlannerImpl { for (auto* output : node->OutputDefs()) { if (output->Exists()) { if (std::find(it->InputDefs().begin(), it->InputDefs().end(), output) != it->InputDefs().end()) { - output_consumed_in_subgraph = false; // output direclty consumed in current graph + output_consumed_in_subgraph = false; // output directly consumed in current graph OrtValueIndex output_arg_idx; ORT_THROW_IF_ERROR(ort_value_name_idx_map_.GetIdx(output->Name(), output_arg_idx)); // there are two cases we need notification: diff --git a/onnxruntime/core/framework/allocation_planner.h b/onnxruntime/core/framework/allocation_planner.h index 10ea5920b8809..aa62f218d9ff6 100644 --- a/onnxruntime/core/framework/allocation_planner.h +++ b/onnxruntime/core/framework/allocation_planner.h @@ -53,7 +53,7 @@ class SequentialPlannerContext : public ISequentialPlannerContext { public: SequentialPlannerContext(ExecutionMode execution_mode, ExecutionOrder execution_order, bool enable_memory_reuse) : execution_mode_(execution_mode), - exection_order_(execution_order), + execution_order_(execution_order), enable_memory_reuse_(enable_memory_reuse) { } @@ -63,13 +63,13 @@ class SequentialPlannerContext : public ISequentialPlannerContext { bool IsParallelExecutionEnabled() const override { return execution_mode_ == ExecutionMode::ORT_PARALLEL; } - ExecutionOrder GetExecutionOrder() const override { return exection_order_; } + ExecutionOrder GetExecutionOrder() const override { return execution_order_; } bool GetEnableMemoryReuse() const override { return enable_memory_reuse_; } private: ExecutionMode execution_mode_ = ExecutionMode::ORT_SEQUENTIAL; - ExecutionOrder exection_order_ = ExecutionOrder::DEFAULT; + ExecutionOrder execution_order_ = ExecutionOrder::DEFAULT; bool enable_memory_reuse_ = true; }; diff --git a/onnxruntime/core/framework/device_stream_collection.cc b/onnxruntime/core/framework/device_stream_collection.cc index 13948289e1c37..8d15e03c2e5ce 100644 --- a/onnxruntime/core/framework/device_stream_collection.cc +++ b/onnxruntime/core/framework/device_stream_collection.cc @@ -93,7 +93,8 @@ class DeviceStreamCollectionImpl { const AllocatorMap& allocators_; bool is_main_graph_ = false; // This is used in ExecutionFrame when memory pattern is enabled, to allocate the peak size memory - // labelled this stream in the current thread, instead of the default stream which will be used in all the threads (thus caused thread safe issue) + // labeled this stream in the current thread, instead of the default stream which will be used in all the threads + // (thus caused thread safe issue) std::unique_ptr root_stream_; OrtDevice root_stream_device_; void 
ReleaseSingleStreamBuffers(); diff --git a/onnxruntime/core/framework/execution_frame.h b/onnxruntime/core/framework/execution_frame.h index 18d210ffd48f7..de571f86f1c77 100644 --- a/onnxruntime/core/framework/execution_frame.h +++ b/onnxruntime/core/framework/execution_frame.h @@ -167,7 +167,7 @@ class ExecutionFrame final : public IExecutionFrame { } // This function try retrieve the inferred shapes for the given NodeArg index. - // If the retrival is sucessful, this function returns true and false otherwise. + // If the retrival is successful, this function returns true and false otherwise. bool TryGetInferredShape(int index, TensorShape& shape) const override; #if !defined(ORT_MINIMAL_BUILD) && defined(ORT_MEMORY_PROFILE) diff --git a/onnxruntime/core/framework/partial_graph_execution_state.cc b/onnxruntime/core/framework/partial_graph_execution_state.cc index a053634adbe35..ce0572927d94a 100644 --- a/onnxruntime/core/framework/partial_graph_execution_state.cc +++ b/onnxruntime/core/framework/partial_graph_execution_state.cc @@ -50,7 +50,7 @@ PartialGraphExecutionState::~PartialGraphExecutionState() { DeviceStreamCollection* PartialGraphExecutionState::GetDeviceStreamCollection(const SessionState& session_state) { if (device_stream_collection_ == nullptr) { device_stream_collection_ = session_state.AcquireDeviceStreamCollection(); - // the life-time of partial graph execution state is in-consistant with session, + // the life-time of partial graph execution state is inconsistent with session, // so we can't make sure it is safe to return the device stream collection to // session when deconstruct partial graph execution state. // so let's always delete the stream collections. diff --git a/onnxruntime/core/framework/sequential_execution_plan.h b/onnxruntime/core/framework/sequential_execution_plan.h index 62c66bc6f336c..d9472e404c0e4 100644 --- a/onnxruntime/core/framework/sequential_execution_plan.h +++ b/onnxruntime/core/framework/sequential_execution_plan.h @@ -106,7 +106,7 @@ struct SequentialExecutionPlan : public ExecutionPlanBase { // types of steps: // 1. Kernel Launch // 2. Activate notification - // 3. Wait on a notificaiton + // 3. Wait on a notification class ExecutionStep { public: ExecutionStep(NodeIndex node_index) : node_index_(node_index) {} @@ -122,7 +122,7 @@ struct SequentialExecutionPlan : public ExecutionPlanBase { protected: NodeIndex node_index_; }; - // LogicStream is a sequence of execution steps that can be executed independetly. + // LogicStream is a sequence of execution steps that can be executed independently. // The steps within a sequence are executed in order, and happened on the same device. struct LogicStream { std::vector> steps_; @@ -160,7 +160,7 @@ struct SequentialExecutionPlan : public ExecutionPlanBase { std::vector notification_owners; // key: notification index. // value: {stream_idx, step_idx} - // giving a notificaiton, we used this map to figure out what is the downstream steps it need to trigger. + // giving a notification, we used this map to figure out what is the downstream steps it need to trigger. 
InlinedHashMap>> downstream_map; size_t num_barriers{0}; diff --git a/onnxruntime/core/framework/sequential_executor.cc b/onnxruntime/core/framework/sequential_executor.cc index a374e381a2b0e..aa762ca32fdb4 100644 --- a/onnxruntime/core/framework/sequential_executor.cc +++ b/onnxruntime/core/framework/sequential_executor.cc @@ -442,7 +442,7 @@ onnxruntime::Status ExecuteKernel(StreamExecutionContext& ctx, if (p_kernel->KernelDef().OpName() == "YieldOp") { // Do not execute YieldOp (it is an no-op anyways). // Decrement the reference count of tensors that are not needed beyond this point. - // REVEIW(codemzs): The current model assumes the intermediate tensors that are exported + // REVIEW(codemzs): The current model assumes the intermediate tensors that are exported // as graph outputs are owned by ORT, the risk of caller freeing the tensor or manipulating tensor // memory lingers while the tensor is used downstream after the export. ctx.RecycleNodeInputs(idx); diff --git a/onnxruntime/core/framework/session_options.h b/onnxruntime/core/framework/session_options.h index 46bfc3630303c..8d4db36106f28 100644 --- a/onnxruntime/core/framework/session_options.h +++ b/onnxruntime/core/framework/session_options.h @@ -62,7 +62,7 @@ enum class ExecutionPriority : int { struct FreeDimensionOverride { std::string dim_identifier; - FreeDimensionOverrideType dim_identifer_type; + FreeDimensionOverrideType dim_identifier_type; int64_t dim_value; }; diff --git a/onnxruntime/core/framework/session_state.cc b/onnxruntime/core/framework/session_state.cc index 42fb7b392283a..a88f36f63639c 100644 --- a/onnxruntime/core/framework/session_state.cc +++ b/onnxruntime/core/framework/session_state.cc @@ -22,9 +22,9 @@ using namespace ::onnxruntime::common; namespace onnxruntime { #ifdef ORT_ENABLE_STREAM -static inline std::string GetWaitKey(const OrtDevice::DeviceType notificaiton_device_type, +static inline std::string GetWaitKey(const OrtDevice::DeviceType notification_device_type, const OrtDevice::DeviceType executor_device_type) { - return std::to_string(notificaiton_device_type) + ":" + std::to_string(executor_device_type); + return std::to_string(notification_device_type) + ":" + std::to_string(executor_device_type); } class StreamCommandHandleRegistryImpl : public IStreamCommandHandleRegistry { diff --git a/onnxruntime/core/framework/sparse_tensor.cc b/onnxruntime/core/framework/sparse_tensor.cc index a3bcea4762d3e..4e40e3dd81ca2 100644 --- a/onnxruntime/core/framework/sparse_tensor.cc +++ b/onnxruntime/core/framework/sparse_tensor.cc @@ -551,7 +551,7 @@ Status SparseTensor::Copy(const IDataTransfer& data_transfer, SparseTensor& dst_ } if (Values().Shape().Size() > 0) { - // This instance may either have a contigious buffer which we can copy in one shot + // This instance may either have a contiguous buffer which we can copy in one shot // or it can point to users buffers, in which case we have to copy each buffer individually // strings can not be memcpyed albeit always on CPU. 
if (p_data_ != nullptr) { @@ -569,7 +569,7 @@ Status SparseTensor::Copy(const IDataTransfer& data_transfer, SparseTensor& dst_ ORT_RETURN_IF_ERROR(data_transfer.CopyTensor(src, dst)); } } else { - // non-contiguos buffer + // non-contiguous buffer if (is_string) { CopyStrings(Values(), result_values); } else { diff --git a/onnxruntime/core/framework/tensorprotoutils.h b/onnxruntime/core/framework/tensorprotoutils.h index aabfc0487f3e0..e5197adcb94ec 100644 --- a/onnxruntime/core/framework/tensorprotoutils.h +++ b/onnxruntime/core/framework/tensorprotoutils.h @@ -151,7 +151,7 @@ common::Status GetExtDataFromTensorProto(const Env& env, const std::filesystem:: // the data location is external. i.e. it does not load the external data. // However if AttributeProto contains SparseTensorProto then it converts the data into dense tensor proto // (including loading external data when applicable). -// model_path is used for contructing full path for external_data +// model_path is used for constructing full path for external_data // tensor_name specifies the name for the new TensorProto TensorProto common::Status ConstantNodeProtoToTensorProto(const ONNX_NAMESPACE::NodeProto& node, const std::filesystem::path& model_path, @@ -165,7 +165,7 @@ common::Status ConstantNodeProtoToTensorProto(const ONNX_NAMESPACE::NodeProto& n // Convert a SparseTensorProto to a dense TensorProto // If the SparseTensorProto contains external data then it loads the data and converts to dense tensor proto // The resulting TensorProto will contain the data as raw data. -// model_path is used for contructing full path for external_data +// model_path is used for constructing full path for external_data common::Status SparseTensorProtoToDenseTensorProto(const ONNX_NAMESPACE::SparseTensorProto& sparse, const std::filesystem::path& model_path, ONNX_NAMESPACE::TensorProto& dense); @@ -174,7 +174,7 @@ common::Status SparseTensorProtoToDenseTensorProto(const ONNX_NAMESPACE::SparseT // Convert a TensorProto to a SparseTensorProto // If the tensorproto contains external data then it loads the data and converts to sparse tensor // The resulting SparseTensorProto will contain the data as raw data -// model_path is used for contructing full path for external_data +// model_path is used for constructing full path for external_data common::Status DenseTensorToSparseTensorProto(const ONNX_NAMESPACE::TensorProto& dense, const std::filesystem::path& model_path, ONNX_NAMESPACE::SparseTensorProto& sparse); diff --git a/onnxruntime/core/framework/utils.h b/onnxruntime/core/framework/utils.h index 17cf9671b70eb..afdb5a2cb27f5 100644 --- a/onnxruntime/core/framework/utils.h +++ b/onnxruntime/core/framework/utils.h @@ -47,7 +47,7 @@ void ConstructStrings(void* p_data, int64_t elements); ///
<summary>
/// Destroy std::string objects in the contiquous chunk of memory -/// by explicitely invoking ~string(); +/// by explicitly invoking ~string(); /// /// /// diff --git a/onnxruntime/core/mickey/gemm/warp/quantb_meta_loader.h b/onnxruntime/core/mickey/gemm/warp/quantb_meta_loader.h index 4a784a1a49109..79c582279f2c8 100644 --- a/onnxruntime/core/mickey/gemm/warp/quantb_meta_loader.h +++ b/onnxruntime/core/mickey/gemm/warp/quantb_meta_loader.h @@ -37,12 +37,12 @@ void weightsMinuEight2Half(uint32_t const &weights, // // For element 0, 1, 4, 5, we have 0x000?000?, set the high bits // to 0x6400, essentially we set the exponent bits to 25, effective - // exp = 25 - 15 = 10, with explicity hight bit, the value is + // exp = 25 - 15 = 10, with explicitly hight bit, the value is // 2^10 + q_w. // // Similarly for element 2, 3, 6, 7, we have 0x00?000?, set the // high bits to 0x5400, essentially we set the exponent bits to 21, - // effective exp = 21 - 15 = 6, with explicity hight bit, the value + // effective exp = 21 - 15 = 6, with explicitly hight bit, the value // is 2^6 + q_w. // // 1.125 instruction per weight, 9 instructions in total. @@ -86,12 +86,12 @@ void weights2Half([[maybe_unused]] uint32_t const &weights, // // For element 0, 1, 4, 5, we have 0x000?000?, set the high bits // to 0x6400, essentially we set the exponent bits to 25, effective - // exp = 25 - 15 = 10, with explicity hight bit, the value is + // exp = 25 - 15 = 10, with explicitly hight bit, the value is // 2^10 + q_w. // // Similarly for element 2, 3, 6, 7, we have 0x00?000?, set the // high bits to 0x5400, essentially we set the exponent bits to 21, - // effective exp = 21 - 15 = 6, with explicity hight bit, the value + // effective exp = 21 - 15 = 6, with explicitly hight bit, the value // is 2^6 + q_w. // // 1.125 instruction per weight, 9 instructions in total. diff --git a/onnxruntime/core/mlas/lib/convolve.cpp b/onnxruntime/core/mlas/lib/convolve.cpp index 5d2c35fbfb406..ec79641559c6b 100644 --- a/onnxruntime/core/mlas/lib/convolve.cpp +++ b/onnxruntime/core/mlas/lib/convolve.cpp @@ -61,7 +61,7 @@ Routine Description: This implementation supports sampling a portion of the convolution patches. This avoids the need to allocate very large buffers to store - all of the convolution patches at once, when the underyling GEMM + all of the convolution patches at once, when the underlying GEMM implementation will already break up the operation into panels. Multiple threads can also be used to process different portions of the image. @@ -267,7 +267,7 @@ Routine Description: This implementation supports sampling a portion of the convolution patches. This avoids the need to allocate very large buffers to store - all of the convolution patches at once, when the underyling GEMM + all of the convolution patches at once, when the underlying GEMM implementation will already break up the operation into panels. Multiple threads can also be used to process different portions of the image. 
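The fp16 exponent trick described in the quantb_meta_loader.h comments above can be checked numerically; the standalone program below (not part of this patch series) decodes the forced-exponent bit patterns and confirms that `0x6400 | q` reads back as 2^10 + q and `0x5400 | (q << 4)` as 2^6 + q.

```
// Sanity check for the quantb_meta_loader.h bit trick: with the exponent field forced to 25
// (0x6400), a 4-bit weight q in mantissa bits 0-3 decodes as 1024 + q; with the exponent
// forced to 21 (0x5400), q in mantissa bits 4-7 decodes as 64 + q.
#include <cmath>
#include <cstdint>
#include <cstdio>

// Decode a normal IEEE-754 binary16 pattern (1 sign, 5 exponent, 10 mantissa bits).
static double HalfBitsToDouble(uint16_t h) {
  const int exponent = (h >> 10) & 0x1F;
  const int mantissa = h & 0x3FF;
  return std::ldexp(1.0 + mantissa / 1024.0, exponent - 15);
}

int main() {
  for (int q = 0; q < 16; ++q) {
    const double low = HalfBitsToDouble(static_cast<uint16_t>(0x6400 | q));          // 1024 + q
    const double high = HalfBitsToDouble(static_cast<uint16_t>(0x5400 | (q << 4)));  // 64 + q
    std::printf("q=%2d  0x6400|q -> %7.1f   0x5400|(q<<4) -> %6.1f\n", q, low, high);
  }
  return 0;
}
```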
diff --git a/onnxruntime/core/optimizer/attention_fusion_helper.h b/onnxruntime/core/optimizer/attention_fusion_helper.h index ca744adddbeec..267a82b72670c 100644 --- a/onnxruntime/core/optimizer/attention_fusion_helper.h +++ b/onnxruntime/core/optimizer/attention_fusion_helper.h @@ -1118,8 +1118,8 @@ bool CheckNodesInPathV(const Graph& graph, const Node& reshape, const Node& tran head_size = v_reshape_shape[3]; // Check reshape for attention output has shape input (0, 0, -1) or (0, 0, N*H) - // In DistilBert, the reshape after qkv paths can not be fused during reshape fusion, so we do not have the correspondig - // initializer. We need to get the shape information from the input of concat. + // In DistilBert, the reshape after qkv paths can not be fused during reshape fusion, so we do not have the + // corresponding initializer. We need to get the shape information from the input of concat. InlinedVector reshape_shape; if (!optimizer_utils::AppendTensorFromInitializer(graph, *(reshape.InputDefs()[1]), reshape_shape)) { if (CheckDistilBertReshapeShape(graph, reshape, hidden_size, record_node_idx, logger)) { diff --git a/onnxruntime/core/optimizer/free_dim_override_transformer.cc b/onnxruntime/core/optimizer/free_dim_override_transformer.cc index 0d162b5238b18..bce73a0dcec45 100644 --- a/onnxruntime/core/optimizer/free_dim_override_transformer.cc +++ b/onnxruntime/core/optimizer/free_dim_override_transformer.cc @@ -22,9 +22,9 @@ FreeDimensionOverrideTransformer::FreeDimensionOverrideTransformer(gsl::span floating point casts could be optimised but this is left to an explicit cast optimisation pass. + // For instance, certain integral -> floating point casts could be optimized but + // this is left to an explicit cast optimisation pass. // The comparison with "InsertedPrecisionFreeCast_" reflects cast nodes that are inserted by InsertCastTransformer. - // Such casts should not be considered as loss of precision - the inserted upcasts (f16 -> f32) and downcasts (f32 -> f16) are inserted to support kernels when on a CPU EP without F16 support. + // Such casts should not be considered as loss of precision - the inserted upcasts (f16 -> f32) and + // downcasts (f32 -> f16) are inserted to support kernels when on a CPU EP without F16 support. auto src_type_group = GetTypeGroup(src_type); auto dst_type_group = GetTypeGroup(dst_type); if (Unknown == src_type_group || Unknown == dst_type_group) { diff --git a/onnxruntime/core/optimizer/transpose_optimization/onnx_transpose_optimization.cc b/onnxruntime/core/optimizer/transpose_optimization/onnx_transpose_optimization.cc index d4ed9c4e26cc6..bdb6a44bddaaf 100644 --- a/onnxruntime/core/optimizer/transpose_optimization/onnx_transpose_optimization.cc +++ b/onnxruntime/core/optimizer/transpose_optimization/onnx_transpose_optimization.cc @@ -1258,7 +1258,7 @@ static int EstimateTransposeValueCost(const api::GraphRef& graph, std::string_vi std::unique_ptr producer_node = graph.GetNodeProducingOutput(input); if (producer_node != nullptr) { - // this handles cancelling out a Transpose or Squeeze added to a shared initializer that was updated + // this handles canceling out a Transpose or Squeeze added to a shared initializer that was updated // by TransposeInputImpl Case 1 or UnqueezeInput Case 1. 
// - if a shared initializer is not broadcast, we have -> Transpose -> DQ // - if a shared initializer is broadcast, we have -> Transpose -> Squeeze -> DQ and need @@ -1992,7 +1992,7 @@ static bool HandleTile(HandlerArgs& args) { constexpr HandlerInfo tile_handler = {&FirstInput, &HandleTile}; -// Helper to remove cancelling Transpose -> Transpose or +// Helper to remove canceling Transpose -> Transpose or // Transpose -> Reshape nodes. static void RemoveCancelingTransposeNodes(HandlerArgs& args) { // Input to 1st transpose diff --git a/onnxruntime/core/providers/acl/nn/batch_norm.cc b/onnxruntime/core/providers/acl/nn/batch_norm.cc index eb6a10074f1db..be0e57c5c0543 100755 --- a/onnxruntime/core/providers/acl/nn/batch_norm.cc +++ b/onnxruntime/core/providers/acl/nn/batch_norm.cc @@ -118,7 +118,7 @@ Status BatchNorm::Compute(OpKernelContext* context) const { ACLImportMemory(tbatch_norm.b->allocator(), (void*)b_data, B->Shape().Size() * 4); ACLImportMemory(tbatch_norm.scale->allocator(), (void*)scale_data, S->Shape().Size() * 4); - // allocate space for input tensor to accomodate paddings and strides + // allocate space for input tensor to accommodate paddings and strides tbatch_norm.in->allocator()->allocate(); tbatch_norm.layer = std::move(layer); diff --git a/onnxruntime/core/providers/acl/nn/pool.cc b/onnxruntime/core/providers/acl/nn/pool.cc index 8fbcba3ed87a7..01d9bc0302c3a 100644 --- a/onnxruntime/core/providers/acl/nn/pool.cc +++ b/onnxruntime/core/providers/acl/nn/pool.cc @@ -121,7 +121,7 @@ ACLNEPool PoolOperation(onnxruntime::OpKernelContext* context, layer->configure(tpool.in.get(), tpool.out.get(), pool_info); } - // allocate space for input tensor to accomodate paddings and strides + // allocate space for input tensor to accommodate paddings and strides tpool.in->allocator()->allocate(); tpool.layer = std::move(layer); diff --git a/onnxruntime/core/providers/armnn/activation/activations.cc b/onnxruntime/core/providers/armnn/activation/activations.cc index 93017c26271f7..7ab7a14f7e206 100644 --- a/onnxruntime/core/providers/armnn/activation/activations.cc +++ b/onnxruntime/core/providers/armnn/activation/activations.cc @@ -56,7 +56,7 @@ Status Relu::Compute(OpKernelContext* context) const { armnn::TensorInfo outputTensorInfo(outputShape, armnn::DataType::Float32); activation->GetOutputSlot(0).SetTensorInfo(outputTensorInfo); - // Optimise ArmNN network + // Optimize ArmNN network armnn::IOptimizedNetworkPtr optNet = armnn::Optimize(*myNetwork, {armnn::Compute::CpuAcc}, Relu::run->GetDeviceSpec()); if (optNet == nullptr) { diff --git a/onnxruntime/core/providers/armnn/math/gemm.h b/onnxruntime/core/providers/armnn/math/gemm.h index 4f77c4afb725a..039a9c3b75adb 100644 --- a/onnxruntime/core/providers/armnn/math/gemm.h +++ b/onnxruntime/core/providers/armnn/math/gemm.h @@ -130,7 +130,7 @@ class Gemm : public onnxruntime::Gemm { armnn::TensorInfo outputTensorInfo(outputShape, armnn::DataType::Float32); fc_armnn->GetOutputSlot(0).SetTensorInfo(outputTensorInfo); - // Optimise ArmNN network + // Optimize ArmNN network armnn::IOptimizedNetworkPtr optNet = armnn::Optimize(*myNetwork, {armnn::Compute::CpuAcc}, Gemm::run->GetDeviceSpec()); if (optNet == nullptr) { diff --git a/onnxruntime/core/providers/armnn/nn/batch_norm.cc b/onnxruntime/core/providers/armnn/nn/batch_norm.cc index e9d8e6fb47852..9a7821d81bdb1 100755 --- a/onnxruntime/core/providers/armnn/nn/batch_norm.cc +++ b/onnxruntime/core/providers/armnn/nn/batch_norm.cc @@ -89,7 +89,7 @@ Status 
BatchNorm::Compute(OpKernelContext* context) const { armnn::TensorInfo outputTensorInfo(outputShape, armnn::DataType::Float32); layer->GetOutputSlot(0).SetTensorInfo(outputTensorInfo); - // Optimise ArmNN network + // Optimize ArmNN network armnn::IOptimizedNetworkPtr optNet = armnn::Optimize(*myNetwork, {armnn::Compute::CpuAcc}, BatchNorm::run->GetDeviceSpec()); if (optNet == nullptr) { diff --git a/onnxruntime/core/providers/armnn/nn/conv.cc b/onnxruntime/core/providers/armnn/nn/conv.cc index 674e927ffc324..db261e67ecd00 100644 --- a/onnxruntime/core/providers/armnn/nn/conv.cc +++ b/onnxruntime/core/providers/armnn/nn/conv.cc @@ -266,7 +266,7 @@ Status Conv::Compute(OpKernelContext* context) const { activation->GetOutputSlot(0).SetTensorInfo(outputTensorInfo); } - // Optimise ArmNN network + // Optimize ArmNN network armnn::IOptimizedNetworkPtr optNet = armnn::Optimize(*myNetwork, {armnn::Compute::CpuAcc}, Conv::run->GetDeviceSpec()); if (optNet == nullptr) { diff --git a/onnxruntime/core/providers/armnn/nn/pool.cc b/onnxruntime/core/providers/armnn/nn/pool.cc index c4eeb17779fcb..9d25b4eed2db4 100644 --- a/onnxruntime/core/providers/armnn/nn/pool.cc +++ b/onnxruntime/core/providers/armnn/nn/pool.cc @@ -161,7 +161,7 @@ Status Pool::Compute(OpKernelContext* context) const { armnn::TensorInfo outputTensorInfo(outputShape, armnn::DataType::Float32); pool_armnn->GetOutputSlot(0).SetTensorInfo(outputTensorInfo); - // Optimise ArmNN network + // Optimize ArmNN network armnn::IOptimizedNetworkPtr optNet = armnn::Optimize(*myNetwork, {armnn::Compute::CpuAcc}, Pool::run->GetDeviceSpec()); if (optNet == nullptr) { @@ -250,7 +250,7 @@ Status MaxPoolV8::Compute(OpKernelContext* context) const { armnn::TensorInfo outputTensorInfo(outputShape, armnn::DataType::Float32); pool_armnn->GetOutputSlot(0).SetTensorInfo(outputTensorInfo); - // Optimise ArmNN network + // Optimize ArmNN network armnn::IOptimizedNetworkPtr optNet = armnn::Optimize(*myNetwork, {armnn::Compute::CpuAcc}, MaxPoolV8::run->GetDeviceSpec()); if (optNet == nullptr) { diff --git a/onnxruntime/core/providers/cpu/math/einsum_utils/einsum_auxiliary_ops.cc b/onnxruntime/core/providers/cpu/math/einsum_utils/einsum_auxiliary_ops.cc index 91e7955aa9fbe..a602a85fc2737 100644 --- a/onnxruntime/core/providers/cpu/math/einsum_utils/einsum_auxiliary_ops.cc +++ b/onnxruntime/core/providers/cpu/math/einsum_utils/einsum_auxiliary_ops.cc @@ -290,9 +290,9 @@ std::unique_ptr Transpose(const Tensor& input, const TensorShape& input_ // and it will de-allocate the memory for this intermediate tensor when it goes out of scope std::unique_ptr output = std::make_unique(input.DataType(), output_dims, allocator); - TensorShape overriden_shape(input_shape_override); + TensorShape overridden_shape(input_shape_override); - auto status = device_transpose_func(permutation, input, *output, &overriden_shape, einsum_cuda_assets); + auto status = device_transpose_func(permutation, input, *output, &overridden_shape, einsum_cuda_assets); if (!status.IsOK()) { ORT_THROW(ONNXRUNTIME, FAIL, "Einsum op: Transpose failed: ", status.ErrorMessage()); diff --git a/onnxruntime/core/providers/cpu/math/einsum_utils/einsum_typed_compute_processor.cc b/onnxruntime/core/providers/cpu/math/einsum_utils/einsum_typed_compute_processor.cc index a362bb06220d8..343ed485a150a 100644 --- a/onnxruntime/core/providers/cpu/math/einsum_utils/einsum_typed_compute_processor.cc +++ b/onnxruntime/core/providers/cpu/math/einsum_utils/einsum_typed_compute_processor.cc @@ -209,7 +209,7 @@ std::unique_ptr 
EinsumTypedComputeProcessor::PairwiseOperandProcess(c if (current_left && IsTransposeReshapeForEinsum(left_permutation, current_left->Shape().GetDims(), reshaped_dims)) { - // This can be done because curent_* tensors (if they exist) and output tensors are + // This can be done because current_* tensors (if they exist) and output tensors are // intermediate tensors and cannot be input tensors to the Einsum node itself // (which are immutable). // Covered by ExplicitEinsumAsTensorContractionReshapeLeft. diff --git a/onnxruntime/core/providers/cpu/object_detection/roialign.cc b/onnxruntime/core/providers/cpu/object_detection/roialign.cc index ac1bb111494fd..ead2ccaef002e 100644 --- a/onnxruntime/core/providers/cpu/object_detection/roialign.cc +++ b/onnxruntime/core/providers/cpu/object_detection/roialign.cc @@ -135,7 +135,7 @@ static void PreCalcForBilinearInterpolate(const int64_t height, const int64_t wi T w3 = ly * hx; T w4 = ly * lx; - // save weights and indeces + // save weights and indices PreCalc pc; pc.pos1 = y_low * width + x_low; pc.pos2 = y_low * width + x_high; diff --git a/onnxruntime/core/providers/cpu/sequence/sequence_ops.cc b/onnxruntime/core/providers/cpu/sequence/sequence_ops.cc index 2913f4ac32b6e..7a27b04ece7cf 100644 --- a/onnxruntime/core/providers/cpu/sequence/sequence_ops.cc +++ b/onnxruntime/core/providers/cpu/sequence/sequence_ops.cc @@ -317,7 +317,7 @@ Status SequenceConstruct::Compute(OpKernelContext* context) const { const auto* X = context->Input(input_idx); if (input_idx > 0 && X->DataType() != first_dtype) { return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, - "Violation of the requirment that all input tensors must have the same data type."); + "Violation of the requirement that all input tensors must have the same data type."); } } diff --git a/onnxruntime/core/providers/cpu/tensor/unique.cc b/onnxruntime/core/providers/cpu/tensor/unique.cc index ab99d87da83fd..92c163a0f08a1 100644 --- a/onnxruntime/core/providers/cpu/tensor/unique.cc +++ b/onnxruntime/core/providers/cpu/tensor/unique.cc @@ -51,7 +51,7 @@ ONNX_OPERATOR_SET_SCHEMA( 1, "indices", "A 1-D INT64 tensor " - "containing indices of 'Y' elements' first occurance in 'X'. " + "containing indices of 'Y' elements' first occurrence in 'X'. " "When 'axis' is provided, it contains indices to subtensors in input 'X' on the 'axis'. " "When 'axis' is not provided, it contains indices to values in the flattened input tensor. ", "tensor(int64)", diff --git a/onnxruntime/core/providers/cuda/cuda_allocator.cc b/onnxruntime/core/providers/cuda/cuda_allocator.cc index 314aa1062f1b0..2189af8e0ee2d 100644 --- a/onnxruntime/core/providers/cuda/cuda_allocator.cc +++ b/onnxruntime/core/providers/cuda/cuda_allocator.cc @@ -60,7 +60,7 @@ void* CUDAExternalAllocator::Alloc(size_t size) { if (size > 0) { p = alloc_(size); - // review(codemzs): ORT_ENFORCE does not seem appropiate. + // review(codemzs): ORT_ENFORCE does not seem appropriate. 
ORT_ENFORCE(p != nullptr); } diff --git a/onnxruntime/core/providers/cuda/cuda_stream_handle.cc b/onnxruntime/core/providers/cuda/cuda_stream_handle.cc index 58e57572131b1..14b75d2383b58 100644 --- a/onnxruntime/core/providers/cuda/cuda_stream_handle.cc +++ b/onnxruntime/core/providers/cuda/cuda_stream_handle.cc @@ -179,7 +179,7 @@ Status CudaStream::CleanUpOnRunEnd() { } void* CudaStream::GetResource(int version, int id) const { - ORT_ENFORCE(version <= ORT_CUDA_RESOUCE_VERSION, "resource version unsupported!"); + ORT_ENFORCE(version <= ORT_CUDA_RESOURCE_VERSION, "resource version unsupported!"); void* resource{}; switch (id) { case CudaResource::cuda_stream_t: diff --git a/onnxruntime/core/providers/cuda/math/softmax_blockwise_impl.cuh b/onnxruntime/core/providers/cuda/math/softmax_blockwise_impl.cuh index 6cb65ea8e739c..8bb87035cdc6d 100644 --- a/onnxruntime/core/providers/cuda/math/softmax_blockwise_impl.cuh +++ b/onnxruntime/core/providers/cuda/math/softmax_blockwise_impl.cuh @@ -30,7 +30,7 @@ dim3 SoftMax_getBlockSize(int ILP, uint64_t dim_size) { uint64_t max_block_size = std::min(dim_size / ILP, static_cast(max_threads)); // In the vectorized case we want to trade off allowing more of the buffers to be accessed - // in a vectorized way against wanting a larger block size to get better utilisation. + // in a vectorized way against wanting a larger block size to get better utilization. // In general with ILP you can have (ILP-1)/ILP of the buffer accessed vectorised, at the risk // of having a very small block size. We choose to keep >= 1/2 of the buffer vectorised while // allowing a larger block size. diff --git a/onnxruntime/core/providers/cuda/nn/conv.cc b/onnxruntime/core/providers/cuda/nn/conv.cc index e05786248cbcf..764feadcf4cb3 100644 --- a/onnxruntime/core/providers/cuda/nn/conv.cc +++ b/onnxruntime/core/providers/cuda/nn/conv.cc @@ -15,7 +15,7 @@ namespace onnxruntime { namespace cuda { // Op Set 11 for Conv only update document to clearify default dilations and strides value. -// which are already convered by op set 11 cpu versoin, so simply add declaration. +// which are already covered by op set 11 cpu version, so simply add declaration. #define REGISTER_KERNEL_TYPED(T, DOMAIN, NHWC) \ ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_EX( \ Conv, \ @@ -269,7 +269,7 @@ Status Conv::UpdateState(OpKernelContext* context, bool bias_expected) // especially for EXHAUSTIVE algo search which may result in a better algo selection. // ORTModule uses different algo search options (HEURISTIC, and use max workspace size) compared to // inference build (EXHAUSTIVE, 32M workspace size). We observed better perf when we pad input shape - // [N,C,D] to [N,C,1,D], expecially on A100, and especially for ConvGrad. + // [N,C,D] to [N,C,1,D], especially on A100, and especially for ConvGrad. // PyTorch also pads to [N,C,1,D]. For inference build, we still pad it to [N, C, D, 1] as this seems // to be the sweet spot for all algo search options: EXHAUSTIVE, HEURISTIC, and DEFAULT. // See PR #7348 and #7702 for more context.
diff --git a/onnxruntime/core/providers/cuda/object_detection/roialign_impl.cu b/onnxruntime/core/providers/cuda/object_detection/roialign_impl.cu index 537ad0a8b9efe..10053c630ab66 100644 --- a/onnxruntime/core/providers/cuda/object_detection/roialign_impl.cu +++ b/onnxruntime/core/providers/cuda/object_detection/roialign_impl.cu @@ -20,7 +20,7 @@ namespace onnxruntime { namespace cuda { - + template __device__ T bilinear_interpolate( const T* bottom_data, @@ -73,8 +73,8 @@ __device__ T bilinear_interpolate( T w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx; T val = is_mode_avg - ? (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4) // mode Avg - : max(max(max(w1 * v1, w2 * v2), w3 * v3), w4 * v4); // mode Max + ? (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4) // mode Avg + : max(max(max(w1 * v1, w2 * v2), w3 * v3), w4 * v4); // mode Max return val; } @@ -116,7 +116,7 @@ __global__ void RoIAlignForward( T roi_width = roi_end_w - roi_start_w; T roi_height = roi_end_h - roi_start_h; - if (!half_pixel) { // backward compatiblity + if (!half_pixel) { // backward compatibility // Force malformed ROIs to be 1x1 roi_width = max(roi_width, (T)1.); roi_height = max(roi_height, (T)1.); @@ -129,29 +129,29 @@ __global__ void RoIAlignForward( // We use roi_bin_grid to sample the grid and mimic integral int roi_bin_grid_h = (sampling_ratio > 0) - ? sampling_ratio - : _Ceil(roi_height / pooled_height); // e.g., = 2 + ? sampling_ratio + : _Ceil(roi_height / pooled_height); // e.g., = 2 int roi_bin_grid_w = (sampling_ratio > 0) ? sampling_ratio : _Ceil(roi_width / pooled_width); // We do average (integral) pooling inside a bin - const T count = roi_bin_grid_h * roi_bin_grid_w; // e.g. = 4 + const T count = roi_bin_grid_h * roi_bin_grid_w; // e.g. = 4 T output_val = 0.; bool max_flag = false; - for (int iy = 0; iy < roi_bin_grid_h; iy++) // e.g., iy = 0, 1 + for (int iy = 0; iy < roi_bin_grid_h; iy++) // e.g., iy = 0, 1 { const T y = roi_start_h + ph * bin_size_h + - static_cast(iy + .5f) * bin_size_h / - static_cast(roi_bin_grid_h); // e.g., 0.5, 1.5 + static_cast(iy + .5f) * bin_size_h / + static_cast(roi_bin_grid_h); // e.g., 0.5, 1.5 for (int ix = 0; ix < roi_bin_grid_w; ix++) { const T x = roi_start_w + pw * bin_size_w + - static_cast(ix + .5f) * bin_size_w / - static_cast(roi_bin_grid_w); + static_cast(ix + .5f) * bin_size_w / + static_cast(roi_bin_grid_w); T val = bilinear_interpolate( offset_bottom_data, height, width, y, x, is_mode_avg, index); - + if (is_mode_avg) { output_val += val; } else { @@ -174,24 +174,24 @@ __global__ void RoIAlignForward( template void RoiAlignImpl( - cudaStream_t stream, - const int64_t nthreads, - const T* bottom_data, - const T spatial_scale, - const int64_t channels, - const int64_t height, - const int64_t width, - const int64_t pooled_height, - const int64_t pooled_width, - const int64_t sampling_ratio, - const T* bottom_rois, - int64_t roi_cols, - T* top_data, - const bool is_mode_avg, - const bool half_pixel, - const int64_t* batch_indices_ptr) { - int blocksPerGrid = (int)(ceil(static_cast(nthreads) / GridDim::maxThreadsPerBlock)); - RoIAlignForward<<>>( + cudaStream_t stream, + const int64_t nthreads, + const T* bottom_data, + const T spatial_scale, + const int64_t channels, + const int64_t height, + const int64_t width, + const int64_t pooled_height, + const int64_t pooled_width, + const int64_t sampling_ratio, + const T* bottom_rois, + int64_t roi_cols, + T* top_data, + const bool is_mode_avg, + const bool half_pixel, + const int64_t* batch_indices_ptr) { + int 
blocksPerGrid = (int)(ceil(static_cast(nthreads) / GridDim::maxThreadsPerBlock)); + RoIAlignForward<<>>( nthreads, bottom_data, spatial_scale, @@ -206,30 +206,30 @@ void RoiAlignImpl( top_data, is_mode_avg, half_pixel, - batch_indices_ptr); + batch_indices_ptr); } -#define SPECIALIZED_IMPL(T) \ - template void RoiAlignImpl( \ - cudaStream_t stream, \ - const int64_t nthreads, \ - const T* bottom_data, \ - const T spatial_scale, \ - const int64_t channels, \ - const int64_t height, \ - const int64_t width, \ - const int64_t pooled_height, \ - const int64_t pooled_width, \ - const int64_t sampling_ratio, \ - const T* bottom_rois, \ - int64_t roi_cols, \ - T* top_data, \ - const bool is_mode_avg, \ - const bool half_pixel, \ - const int64_t* batch_indices_ptr); +#define SPECIALIZED_IMPL(T) \ + template void RoiAlignImpl( \ + cudaStream_t stream, \ + const int64_t nthreads, \ + const T* bottom_data, \ + const T spatial_scale, \ + const int64_t channels, \ + const int64_t height, \ + const int64_t width, \ + const int64_t pooled_height, \ + const int64_t pooled_width, \ + const int64_t sampling_ratio, \ + const T* bottom_rois, \ + int64_t roi_cols, \ + T* top_data, \ + const bool is_mode_avg, \ + const bool half_pixel, \ + const int64_t* batch_indices_ptr); SPECIALIZED_IMPL(float) SPECIALIZED_IMPL(double) - -} // namespace cuda -} // namespace onnxruntime + +} // namespace cuda +} // namespace onnxruntime diff --git a/onnxruntime/core/providers/cuda/reduction/reduction_ops.cc b/onnxruntime/core/providers/cuda/reduction/reduction_ops.cc index bc78e577c5052..c921339ee6f33 100644 --- a/onnxruntime/core/providers/cuda/reduction/reduction_ops.cc +++ b/onnxruntime/core/providers/cuda/reduction/reduction_ops.cc @@ -115,7 +115,7 @@ Status ReduceKernel::ReduceKernelShared( CUDNN_RETURN_IF_ERROR(cudnnGetReductionIndicesSize(cudnn_handle, reduce_desc, input_tensor, output_tensor, &indices_bytes)); auto indices_cuda = GetScratchBuffer(indices_bytes, stream); - // need to allocate a separate buffer for ArgMin/ArgMax comparsion output + // need to allocate a separate buffer for ArgMin/ArgMax comparison output auto output_count = output_shape.Size(); if (ReduceTensorIndices == CUDNN_REDUCE_TENSOR_NO_INDICES) { diff --git a/onnxruntime/core/providers/cuda/tensor/resize_impl.cu b/onnxruntime/core/providers/cuda/tensor/resize_impl.cu index e788f24052985..a96d4c82a7fdc 100644 --- a/onnxruntime/core/providers/cuda/tensor/resize_impl.cu +++ b/onnxruntime/core/providers/cuda/tensor/resize_impl.cu @@ -234,15 +234,15 @@ __global__ void _ResizeNearestKernel( int output_index = static_cast(id); int input_index = 0; - int extrapolation_occured = 0; + int extrapolation_occurred = 0; for (int axis = 0; axis < rank; ++axis) { int dim = 0; output_div_pitches[axis].divmod(output_index, dim, output_index); const NearestMappingInfo& mi = dims_mapping[prefix_dim_sum[axis] + dim]; - extrapolation_occured += mi.extrapolate_; + extrapolation_occurred += mi.extrapolate_; input_index += input_strides[axis] * mi.origin_; } - output_data[id] = extrapolation_occured ? extrapolation_value : input_data[input_index]; + output_data[id] = extrapolation_occurred ? 
extrapolation_value : input_data[input_index]; } struct LinearMappingInfo { diff --git a/onnxruntime/core/providers/cuda/tensor/transpose_impl.cu b/onnxruntime/core/providers/cuda/tensor/transpose_impl.cu index 6344845359b32..602514d1c8227 100644 --- a/onnxruntime/core/providers/cuda/tensor/transpose_impl.cu +++ b/onnxruntime/core/providers/cuda/tensor/transpose_impl.cu @@ -145,7 +145,7 @@ bool CanDoTranspose4DParallelizeMultipleElementsPerThreadInInnermostDim(const cu (input_dims[3] % num_elements_per_thread) == 0 && input_dims[1] <= prop.maxGridSize[1] && input_dims[0] <= prop.maxGridSize[2]) { - // There are 2 constrains when luanching the kernels + // There are 2 constraints when launching the kernels // 1. block_size_x * block_size_y <= prop.maxThreadsPerBlock // 2. block_size_y * num_block_ext >= input_dims[2] int64_t block_size_x = input_dims[3] / num_elements_per_thread; @@ -261,7 +261,7 @@ bool CanDoTranspose4DParallelizeOneElementPerThread(const cudaDeviceProp& prop, if (input_dims[3] <= prop.maxThreadsPerBlock && input_dims[1] <= prop.maxGridSize[1] && input_dims[0] <= prop.maxGridSize[2]) { - // There are 2 constrains when luanching the kernels + // There are 2 constraints when launching the kernels // 1. block_size_x * block_size_y <= prop.maxThreadsPerBlock // 2. block_size_y * num_block_ext >= input_dims[2] int64_t block_size_x = input_dims[3]; diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperatorFusedMatMul.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperatorFusedMatMul.cpp index 0bc543c56f7d1..a0c9289a87156 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperatorFusedMatMul.cpp +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperatorFusedMatMul.cpp @@ -44,7 +44,7 @@ class DmlOperatorFusedMatMul : public DmlOperator // At this point, we have manipulated input/output shapes and strides and // we do not care about actual input shapes present in the model (.onnx file). - // Create the TensorDesc with the manipulated input shapes becuase we don't want incorrect + // Create the TensorDesc with the manipulated input shapes because we don't want incorrect // broadcasting to be happen inside TensorDesc constructor. std::vector> inputIndices = { 0, 1, std::nullopt }; gsl::span inputShapes[2] = {sizesA, sizesB}; diff --git a/onnxruntime/core/providers/dnnl/dnnl_execution_provider.cc b/onnxruntime/core/providers/dnnl/dnnl_execution_provider.cc index 3271dab13f675..ffda84921a3ee 100644 --- a/onnxruntime/core/providers/dnnl/dnnl_execution_provider.cc +++ b/onnxruntime/core/providers/dnnl/dnnl_execution_provider.cc @@ -344,7 +344,7 @@ Status DnnlExecutionProvider::Compile(const std::vector& fuse auto input_tensor = ctx.GetInput(i); auto tensor_info = input_tensor.GetTensorTypeAndShapeInfo(); auto shape = tensor_info.GetShape(); - // dnnl expectes non-const data + // dnnl expects non-const data void* inputBuffer = const_cast(input_tensor.GetTensorRawData()); inputs.emplace( input_name, diff --git a/onnxruntime/core/providers/dnnl/dnnl_node_capability.cc b/onnxruntime/core/providers/dnnl/dnnl_node_capability.cc index 5db52f29a93cf..01f44e91fd49c 100644 --- a/onnxruntime/core/providers/dnnl/dnnl_node_capability.cc +++ b/onnxruntime/core/providers/dnnl/dnnl_node_capability.cc @@ -431,7 +431,7 @@ bool DnnlMatMulIntegerNodeCapability::IsDimensionSupported(const Node* node, con } } - // if shape nullptr, not enough information to reject it.
attempt to run it (no gaurantee) + // if shape nullptr, not enough information to reject it. attempt to run it (no guarantee) if (node_inputs[0]->Shape() == nullptr || node_inputs[1]->Shape() == nullptr) { return true; } @@ -465,7 +465,7 @@ bool DnnlSumNodeCapability::Supported(const Node* node, const GraphViewer& graph } // OneDNN version of Sum does not support Numpy style broadcasting. -// If the dimentions of all inputs do not match return false +// If the dimensions of all inputs do not match return false bool DnnlSumNodeCapability::IsDimensionSupported(const Node* node) const { auto node_inputs = node->InputDefs(); // find first non-null shape @@ -615,7 +615,7 @@ bool DnnlReshapeNodeCapability::Supported(const Node* node, const GraphViewer& g } bool DnnlReshapeNodeCapability::IsDimensionSupported(const Node* node) const { auto node_inputs = node->InputDefs(); - // We can not reshape a one dimentional tensor to a scalar output + // We can not reshape a one dimensional tensor to a scalar output if (node_inputs[1]->Shape() != nullptr && node_inputs[1]->Shape()->dim_size() == 1 && node_inputs[1]->Shape()->dim(0).dim_value() == 0) { diff --git a/onnxruntime/core/providers/dnnl/subgraph/dnnl_conv.h b/onnxruntime/core/providers/dnnl/subgraph/dnnl_conv.h index 831b10c3e147f..1af9e503e7816 100644 --- a/onnxruntime/core/providers/dnnl/subgraph/dnnl_conv.h +++ b/onnxruntime/core/providers/dnnl/subgraph/dnnl_conv.h @@ -32,9 +32,9 @@ class DnnlConv { private: /* - * Return the infered padding. + * Return the inferred padding. * - * The padding will be based on the specified padding or will infered based on the + * The padding will be based on the specified padding or will be inferred based on the * Onnx 'auto_pad' attributes. * * This will return the padding in the format specified in the Onnx specification.
@@ -47,9 +47,9 @@ class DnnlConv { const dnnl::memory::dims& dilations, const std::vector& kernel_shape, const dnnl::memory::dims& strides); - /* Get the padding left values from the infered pads */ + /* Get the padding left values from the inferred pads */ dnnl::memory::dims GetPaddingLeft(const std::vector& onnx_padding, ConvShape shape); - /* Get the padding right values from the infered pads */ + /* Get the padding right values from the inferred pads */ dnnl::memory::dims GetPaddingRight(const std::vector& onnx_padding, ConvShape shape); /* diff --git a/onnxruntime/core/providers/dnnl/subgraph/dnnl_convgrad.cc b/onnxruntime/core/providers/dnnl/subgraph/dnnl_convgrad.cc index 21218e24c17d6..e05693f3e5f2e 100644 --- a/onnxruntime/core/providers/dnnl/subgraph/dnnl_convgrad.cc +++ b/onnxruntime/core/providers/dnnl/subgraph/dnnl_convgrad.cc @@ -40,7 +40,7 @@ ConvGrad: (According to OnnxRuntime discovered using code inspection and Onnx do Attributes (auto_pad, dilations, group, kernel_shap, pads, and strides) should be the same as the forward pass Conv operator -To acheive Everything specified in the OnnxRuntime ConvGrad we must use both: +To achieve everything specified in the OnnxRuntime ConvGrad we must use both: 1) dnnl::convolution_backward_data - used to calculate (dX) diff_src 2) dnnl::convolution_backward_weights - used to calculate (dW) diff_weights and (dB) diff_bias */ diff --git a/onnxruntime/core/providers/dnnl/subgraph/dnnl_convgrad.h b/onnxruntime/core/providers/dnnl/subgraph/dnnl_convgrad.h index 3a27788745ef0..c45c85859c25e 100644 --- a/onnxruntime/core/providers/dnnl/subgraph/dnnl_convgrad.h +++ b/onnxruntime/core/providers/dnnl/subgraph/dnnl_convgrad.h @@ -39,9 +39,9 @@ class DnnlConvGrad { std::vector GetKernelShape(DnnlNode& node); /* Get the 'pads' attribute */ dnnl::memory::dims GetPads(DnnlNode& node, ConvShape shape); - /* Get the padding left values from the infered pads */ + /* Get the padding left values from the inferred pads */ dnnl::memory::dims GetPaddingLeft(const std::vector& onnx_padding, ConvShape shape); - /* Get the padding right values from the infered pads */ + /* Get the padding right values from the inferred pads */ dnnl::memory::dims GetPaddingRight(const std::vector& onnx_padding, ConvShape shape); /* * Get the 'dilations' attribute.
diff --git a/onnxruntime/core/providers/dnnl/subgraph/dnnl_dequantizelinear.cc b/onnxruntime/core/providers/dnnl/subgraph/dnnl_dequantizelinear.cc index 074df058806e5..ac668aad1bb4a 100644 --- a/onnxruntime/core/providers/dnnl/subgraph/dnnl_dequantizelinear.cc +++ b/onnxruntime/core/providers/dnnl/subgraph/dnnl_dequantizelinear.cc @@ -68,7 +68,7 @@ void DnnlDequantizeLinear::CreatePrimitive(DnnlSubgraphPrimitive& sp, DnnlNode& auto dst_md = dnnl::memory::desc(x_md.get_dims(), node.Output(OUT_Y).Type(), dnnl::memory::format_tag::any); dnnl::memory dst_mem; - // If zero point exists and we are NOT dequantizing int32, then substract zp from x and scale + // If zero point exists and we are NOT dequantizing int32, then subtract zp from x and scale if (isZeroPointUseful && (x_mem.get_desc().get_data_type() != dnnl::memory::data_type::s32)) { // Get Zero point auto x_zp_mem = sp.GetMemory(node.Input(IN_X_ZERO_POINT)); diff --git a/onnxruntime/core/providers/dnnl/subgraph/dnnl_matmul.cc b/onnxruntime/core/providers/dnnl/subgraph/dnnl_matmul.cc index 54528011850be..82a9e9f3ec898 100644 --- a/onnxruntime/core/providers/dnnl/subgraph/dnnl_matmul.cc +++ b/onnxruntime/core/providers/dnnl/subgraph/dnnl_matmul.cc @@ -126,7 +126,7 @@ void DnnlMatMul::CreatePrimitive(DnnlSubgraphPrimitive& sp, DnnlNode& node) { } // The reorder from above will get the memory in the right order. The next few lines will create a memory and memory descriptor - // that will have the correct dimentions and correct memory::format + // that will have the correct dimensions and correct memory::format transposedA_md = dnnl::memory::desc(transposedA_dims, node.Input(IN_A).Type(), sp.GetDnnlFormat(transposedA_dims.size())); transposedA_mem = dnnl::memory(transposedA_md, eng, nullptr); void* handle = intermediateA_mem.get_data_handle(); @@ -146,7 +146,7 @@ void DnnlMatMul::CreatePrimitive(DnnlSubgraphPrimitive& sp, DnnlNode& node) { } // The reorder from above will get the memory in the right order. The next few lines will create a memory and memory descriptor - // that will have the correct dimentions and correct memory::format + // that will have the correct dimensions and correct memory::format transposedB_md = dnnl::memory::desc(transposedB_dims, node.Input(IN_B).Type(), sp.GetDnnlFormat(transposedB_dims.size())); transposedB_mem = dnnl::memory(transposedB_md, eng, nullptr); void* handle = intermediateB_mem.get_data_handle(); @@ -193,8 +193,8 @@ void DnnlMatMul::CreatePrimitive(DnnlSubgraphPrimitive& sp, DnnlNode& node) { create a post op binary with possible unsqueezing in order to make sure onednn properly broadcast current limitation 1. is no unsqueeze for matmul output as it is not exposed due to post op fusion - 2. the third input has to be reordered to plain format (eg, no memory format propogation if the third input is internal to subgraph) - 3. adding 1s to front (unsqueeze/expand) in logical dims would possibly fail if physcial layout is not plain format + 2. the third input has to be reordered to plain format (eg, no memory format propagation if the third input is internal to subgraph) + 3. 
adding 1s to front (unsqueeze/expand) in logical dims would possibly fail if physical layout is not plain format */ dnnl::primitive_attr attr; if (has_postop_fusion) { diff --git a/onnxruntime/core/providers/dnnl/subgraph/dnnl_reduce.cc b/onnxruntime/core/providers/dnnl/subgraph/dnnl_reduce.cc index f49fdd7e9bde1..b19411e61767c 100644 --- a/onnxruntime/core/providers/dnnl/subgraph/dnnl_reduce.cc +++ b/onnxruntime/core/providers/dnnl/subgraph/dnnl_reduce.cc @@ -135,16 +135,16 @@ void DnnlReduce::CreatePrimitive(DnnlSubgraphPrimitive& sp, DnnlNode& node) { * shape reduction. For this reason we have code paths that are taken if the source dimensions and * destination dimensions are equal that will not call the reduction op. * - * "ReduceLogSum" is equivelent to Log(ReduceSum(input)) + * "ReduceLogSum" is equivalent to Log(ReduceSum(input)) * - if the reduction op is called then the eltwise_log post op will added to the reduction primitive. * - if the reduction op is not called then the eltwise_log primitive is added as its own primitive * - NOTE "ReduceLogSum" follows the code flow of "All other reduce ops" with the exception of the added * post op and an extra check if src_dims == dest_dims. - * "ReduceLogSumExp" is equivelent to Log(ReduceSum(Exp(input))) + * "ReduceLogSumExp" is equivalent to Log(ReduceSum(Exp(input))) * - if the reduction op is called then the eltwise_exp primitive is added before the reduction op * the eletwise_log post op will be added to the reduction primitive * - if the reduction op is not called then the input is not modified since Log(Exp(input) == input - * "ReduceSumSquare" is equivelent to ReduceSum(Square(input)) + * "ReduceSumSquare" is equivalent to ReduceSum(Square(input)) * - the eltwise_square primitive is added before the reduction op * - if the source and destination dimensions are not equal the reduction op is called * All other reduce ops @@ -298,7 +298,7 @@ void DnnlReduce::CreatePrimitive(DnnlSubgraphPrimitive& sp, DnnlNode& node) { dnnl::memory squeeze_mem = dnnl::memory(squeeze_md, dnnl_engine, nullptr); // if the src and dst dims are equal then we will have a valid data handle here. // Otherwise we must get the data handle at runtime using the AddReshape function. - // reading the data handle directy is more efficent if is it possible. + // reading the data handle directly is more efficient if it is possible. if (!src_and_dst_dims_equal) { squeeze_mem.set_data_handle(reduce_dst_mem.get_data_handle()); } else { diff --git a/onnxruntime/core/providers/dnnl/subgraph/dnnl_subgraph_primitive.h b/onnxruntime/core/providers/dnnl/subgraph/dnnl_subgraph_primitive.h index f97268465e46e..a7e49b54d4507 100644 --- a/onnxruntime/core/providers/dnnl/subgraph/dnnl_subgraph_primitive.h +++ b/onnxruntime/core/providers/dnnl/subgraph/dnnl_subgraph_primitive.h @@ -65,7 +65,7 @@ class DnnlSubgraphPrimitive { dnnl::memory::desc GetOutputInfo(std::string name); bool IsScalarOutput(const std::string& name); bool IsDynamic(); - // All Scalar inputs are automatically converterted to a one dimentional tensor when used in OneDNN + // All Scalar inputs are automatically converted to a one dimensional tensor when used in OneDNN // If the input being a scalar affects the operator this function can be used to determine if the // original input from ORT was a scalar.
bool IsScalar(const DnnlTensor& tensor); diff --git a/onnxruntime/core/providers/dnnl/subgraph/dnnl_transpose.cc b/onnxruntime/core/providers/dnnl/subgraph/dnnl_transpose.cc index 3a7f45c72f27f..b74dbf97a2547 100644 --- a/onnxruntime/core/providers/dnnl/subgraph/dnnl_transpose.cc +++ b/onnxruntime/core/providers/dnnl/subgraph/dnnl_transpose.cc @@ -56,7 +56,8 @@ void DnnlTranspose::CreatePrimitive(DnnlSubgraphPrimitive& sp, DnnlNode& node) { strides_inverse.push_back(strides[ndata_dims - i - 1]); } - // Memory descriptor describes the memory reorder but will not have the correct output dimentions or the correct dnnl::memory::format + // Memory descriptor describes the memory reorder but will not have the correct output dimensions + // or the correct dnnl::memory::format dnnl::memory::desc intermediate_md = dnnl::memory::desc(data_dims, node.Input(IN_DATA).Type(), strides); dnnl::memory intermediate_mem = dnnl::memory(intermediate_md, dnnl_engine); @@ -65,7 +66,7 @@ void DnnlTranspose::CreatePrimitive(DnnlSubgraphPrimitive& sp, DnnlNode& node) { {DNNL_ARG_TO, intermediate_mem}}); // The reorder from above will get the memory in the right order. The next few lines will create a memory and memory descriptor - // that will have the correct dimentions and correct memory::format + // that will have the correct dimensions and correct memory::format dnnl::memory::desc transposed_md = dnnl::memory::desc(transposed_dims, node.Input(IN_DATA).Type(), sp.GetDnnlFormat(data_dims.size())); dnnl::memory transposed_mem = dnnl::memory(transposed_md, dnnl_engine, nullptr); void* handle = intermediate_mem.get_data_handle(); diff --git a/onnxruntime/core/providers/migraphx/migraphx_allocator.cc b/onnxruntime/core/providers/migraphx/migraphx_allocator.cc index 0693eea056416..c9db31e8744a7 100644 --- a/onnxruntime/core/providers/migraphx/migraphx_allocator.cc +++ b/onnxruntime/core/providers/migraphx/migraphx_allocator.cc @@ -42,7 +42,7 @@ void* MIGraphXExternalAllocator::Alloc(size_t size) { if (size > 0) { p = alloc_(size); - // review(codemzs): ORT_ENFORCE does not seem appropiate. + // review(codemzs): ORT_ENFORCE does not seem appropriate. ORT_ENFORCE(p != nullptr); } diff --git a/onnxruntime/core/providers/migraphx/migraphx_stream_handle.cc b/onnxruntime/core/providers/migraphx/migraphx_stream_handle.cc index 9c5bb4ecf5c97..e8e349af75aba 100644 --- a/onnxruntime/core/providers/migraphx/migraphx_stream_handle.cc +++ b/onnxruntime/core/providers/migraphx/migraphx_stream_handle.cc @@ -123,7 +123,7 @@ Status MIGraphXStream::CleanUpOnRunEnd() { } void* MIGraphXStream::GetResource(int version, int id) const { - ORT_ENFORCE(version <= ORT_ROCM_RESOUCE_VERSION, "resource version unsupported!"); + ORT_ENFORCE(version <= ORT_ROCM_RESOURCE_VERSION, "resource version unsupported!"); void* resource{}; switch (id) { case RocmResource::hip_stream_t: diff --git a/onnxruntime/core/providers/nnapi/nnapi_builtin/nnapi_lib/nnapi_implementation.cc b/onnxruntime/core/providers/nnapi/nnapi_builtin/nnapi_lib/nnapi_implementation.cc index cdf1075beb827..91d85efd09c65 100644 --- a/onnxruntime/core/providers/nnapi/nnapi_builtin/nnapi_lib/nnapi_implementation.cc +++ b/onnxruntime/core/providers/nnapi/nnapi_builtin/nnapi_lib/nnapi_implementation.cc @@ -228,7 +228,7 @@ const NnApi LoadNnApi() { nnapi.ASharedMemory_create = getASharedMemory_create(); #else // Mock ASharedMemory_create only if libneuralnetworks.so was successfully - // loaded. This ensures identical behaviour on platforms which use this + // loaded. 
This ensures identical behavior on platforms which use this // implementation, but don't have libneuralnetworks.so library, and // platforms which use nnapi_implementation_disabled.cc stub. if (libneuralnetworks != nullptr) { diff --git a/onnxruntime/core/providers/rknpu/rknpu_execution_provider.cc b/onnxruntime/core/providers/rknpu/rknpu_execution_provider.cc index 64d8f235840bc..44b34f4b4ce6c 100644 --- a/onnxruntime/core/providers/rknpu/rknpu_execution_provider.cc +++ b/onnxruntime/core/providers/rknpu/rknpu_execution_provider.cc @@ -28,7 +28,7 @@ constexpr const char* RKNPU = "Rknpu"; struct RknpuFuncState { std::string uniq_input_shape; - std::unique_ptr exector; + std::unique_ptr exector; ONNX_NAMESPACE::ModelProto model_proto; std::unordered_map input_map; std::unordered_map output_map; @@ -282,7 +282,7 @@ common::Status RknpuExecutionProvider::Compile(const std::vector p = std::make_unique(); rk::nn::Graph* graph = new rk::nn::Graph(); - *p = {"", std::unique_ptr(new rk::nn::Exection(graph)), + *p = {"", std::unique_ptr(new rk::nn::Execution(graph)), model_proto_[context->node_name], input_info_[context->node_name], output_info_[context->node_name], std::vector{}, std::vector{}}; diff --git a/onnxruntime/core/providers/rocm/nn/conv.cc b/onnxruntime/core/providers/rocm/nn/conv.cc index a2b587a56466f..d7f47d07a8fec 100644 --- a/onnxruntime/core/providers/rocm/nn/conv.cc +++ b/onnxruntime/core/providers/rocm/nn/conv.cc @@ -12,7 +12,7 @@ namespace onnxruntime { namespace rocm { // Op Set 11 for Conv only update document to clearify default dilations and strides value. -// which are already convered by op set 11 cpu versoin, so simply add declaration. +// which are already covered by op set 11 cpu version, so simply add declaration. #define REGISTER_KERNEL_TYPED(T) \ ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_EX( \ Conv, \ diff --git a/onnxruntime/core/providers/rocm/reduction/reduction_ops.cc b/onnxruntime/core/providers/rocm/reduction/reduction_ops.cc index 820745b22f614..11073ab3584eb 100644 --- a/onnxruntime/core/providers/rocm/reduction/reduction_ops.cc +++ b/onnxruntime/core/providers/rocm/reduction/reduction_ops.cc @@ -226,7 +226,7 @@ Status ReduceKernel::ReduceKernelShared( MIOPEN_RETURN_IF_ERROR(miopenGetReductionIndicesSize(miopen_handle, reduce_desc, input_tensor, output_tensor, &indices_bytes)); auto indices_rocm = GetScratchBuffer(indices_bytes, stream); - // need to allocate a separate buffer for ArgMin/ArgMax comparsion output + // need to allocate a separate buffer for ArgMin/ArgMax comparison output auto output_count = output_shape.Size(); if (ReduceTensorIndices == MIOPEN_REDUCE_TENSOR_NO_INDICES) { diff --git a/onnxruntime/core/providers/rocm/rocm_allocator.cc b/onnxruntime/core/providers/rocm/rocm_allocator.cc index 8645b791d4b0f..4a11b158c2cce 100644 --- a/onnxruntime/core/providers/rocm/rocm_allocator.cc +++ b/onnxruntime/core/providers/rocm/rocm_allocator.cc @@ -60,7 +60,7 @@ void* ROCMExternalAllocator::Alloc(size_t size) { if (size > 0) { p = alloc_(size); - // review(codemzs): ORT_ENFORCE does not seem appropiate. + // review(codemzs): ORT_ENFORCE does not seem appropriate.
ORT_ENFORCE(p != nullptr); } diff --git a/onnxruntime/core/providers/rocm/rocm_stream_handle.cc b/onnxruntime/core/providers/rocm/rocm_stream_handle.cc index 0c0f64a8bfaf0..ef5689fc9a2d0 100644 --- a/onnxruntime/core/providers/rocm/rocm_stream_handle.cc +++ b/onnxruntime/core/providers/rocm/rocm_stream_handle.cc @@ -140,7 +140,7 @@ Status RocmStream::CleanUpOnRunEnd() { } void* RocmStream::GetResource(int version, int id) const { - ORT_ENFORCE(version <= ORT_ROCM_RESOUCE_VERSION, "resource version unsupported!"); + ORT_ENFORCE(version <= ORT_ROCM_RESOURCE_VERSION, "resource version unsupported!"); void* resource{}; switch (id) { case RocmResource::hip_stream_t: diff --git a/onnxruntime/core/providers/webnn/webnn_execution_provider.cc b/onnxruntime/core/providers/webnn/webnn_execution_provider.cc index e839d6d17b7d9..0da0dfc6dfb26 100644 --- a/onnxruntime/core/providers/webnn/webnn_execution_provider.cc +++ b/onnxruntime/core/providers/webnn/webnn_execution_provider.cc @@ -329,7 +329,7 @@ common::Status WebNNExecutionProvider::Compile(const std::vector InferenceSession::GetOverridableI } } - // returns a list of initializers that can be overriden. + // returns a list of initializers that can be overridden. return std::make_pair(common::Status::OK(), &model_->MainGraph().GetOverridableInitializers()); } diff --git a/onnxruntime/core/session/inference_session.h b/onnxruntime/core/session/inference_session.h index e1cd085d2c271..9662095bf0ed3 100644 --- a/onnxruntime/core/session/inference_session.h +++ b/onnxruntime/core/session/inference_session.h @@ -386,7 +386,7 @@ class InferenceSession { * @param run_options run options. * @param mutable_feeds inputs owned by client code and will be released as long as the feeds be set in session states. * Then the feeds will purely managed in the session states. - * @param fetches outputs produced after the executin of this function. + * @param fetches outputs produced after the execution of this function. * @param state State of the graph needed to resume partial graph run. * @param feeds_fetches_manager Contains feed/fetches name to internal indices mapping and information for device * copy/checks. 
diff --git a/onnxruntime/core/util/qmath.h b/onnxruntime/core/util/qmath.h index c982a7aa2e7e0..1b2180da95058 100644 --- a/onnxruntime/core/util/qmath.h +++ b/onnxruntime/core/util/qmath.h @@ -552,7 +552,7 @@ struct BlockedQuantizeLinear { std::ptrdiff_t N, const std::ptrdiff_t quant_block_size, const std::ptrdiff_t thread_block_size, bool saturate) { ORT_UNUSED_PARAMETER(saturate); - // to avoid a byte being writen from mutiple threads, use 2 * N as thread block + // to avoid a byte being written from multiple threads, use 2 * N as thread block ORT_UNUSED_PARAMETER(thread_block_size); constexpr auto low = static_cast(TOut::min_val); constexpr auto high = static_cast(TOut::max_val); @@ -637,7 +637,7 @@ struct BlockedQuantizeLinear { ORT_UNUSED_PARAMETER(saturate); constexpr auto low = static_cast(TOut::min_val); constexpr auto high = static_cast(TOut::max_val); - // to avoid a byte being writen from mutiple threads, use 2 * K as thread block + // to avoid a byte being written from multiple threads, use 2 * K as thread block auto size_thread_block = 2 * K; auto quant_block_num_K = (K + quant_block_size - 1) / quant_block_size; auto num_thread_block = (M + 1) / 2; @@ -697,7 +697,7 @@ struct BlockedQuantizeLinear { std::ptrdiff_t N, const std::ptrdiff_t quant_block_size, const std::ptrdiff_t thread_block_size, bool saturate) { ORT_UNUSED_PARAMETER(saturate); - // to avoid a byte being writen from mutiple threads, use 2 * N as thread block + // to avoid a byte being written from multiple threads, use 2 * N as thread block ORT_UNUSED_PARAMETER(thread_block_size); constexpr auto low = static_cast(TOut::min_val); constexpr auto high = static_cast(TOut::max_val); @@ -786,7 +786,7 @@ struct BlockedQuantizeLinear { ORT_UNUSED_PARAMETER(saturate); constexpr auto low = static_cast(TOut::min_val); constexpr auto high = static_cast(TOut::max_val); - // to avoid a byte being writen from mutiple threads, use 2 * K as thread block + // to avoid a byte being written from multiple threads, use 2 * K as thread block auto size_thread_block = 2 * K; auto quant_block_num_K = (K + quant_block_size - 1) / quant_block_size; auto num_thread_block = (M + 1) / 2; diff --git a/onnxruntime/python/onnxruntime_pybind_schema.cc b/onnxruntime/python/onnxruntime_pybind_schema.cc index 218b59688b01c..c5757095e2e1e 100644 --- a/onnxruntime/python/onnxruntime_pybind_schema.cc +++ b/onnxruntime/python/onnxruntime_pybind_schema.cc @@ -15,7 +15,7 @@ void addGlobalSchemaFunctions(pybind11::module& m) { "get_all_operator_schema", []() -> const std::vector { return ONNX_NAMESPACE::OpSchemaRegistry::get_all_schemas_with_history(); }, - "Return a vector of OpSchema all registed operators"); + "Return a vector of OpSchema all registered operators"); m.def( "get_all_opkernel_def", []() -> const std::vector { std::vector result; diff --git a/onnxruntime/python/onnxruntime_pybind_sparse_tensor.cc b/onnxruntime/python/onnxruntime_pybind_sparse_tensor.cc index db0b2e392b29f..7dcead113ac4f 100644 --- a/onnxruntime/python/onnxruntime_pybind_sparse_tensor.cc +++ b/onnxruntime/python/onnxruntime_pybind_sparse_tensor.cc @@ -41,7 +41,7 @@ struct MakeDType { /// /// The function creates a numpy array that points to -/// data stored within the corresponing tensor. Parent object +/// data stored within the corresponding tensor. Parent object /// holds a reference to the object that owns the data so it /// does not disappear.
/// @@ -396,7 +396,7 @@ void addSparseTensorMethods(pybind11::module& m) { }) // pybind apparently has a bug with returning enums from def_property_readonly or methods // returning a method object instead of the enumeration value - // so we are using def_property and throw on a potential modificaiton + // so we are using def_property and throw on a potential modification .def_property( "format", [](const PySparseTensor* py_tensor) -> OrtSparseFormat { const SparseTensor& tensor = py_tensor->Instance(); diff --git a/onnxruntime/python/onnxruntime_pybind_state.cc b/onnxruntime/python/onnxruntime_pybind_state.cc index e13285c60e69f..d7155b2b6899a 100644 --- a/onnxruntime/python/onnxruntime_pybind_state.cc +++ b/onnxruntime/python/onnxruntime_pybind_state.cc @@ -152,7 +152,7 @@ void AsyncCallback(void* user_data, OrtValue** outputs, size_t num_outputs, OrtS } else { // acquire GIL to safely: // 1) invoke python callback - // 2) create, manipulate, and destory python objects + // 2) create, manipulate, and destroy python objects py::gil_scoped_acquire acquire; invoke_callback(); } @@ -946,7 +946,7 @@ std::unique_ptr CreateExecutionProviderInstance( provider_options_map); // This variable is never initialized because the APIs by which it should be initialized are deprecated, - // however they still exist are are in-use. Neverthless, it is used to return CUDAAllocator, + // however they still exist and are in-use. Nevertheless, it is used to return CUDAAllocator, // hence we must try to initialize it here if we can since FromProviderOptions might contain // external CUDA allocator. external_allocator_info = info.external_allocator_info; @@ -973,14 +973,17 @@ std::unique_ptr CreateExecutionProviderInstance( const ROCMExecutionProviderInfo info = GetRocmExecutionProviderInfo(rocm_provider_info, provider_options_map); - // This variable is never initialized because the APIs by which is it should be initialized are deprecated, however they still - // exist are are in-use. Neverthless, it is used to return ROCMAllocator, hence we must try to initialize it here if we can - // since FromProviderOptions might contain external ROCM allocator. + // This variable is never initialized because the APIs by which it should be initialized are deprecated, + // however they still exist and are in-use. Nevertheless, it is used to return ROCMAllocator, hence we must + // try to initialize it here if we can since FromProviderOptions might contain external ROCM allocator. external_allocator_info = info.external_allocator_info; return rocm_provider_info->CreateExecutionProviderFactory(info)->CreateProvider(); } else { if (!Env::Default().GetEnvironmentVar("ROCM_PATH").empty()) { - ORT_THROW("ROCM_PATH is set but ROCM wasn't able to be loaded. Please install the correct version of ROCM and MIOpen as mentioned in the GPU requirements page, make sure they're in the PATH, and that your GPU is supported."); + ORT_THROW( + "ROCM_PATH is set but ROCM wasn't able to be loaded. Please install the correct version " + "of ROCM and MIOpen as mentioned in the GPU requirements page, make sure they're in the PATH, " + "and that your GPU is supported."); } } #endif @@ -1389,7 +1392,8 @@ void addGlobalMethods(py::module& m) { LogDeprecationWarning("set_openvino_device", "OpenVINO execution provider option \"device_type\""); openvino_device_type = device_type; }, - "Set the prefered OpenVINO device type to be used.
If left unset, the device type selected during build time will be used."); + "Set the preferred OpenVINO device type to be used. If left unset, " + "the device type selected during build time will be used."); // TODO remove deprecated global config m.def( "get_openvino_device", []() -> std::string { diff --git a/onnxruntime/python/tools/quantization/calibrate.py b/onnxruntime/python/tools/quantization/calibrate.py index 10492ae419817..65875d09102bd 100644 --- a/onnxruntime/python/tools/quantization/calibrate.py +++ b/onnxruntime/python/tools/quantization/calibrate.py @@ -812,7 +812,7 @@ def collect_absolute_value(self, name_to_arr): hist_edges = hist_edges.astype(data_arr_np.dtype) assert ( data_arr_np.dtype != np.float64 - ), "only float32 or float16 is supported, every constant must be explicetly typed" + ), "only float32 or float16 is supported, every constant must be explicitly typed" self.histogram_dict[tensor] = (hist, hist_edges, min_value, max_value) else: old_histogram = self.histogram_dict[tensor] @@ -834,7 +834,7 @@ def collect_absolute_value(self, name_to_arr): hist[: len(old_hist)] += old_hist assert ( data_arr_np.dtype != np.float64 - ), "only float32 or float16 is supported, every constant must be explicetly typed" + ), "only float32 or float16 is supported, every constant must be explicitly typed" self.histogram_dict[tensor] = (hist, hist_edges, min(old_min, min_value), max(old_max, max_value)) def collect_value(self, name_to_arr): diff --git a/onnxruntime/python/tools/quantization/operators/direct_q8.py b/onnxruntime/python/tools/quantization/operators/direct_q8.py index ae9679ae8ec7a..de610a4c01326 100644 --- a/onnxruntime/python/tools/quantization/operators/direct_q8.py +++ b/onnxruntime/python/tools/quantization/operators/direct_q8.py @@ -13,7 +13,7 @@ def quantize(self): node = self.node if not self.quantizer.force_quantize_no_input_check: - # Keep backward compatiblity + # Keep backward compatibility # Quantize when input[0] is quantized already. Otherwise keep it. quantized_input_value = self.quantizer.find_quantized_value(node.input[0]) if quantized_input_value is None: diff --git a/onnxruntime/python/tools/quantization/quant_utils.py b/onnxruntime/python/tools/quantization/quant_utils.py index e4a9b867b1482..0fdef4ef6f6d3 100644 --- a/onnxruntime/python/tools/quantization/quant_utils.py +++ b/onnxruntime/python/tools/quantization/quant_utils.py @@ -357,7 +357,7 @@ def quantize_data( - when data `type == int8`, from `[-m , m]` -> :math:`[-(2^{b-1}-1), 2^{b-1}-1]` where `m = max(abs(rmin), abs(rmax))` - and add necessary intermediate nodes to trasnform quantized weight to full weight using the equation + and add necessary intermediate nodes to transform quantized weight to full weight using the equation :math:`r = S(q-z)`, where diff --git a/onnxruntime/python/tools/transformers/README.md b/onnxruntime/python/tools/transformers/README.md index 547d1a883c165..4f147219f19f1 100644 --- a/onnxruntime/python/tools/transformers/README.md +++ b/onnxruntime/python/tools/transformers/README.md @@ -29,7 +29,7 @@ Models not in the list may only be partially optimized or not optimized at all. - **hidden_size**: (*default: 768*) BERT-base and BERT-large has 768 and 1024 hidden nodes respectively. - **input_int32**: (*optional*) - Exported model ususally uses int64 tensor as input. If this flag is specified, int32 tensors will be used as input, and it could avoid un-necessary Cast nodes and get better performance. + Exported model usually uses int64 tensor as input. 
If this flag is specified, int32 tensors will be used as input, and it could avoid un-necessary Cast nodes and get better performance. - **float16**: (*optional*) By default, model uses float32 in computation. If this flag is specified, half-precision float will be used. This option is recommended for NVidia GPU with Tensor Core like V100 and T4. For older GPUs, float32 is likely faster. - **use_gpu**: (*optional*) diff --git a/onnxruntime/python/tools/transformers/benchmark.py b/onnxruntime/python/tools/transformers/benchmark.py index 9baafbbfff0e3..5ec2ab4e50799 100644 --- a/onnxruntime/python/tools/transformers/benchmark.py +++ b/onnxruntime/python/tools/transformers/benchmark.py @@ -930,7 +930,7 @@ def main(): if len(results) == 0: if args.batch_sizes != [0]: - logger.warning("No any result avaiable.") + logger.warning("No result available.") return csv_filename = args.detail_csv or f"benchmark_detail_{time_stamp}.csv" diff --git a/onnxruntime/python/tools/transformers/large_model_exporter.py b/onnxruntime/python/tools/transformers/large_model_exporter.py index 2083419087a69..0eaccc0fafcc4 100644 --- a/onnxruntime/python/tools/transformers/large_model_exporter.py +++ b/onnxruntime/python/tools/transformers/large_model_exporter.py @@ -368,7 +368,7 @@ def parse_arguments(): required=False, type=str, default=None, - help=("cache directy of huggingface, by setting this to avoid useless downloading if you have one"), + help=("cache directory of huggingface, by setting this to avoid useless downloading if you have one"), ) parser.add_argument( "--with_past", diff --git a/onnxruntime/python/tools/transformers/models/gpt2/benchmark_gpt2.py b/onnxruntime/python/tools/transformers/models/gpt2/benchmark_gpt2.py index 6d6a057574a17..7e786fce30985 100644 --- a/onnxruntime/python/tools/transformers/models/gpt2/benchmark_gpt2.py +++ b/onnxruntime/python/tools/transformers/models/gpt2/benchmark_gpt2.py @@ -193,7 +193,7 @@ def main(args): config = AutoConfig.from_pretrained(args.model_name_or_path, torchscript=args.torchscript, cache_dir=cache_dir) model = model_class.from_pretrained(args.model_name_or_path, config=config, cache_dir=cache_dir) - # This scirpt does not support float16 for PyTorch. + # This script does not support float16 for PyTorch.
# if args.float16: # model.half() diff --git a/onnxruntime/python/tools/transformers/models/gpt2/convert_to_onnx.py b/onnxruntime/python/tools/transformers/models/gpt2/convert_to_onnx.py index 27e3899c11b7a..0ab26308295a9 100644 --- a/onnxruntime/python/tools/transformers/models/gpt2/convert_to_onnx.py +++ b/onnxruntime/python/tools/transformers/models/gpt2/convert_to_onnx.py @@ -105,7 +105,7 @@ def parse_arguments(argv=None): required=False, type=float, default=0, - help="the aboslute and relative tolerance for parity verification", + help="the absolute and relative tolerance for parity verification", ) parser.add_argument( diff --git a/onnxruntime/python/tools/transformers/models/gpt2/gpt2_tester.py b/onnxruntime/python/tools/transformers/models/gpt2/gpt2_tester.py index f4705bef6a988..6bfcb0368eaaa 100644 --- a/onnxruntime/python/tools/transformers/models/gpt2/gpt2_tester.py +++ b/onnxruntime/python/tools/transformers/models/gpt2/gpt2_tester.py @@ -137,7 +137,7 @@ def __init__( self.has_position_ids = position_ids is not None self.has_attention_mask = attention_mask is not None - # Emtpy past state for first inference + # Empty past state for first inference self.past = [] past_shape = [ 2, diff --git a/onnxruntime/python/tools/transformers/notebooks/PyTorch_Bert-Squad_OnnxRuntime_GPU.ipynb b/onnxruntime/python/tools/transformers/notebooks/PyTorch_Bert-Squad_OnnxRuntime_GPU.ipynb index 43c31e1ea45ac..7295ae1436c99 100644 --- a/onnxruntime/python/tools/transformers/notebooks/PyTorch_Bert-Squad_OnnxRuntime_GPU.ipynb +++ b/onnxruntime/python/tools/transformers/notebooks/PyTorch_Bert-Squad_OnnxRuntime_GPU.ipynb @@ -1665,7 +1665,7 @@ "### Packing Mode (Effective Transformer)\n", "\n", "When padding ratio is high, it is helpful to use packing mode, also known as [effective transformer](https://github.com/bytedance/effective_transformer).\n", - "This feature requires onnxruntime-gpu verison 1.16 or later. \n", + "This feature requires onnxruntime-gpu version 1.16 or later. \n", "\n", "In below example, average sequence length after removing paddings is 32, the sequence length with paddings is 128. We can see 3x throughput with packing mode (QPS increased from 1617 to 5652)." ] diff --git a/onnxruntime/python/tools/transformers/onnx_model_phi.py b/onnxruntime/python/tools/transformers/onnx_model_phi.py index 05a27ba487f4d..5df765033578b 100644 --- a/onnxruntime/python/tools/transformers/onnx_model_phi.py +++ b/onnxruntime/python/tools/transformers/onnx_model_phi.py @@ -65,7 +65,7 @@ def __call__(self, x): return x -# TODO: move to a seperate file +# TODO: move to a separate file class Fission(Fusion): def __init__( self, diff --git a/onnxruntime/python/tools/transformers/onnx_model_tnlr.py b/onnxruntime/python/tools/transformers/onnx_model_tnlr.py index 98235de6ba6fd..f5a47b19d67fc 100644 --- a/onnxruntime/python/tools/transformers/onnx_model_tnlr.py +++ b/onnxruntime/python/tools/transformers/onnx_model_tnlr.py @@ -17,7 +17,7 @@ class FusionTnlrAttention(FusionAttention): """ Fuse TNLR Attention subgraph into one Attention node. - TNLR Attention has extra addtion after qk nodes and adopts [S, B, NH] as I/O shape. + TNLR Attention has extra addition after qk nodes and adopts [S, B, NH] as I/O shape. 
""" def __init__( diff --git a/onnxruntime/python/tools/transformers/optimizer.py b/onnxruntime/python/tools/transformers/optimizer.py index 5f161674b614e..06264b426d1e5 100644 --- a/onnxruntime/python/tools/transformers/optimizer.py +++ b/onnxruntime/python/tools/transformers/optimizer.py @@ -531,7 +531,7 @@ def _parse_arguments(): "--disable_symbolic_shape_infer", required=False, action="store_true", - help="diable symoblic shape inference", + help="diable symbolic shape inference", ) parser.set_defaults(disable_symbolic_shape_infer=False) diff --git a/onnxruntime/python/tools/transformers/shape_optimizer.py b/onnxruntime/python/tools/transformers/shape_optimizer.py index 503930b23229f..17fd54f19baf2 100644 --- a/onnxruntime/python/tools/transformers/shape_optimizer.py +++ b/onnxruntime/python/tools/transformers/shape_optimizer.py @@ -3,7 +3,7 @@ # Licensed under the MIT License. # -------------------------------------------------------------------------- -# This tool is not used directly in bert optimization. It could assist developing the optimization script on the following senarios: +# This tool is not used directly in bert optimization. It could assist developing the optimization script on the following scenarios: # (1) It could simplify graph by removing many sub-graphs related to reshape. # (2) It could reduce extra inputs and outputs to fit other tools. The script compare_bert_results.py or bert_perf_test.py requires 3 inputs. diff --git a/onnxruntime/test/contrib_ops/attention_lstm_op_test.cc b/onnxruntime/test/contrib_ops/attention_lstm_op_test.cc index e78b3528c11a4..0634f545e6f7b 100644 --- a/onnxruntime/test/contrib_ops/attention_lstm_op_test.cc +++ b/onnxruntime/test/contrib_ops/attention_lstm_op_test.cc @@ -266,8 +266,8 @@ static const std::vector s_M_2batch{0.1f, -0.25f, 1.0f, 1.0f, -1.0f, -1.5 0.1f, -0.25f, 0.5f, -0.25f, -1.25f, 0.25f, -1.0f, 1.5f, -1.25f}; // real seq lens for memory -static std::vector s_mem_seq_lenghts{3}; -static const std::vector s_mem_seq_lenghts_2batch{3, 2}; +static std::vector s_mem_seq_lengths{3}; +static const std::vector s_mem_seq_lengths_2batch{3, 2}; // [batch_size=1, input_max_step=3, input_only_depth=3] static std::vector s_X_T_data{ @@ -352,7 +352,7 @@ TEST(AttnLSTMTest, ForwardLstmWithBahdanauAMZeroAttention) { RunAttnLstmTest( X_data, W_data, R_data, Y_data, Y_h_data, Y_c_data, - s_memory_layer_weight, s_query_layer_weight, s_attn_v, s_M_data, &s_mem_seq_lenghts, &zero_attn_layer_weight, + s_memory_layer_weight, s_query_layer_weight, s_attn_v, s_M_data, &s_mem_seq_lengths, &zero_attn_layer_weight, input_only_depth, batch_size, cell_hidden_size, input_max_step, memory_max_step, memory_depth, am_attn_size, aw_attn_size, &B_data, nullptr, nullptr, nullptr, &s_seq_lengths, @@ -389,7 +389,7 @@ TEST(AttnLSTMTest, ForwardLstmWithBahdanauAM) { RunAttnLstmTest( X_data, W_data, R_data, Y_data, Y_h_data, Y_c_data, - s_memory_layer_weight, s_query_layer_weight, s_attn_v, s_M_data, &s_mem_seq_lenghts, &s_attn_layer_weight, + s_memory_layer_weight, s_query_layer_weight, s_attn_v, s_M_data, &s_mem_seq_lengths, &s_attn_layer_weight, input_only_depth, batch_size, cell_hidden_size, input_max_step, memory_max_step, memory_depth, am_attn_size, aw_attn_size, &B_data, nullptr, nullptr, nullptr, &s_seq_lengths, @@ -428,7 +428,7 @@ TEST(AttnLSTMTest, ForwardLstmWithBahdanauAMShortenSeqLength) { RunAttnLstmTest( X_data, W_data, R_data, Y_data, Y_h_data, Y_c_data, - s_memory_layer_weight, s_query_layer_weight, s_attn_v, s_M_data, &s_mem_seq_lenghts, 
&s_attn_layer_weight, + s_memory_layer_weight, s_query_layer_weight, s_attn_v, s_M_data, &s_mem_seq_lengths, &s_attn_layer_weight, input_only_depth, batch_size, cell_hidden_size, input_max_step, memory_max_step, memory_depth, am_attn_size, aw_attn_size, &B_data, nullptr, nullptr, nullptr, &shortenSeqLen, @@ -467,7 +467,7 @@ TEST(AttnLSTMTest, ReverseLstmWithBahdanauAMShortenSeqLength) { RunAttnLstmTest( X_data, W_data, R_data, Y_data, Y_h_data, Y_c_data, - s_memory_layer_weight, s_query_layer_weight, s_attn_v, s_M_data, &s_mem_seq_lenghts, &s_attn_layer_weight, + s_memory_layer_weight, s_query_layer_weight, s_attn_v, s_M_data, &s_mem_seq_lengths, &s_attn_layer_weight, input_only_depth, batch_size, cell_hidden_size, input_max_step, memory_max_step, memory_depth, am_attn_size, aw_attn_size, &B_data, nullptr, nullptr, nullptr, &shortenSeqLen, @@ -521,7 +521,7 @@ TEST(AttnLSTMTest, BidirectionLstmWithBahdanauAMShortenSeqLength) { RunAttnLstmTest( X_data, d_W_data, d_R_data, Y_data, Y_h_data, Y_c_data, - d_memory_layer_weight, d_query_layer_weight, d_attn_v, s_M_data, &s_mem_seq_lenghts, &d_attn_layer_weight, + d_memory_layer_weight, d_query_layer_weight, d_attn_v, s_M_data, &s_mem_seq_lengths, &d_attn_layer_weight, input_only_depth, batch_size, cell_hidden_size, input_max_step, memory_max_step, memory_depth, am_attn_size, aw_attn_size, &d_B_data, nullptr, nullptr, nullptr, &shortenSeqLen, @@ -578,7 +578,7 @@ TEST(AttnLSTMTest, BidirectionLstmWithBahdanauAM2BatchShortenSeqLen) { RunAttnLstmTest( X_data, d_W_data, d_R_data, Y_data, Y_h_data, Y_c_data, - d_memory_layer_weight, d_query_layer_weight, d_attn_v, s_M_2batch, &s_mem_seq_lenghts_2batch, &d_attn_layer_weight, + d_memory_layer_weight, d_query_layer_weight, d_attn_v, s_M_2batch, &s_mem_seq_lengths_2batch, &d_attn_layer_weight, input_only_depth, batch2Size, cell_hidden_size, inputMaxStep4, memory_max_step, memory_depth, am_attn_size, aw_attn_size, &d_B_data, nullptr, nullptr, nullptr, &s_seq_lengths_2batch, diff --git a/onnxruntime/test/framework/allocation_planner_test.cc b/onnxruntime/test/framework/allocation_planner_test.cc index bf15a9d35b56a..26e40b25930c8 100644 --- a/onnxruntime/test/framework/allocation_planner_test.cc +++ b/onnxruntime/test/framework/allocation_planner_test.cc @@ -1288,7 +1288,7 @@ TEST_F(PlannerTest, MultiStream) { CreatePlan({}, false); - EXPECT_EQ(GetState().GetExecutionPlan()->execution_plan.size(), 2) << "2 logic streams for CPU and CUDA seperately"; + EXPECT_EQ(GetState().GetExecutionPlan()->execution_plan.size(), 2) << "2 logic streams for CPU and CUDA separately"; EXPECT_EQ(GetState().GetExecutionPlan()->execution_plan[0]->steps_.size(), 6) << "CPU stream has 6 steps"; EXPECT_NE(strstr(typeid(*GetState().GetExecutionPlan()->execution_plan[0]->steps_[0]).name(), "LaunchKernelStep"), nullptr) << "0th step: LaunchKernelStep for node 1"; EXPECT_NE(strstr(typeid(*GetState().GetExecutionPlan()->execution_plan[0]->steps_[1]).name(), "LaunchKernelStep"), nullptr) << "1st step: LaunchKernelStep for node 2"; diff --git a/onnxruntime/test/framework/inference_session_test.cc b/onnxruntime/test/framework/inference_session_test.cc index 84389c1d9711c..8b230db351edc 100644 --- a/onnxruntime/test/framework/inference_session_test.cc +++ b/onnxruntime/test/framework/inference_session_test.cc @@ -1400,7 +1400,7 @@ TEST(ExecutionProviderTest, OpKernelInfoCanReadConfigOptions) { so.session_logid = "ExecutionProviderTest.OpKernelInfoCanReadConfigOptions"; // add a config key that if read causes the Fuse op kernel to throw in 
the ctor. this is just to test the value is passed - // through in the simplest way, as the kernel is constructed in InferenceSession::Intialize so we don't need to + // through in the simplest way, as the kernel is constructed in InferenceSession::Initialize so we don't need to // actually run the model. ASSERT_STATUS_OK(so.config_options.AddConfigEntry("ThrowInKernelCtor", "1")); diff --git a/onnxruntime/test/framework/tunable_op_test.cc b/onnxruntime/test/framework/tunable_op_test.cc index 6fe0754db40d3..53aa949647c77 100644 --- a/onnxruntime/test/framework/tunable_op_test.cc +++ b/onnxruntime/test/framework/tunable_op_test.cc @@ -668,7 +668,7 @@ TEST(TuningContext, TunableOpRespectTuningContext) { ASSERT_TRUE(status.IsOK()); ASSERT_EQ(last_run, "FastFull"); - // After TunableOp(...), the result entry is corretly written. + // After TunableOp(...), the result entry is correctly written. ASSERT_EQ(mgr.Lookup(op.Signature()).size(), 1u); ASSERT_EQ(mgr.Lookup(op.Signature(), params.Signature()), tuning::TunableVecAddSelectFast::kFastFullId); } diff --git a/onnxruntime/test/fuzzing/include/BetaDistribution.h b/onnxruntime/test/fuzzing/include/BetaDistribution.h index c5c59922d864c..40e42a598c85a 100644 --- a/onnxruntime/test/fuzzing/include/BetaDistribution.h +++ b/onnxruntime/test/fuzzing/include/BetaDistribution.h @@ -83,7 +83,7 @@ class BetaDistribution { calc_type highest_probability_temp = highest_probability; highest_probability = std::max({highest_probability_temp, distribution(sample)}); - // A new sample number with a higher probabilty has been found + // A new sample number with a higher probability has been found // if (highest_probability > highest_probability_temp) { likely_number = sample; @@ -137,7 +137,7 @@ class BetaDistribution { } } - // Generate the probabilty of having this number + // Generate the probability of having this number // inline calc_type distribution(calc_type randVar) { if (randVar > max() || randVar < min()) { diff --git a/onnxruntime/test/fuzzing/src/test.cpp b/onnxruntime/test/fuzzing/src/test.cpp index 0d51af6b6b0fa..490f7dd4d37a3 100644 --- a/onnxruntime/test/fuzzing/src/test.cpp +++ b/onnxruntime/test/fuzzing/src/test.cpp @@ -365,7 +365,7 @@ int main(int argc, char* argv[]) { std::ifstream ortModelStream(ort_model_file, std::ifstream::in | std::ifstream::binary); ortModelStream.read(model_data.data(), num_bytes); ortModelStream.close(); - // Currently mutations are generated by using XOR of a byte with the preceeding byte at a time. + // Currently mutations are generated by using XOR of a byte with the preceding byte at a time. // Other possible ways may be considered in future, for example swaping two bytes randomly at a time. Logger::testLog << "Starting Test" << Logger::endl; for (size_t& i = run_stats.iteration; i < num_bytes - 1; i++) { diff --git a/onnxruntime/test/ir/graph_test.cc b/onnxruntime/test/ir/graph_test.cc index 5fc036790b765..f6b7bdb1a001c 100644 --- a/onnxruntime/test/ir/graph_test.cc +++ b/onnxruntime/test/ir/graph_test.cc @@ -464,7 +464,7 @@ TEST_F(GraphTest, LocalCustomRegistry) { // Tests the case where function op and function body ops belong to different domains. // Tests that such a model can be loaded successfully, function body initialization is -// successful and domain and verison mapping for each node is successful (by verifying +// successful and domain and version mapping for each node is successful (by verifying // op schema for each of the function body nodes can be found). 
TEST_F(GraphTest, FunctionOpsetImportTest) { std::shared_ptr model; @@ -481,7 +481,7 @@ TEST_F(GraphTest, FunctionOpsetImportTest) { // phase .i.e. Init function body only if none of EPs have a kernel matching the function op // then this check will not hold true and should be removed. - // We delay the funciton instantiate untill partition the graph + // We delay the function instantiate until partition the graph // this check is no longer valid anymore. /*ASSERT_TRUE(!schema->HasFunction() && !schema->HasContextDependentFunction());*/ continue; diff --git a/onnxruntime/test/ir/schema_registry_manager_test.cc b/onnxruntime/test/ir/schema_registry_manager_test.cc index 704c84343173a..52c286d187e53 100644 --- a/onnxruntime/test/ir/schema_registry_manager_test.cc +++ b/onnxruntime/test/ir/schema_registry_manager_test.cc @@ -89,7 +89,7 @@ TEST(SchemaRegistryManager, OpsetRegTest) { // registry2 has:(op1,domain1,version2) ASSERT_TRUE(registry2->GetSchema("Op1", 1, "Domain1") == nullptr); ASSERT_TRUE(registry2->GetSchema("Op1", 2, "Domain1") != nullptr); - // Fail because this registery doesn't have the information of opset3 + // Fail because this registry doesn't have the information of opset3 ASSERT_TRUE(registry2->GetSchema("Op1", 3, "Domain1") == nullptr); std::shared_ptr registry3 = std::make_shared(); @@ -126,7 +126,7 @@ TEST(SchemaRegistryManager, OpsetRegTest) { // Note that "Op5" has SinceVersion equal to 1, but a V1 operator set was already registered // without this operator. This would normally be invalid, and the registry with the missing // operator could trigger the operator lookup to fail. Version 1 is a special case to allow - // for experimental operators, and is accomplished by not reducing the targetted version to + // for experimental operators, and is accomplished by not reducing the targeted version to // zero in OnnxRuntimeOpSchemaRegistry::GetSchemaAndHistory. // TODO - Consider making the registration algorithm robust to this invalid usage in general ASSERT_TRUE(manager.GetSchema("Op5", 5, "Domain1")->since_version() == 1); diff --git a/onnxruntime/test/mlas/unittest/test_fgemm_fixture.h b/onnxruntime/test/mlas/unittest/test_fgemm_fixture.h index 05c6a0098eecb..53b3edafdf84f 100644 --- a/onnxruntime/test/mlas/unittest/test_fgemm_fixture.h +++ b/onnxruntime/test/mlas/unittest/test_fgemm_fixture.h @@ -9,7 +9,7 @@ #include // -// Short Execute() test helper to register each test seperately by all parameters. +// Short Execute() test helper to register each test separately by all parameters. // template class FgemmShortExecuteTest : public MlasTestFixture> { diff --git a/onnxruntime/test/mlas/unittest/test_halfgemm.cpp b/onnxruntime/test/mlas/unittest/test_halfgemm.cpp index 2a478675d09eb..aafdcc14c0028 100644 --- a/onnxruntime/test/mlas/unittest/test_halfgemm.cpp +++ b/onnxruntime/test/mlas/unittest/test_halfgemm.cpp @@ -17,7 +17,7 @@ Module Name: #include "test_halfgemm.h" // -// Short Execute() test helper to register each test seperately by all parameters. +// Short Execute() test helper to register each test separately by all parameters. 
// template class HalfGemmShortExecuteTest : public MlasTestFixture> { diff --git a/onnxruntime/test/mlas/unittest/test_pool2d_fixture.h b/onnxruntime/test/mlas/unittest/test_pool2d_fixture.h index 2ede8c3f0ab11..cb748bbaccce0 100644 --- a/onnxruntime/test/mlas/unittest/test_pool2d_fixture.h +++ b/onnxruntime/test/mlas/unittest/test_pool2d_fixture.h @@ -7,7 +7,7 @@ #include "test_pool2d.h" // -// Short Execute() test helper to register each test seperately by all parameters. +// Short Execute() test helper to register each test separately by all parameters. // template class Pooling2dShortExecuteTest : public MlasTestFixture { diff --git a/onnxruntime/test/mlas/unittest/test_pool3d_fixture.h b/onnxruntime/test/mlas/unittest/test_pool3d_fixture.h index 00f95bb00b9ae..e3d2aebc39cec 100644 --- a/onnxruntime/test/mlas/unittest/test_pool3d_fixture.h +++ b/onnxruntime/test/mlas/unittest/test_pool3d_fixture.h @@ -7,7 +7,7 @@ #include "test_pool3d.h" // -// Short Execute() test helper to register each test seperately by all parameters. +// Short Execute() test helper to register each test separately by all parameters. // template class Pooling3dShortExecuteTest : public MlasTestFixture> { diff --git a/onnxruntime/test/mlas/unittest/test_qgemm_fixture.h b/onnxruntime/test/mlas/unittest/test_qgemm_fixture.h index b2657fbde9afa..40f688a16ecca 100644 --- a/onnxruntime/test/mlas/unittest/test_qgemm_fixture.h +++ b/onnxruntime/test/mlas/unittest/test_qgemm_fixture.h @@ -7,7 +7,7 @@ #include "test_qgemm.h" // -// Short Execute() test helper to register each test seperately by all parameters. +// Short Execute() test helper to register each test separately by all parameters. // template class QgemmShortExecuteTest; diff --git a/onnxruntime/test/mlas/unittest/test_sbgemm.cpp b/onnxruntime/test/mlas/unittest/test_sbgemm.cpp index 941de8f05061f..f85fe97776dc1 100644 --- a/onnxruntime/test/mlas/unittest/test_sbgemm.cpp +++ b/onnxruntime/test/mlas/unittest/test_sbgemm.cpp @@ -20,7 +20,7 @@ Module Name: #include "test_sbgemm.h" // -// Short Execute() test helper to register each test seperately by all parameters. +// Short Execute() test helper to register each test separately by all parameters. // template class SBGemmShortExecuteTest : public MlasTestFixture> { @@ -76,7 +76,7 @@ class SBGemmShortExecuteTest : public MlasTestFixture class SymmQgemmShortExecuteTest; diff --git a/onnxruntime/test/optimizer/transpose_optimizer_test.cc b/onnxruntime/test/optimizer/transpose_optimizer_test.cc index ea2823916798e..5ecbf4967b044 100644 --- a/onnxruntime/test/optimizer/transpose_optimizer_test.cc +++ b/onnxruntime/test/optimizer/transpose_optimizer_test.cc @@ -4883,7 +4883,7 @@ static void CheckSharedInitializerHandling(bool broadcast) { // test we re-use a modified shared initializer wherever possible. model has one initializer that is used by 3 DQ nodes // and one initializer that is used by 2 Add nodes. both cases should be handled with the initializer being -// modified in-place for the first usage, and the Transpose added to the second usage being cancelled out when the +// modified in-place for the first usage, and the Transpose added to the second usage being canceled out when the // original Transpose at the start of the model is pushed down. 
TEST(TransposeOptimizerTests, SharedInitializerHandling) { CheckSharedInitializerHandling(/*broadcast*/ false); @@ -4899,7 +4899,7 @@ TEST(TransposeOptimizerTests, SharedInitializerHandlingBroadcast) { } // Unit test where EstimateTransposeValueCost must look past a DQ -> Squeeze to see the Transponse of a shared -// initializer for the overall cost of pushing the Transpose throught the second Where to be negative. +// initializer for the overall cost of pushing the Transpose through the second Where to be negative. TEST(TransposeOptimizerTests, SharedInitializerHandlingBroadcast2) { auto model_uri = ORT_TSTR("testdata/transpose_optimizer_shared_initializers_broadcast2.onnx"); diff --git a/onnxruntime/test/perftest/ReadMe.txt b/onnxruntime/test/perftest/ReadMe.txt index 4142beefbd034..9c0dbf5d673e7 100644 --- a/onnxruntime/test/perftest/ReadMe.txt +++ b/onnxruntime/test/perftest/ReadMe.txt @@ -10,7 +10,7 @@ Options: -h: help Model path and input data dependency: - Performance test uses the same input structure as onnx_test_runner. It requrires the direcotry trees as below: + Performance test uses the same input structure as onnx_test_runner. It requrires the directory trees as below: --ModelName --test_data_set_0 diff --git a/onnxruntime/test/perftest/ort_test_session.cc b/onnxruntime/test/perftest/ort_test_session.cc index 92d732fba2a0a..0e4f0d0cad3f4 100644 --- a/onnxruntime/test/perftest/ort_test_session.cc +++ b/onnxruntime/test/perftest/ort_test_session.cc @@ -941,7 +941,7 @@ bool OnnxRuntimeTestSession::PopulateGeneratedInputTestData(int32_t seed) { auto tensor_info = type_info.GetTensorTypeAndShapeInfo(); std::vector input_node_dim = tensor_info.GetShape(); - // free dimensions are treated as 1 if not overriden + // free dimensions are treated as 1 if not overridden for (int64_t& dim : input_node_dim) { if (dim == -1) { dim = 1; diff --git a/onnxruntime/test/platform/android/cxa_demangle_test.cc b/onnxruntime/test/platform/android/cxa_demangle_test.cc index 47f149c4d3a22..dbb050ce623f4 100644 --- a/onnxruntime/test/platform/android/cxa_demangle_test.cc +++ b/onnxruntime/test/platform/android/cxa_demangle_test.cc @@ -27,7 +27,7 @@ TEST(DummyCxaDemangleTest, Alloc) { ASSERT_STREQ(output_buffer, input); std::free(output_buffer); - // verify status can be omited + // verify status can be omitted char* output_buffer2 = __cxa_demangle(input, nullptr, nullptr, nullptr); ASSERT_STREQ(output_buffer2, input); std::free(output_buffer2); diff --git a/onnxruntime/test/providers/cpu/controlflow/scan_test.cc b/onnxruntime/test/providers/cpu/controlflow/scan_test.cc index e5f3956438b7a..6bf2fc63ab165 100644 --- a/onnxruntime/test/providers/cpu/controlflow/scan_test.cc +++ b/onnxruntime/test/providers/cpu/controlflow/scan_test.cc @@ -155,7 +155,7 @@ static common::Status CreateSubgraph(Graph& graph, RunOptions& options, const st graph.AddNode("add", "Add", "Add 1 to the loop state", inputs, outputs); } - // subgraph with multiple inputs and outputs to test variadic behaviour. + // subgraph with multiple inputs and outputs to test variadic behavior. 
// 2 inputs of 2 that are concatenated and then split into 4 outputs of 1 // Concat node diff --git a/onnxruntime/test/providers/cpu/rnn/deep_cpu_lstm_op_test.cc b/onnxruntime/test/providers/cpu/rnn/deep_cpu_lstm_op_test.cc index e73a1b492cc05..3b7e93b8f7668 100644 --- a/onnxruntime/test/providers/cpu/rnn/deep_cpu_lstm_op_test.cc +++ b/onnxruntime/test/providers/cpu/rnn/deep_cpu_lstm_op_test.cc @@ -284,7 +284,7 @@ TEST(LSTMTest, MixedSequenceLengths) { } // we don't have numpy output for this, but by testing twice and swapping which batch is smaller - // we can largely verify the behaviour by comparing to ForwardSimpleWeightsNoBiasTwoRows output. + // we can largely verify the behavior by comparing to ForwardSimpleWeightsNoBiasTwoRows output. std::vector seq_lengths{1, 2}; std::vector Y_data{ @@ -333,7 +333,7 @@ TEST(LSTMTest, MixedSequenceLengthsReverse) { } // we don't have numpy output for this, but by testing twice and swapping which batch is smaller - // we can largely verify the behaviour by comparing to ReverseSimpleWeightsNoBiasTwoRows output. + // we can largely verify the behavior by comparing to ReverseSimpleWeightsNoBiasTwoRows output. std::vector seq_lengths{1, 2}; std::vector Y_data{ diff --git a/onnxruntime/test/providers/cuda/test_cases/allocator_cuda_test.cc b/onnxruntime/test/providers/cuda/test_cases/allocator_cuda_test.cc index 27a0696acb599..b413d04fe81e8 100644 --- a/onnxruntime/test/providers/cuda/test_cases/allocator_cuda_test.cc +++ b/onnxruntime/test/providers/cuda/test_cases/allocator_cuda_test.cc @@ -14,7 +14,7 @@ namespace test { TEST(AllocatorTest, CUDAAllocatorTest) { OrtDevice::DeviceId cuda_device_id = 0; - // ensure CUDA device is avaliable. + // ensure CUDA device is available. CUDA_CALL_THROW(cudaSetDevice(cuda_device_id)); AllocatorCreationInfo default_memory_info( diff --git a/onnxruntime/test/providers/dnnl/transformer/matmul_post_op_transform_test.cc b/onnxruntime/test/providers/dnnl/transformer/matmul_post_op_transform_test.cc index a13fa91366aaf..1274efedbeb61 100644 --- a/onnxruntime/test/providers/dnnl/transformer/matmul_post_op_transform_test.cc +++ b/onnxruntime/test/providers/dnnl/transformer/matmul_post_op_transform_test.cc @@ -14,10 +14,10 @@ * The tests validate that if a fusion occures the expected output matches * the output of each graph if they had not be done separatly. * - * Unfortantly there is no hook to actually check that the fussion occured + * Unfortantly there is no hook to actually check that the fussion occurred * other than inspecting debug logs. * - * The 8 tests use patterns that we have seen in actual models durring testing. + * The 8 tests use patterns that we have seen in actual models during testing. * Other tests validate that non-associative ops work as expected. We are able * to fuse the output of matmul divided by another value but we can not fuse * the a value divided by the output of matmul. Similar with Subtraction. @@ -673,7 +673,7 @@ TEST(DnnlMatMulFusion, matmul_div_sub_1) { // in the matmul post op fusion to check that the 32 post op // limit is not exceded. 
// to do this we just run the matmul->[add->mul->sub-div] 9 times -// input params are shared accross multiple ops +// input params are shared across multiple ops class Dnnl_matmul_36_post_ops_PostOpTester : public OpTester { public: explicit Dnnl_matmul_36_post_ops_PostOpTester(int opset_version = 7) diff --git a/onnxruntime/test/providers/internal_testing/internal_testing_partitioning_tests.cc b/onnxruntime/test/providers/internal_testing/internal_testing_partitioning_tests.cc index 8cf7efe14b1c9..d58db5178032d 100644 --- a/onnxruntime/test/providers/internal_testing/internal_testing_partitioning_tests.cc +++ b/onnxruntime/test/providers/internal_testing/internal_testing_partitioning_tests.cc @@ -83,7 +83,7 @@ TEST(InternalTestingEP, TestSortResultsInSinglePartition) { } // mode has Resize op with optional input roi which is just a placeholder. -// partition funtion should skip the placeholder inputs. +// partition function should skip the placeholder inputs. TEST(InternalTestingEP, TestResizeWithOptionalInput) { // Resize op has optional input roi which is just a placeholder const ORTCHAR_T* model_path = ORT_TSTR("testdata/model_resize_empty_optional_input.onnx"); diff --git a/onnxruntime/test/providers/qnn/qnn_ep_context_test.cc b/onnxruntime/test/providers/qnn/qnn_ep_context_test.cc index 012845f5eb161..a3768cb98f584 100644 --- a/onnxruntime/test/providers/qnn/qnn_ep_context_test.cc +++ b/onnxruntime/test/providers/qnn/qnn_ep_context_test.cc @@ -654,7 +654,7 @@ TEST_F(QnnHTPBackendTests, QnnContextBinary2InputsTest) { // Context binary only contains a single QNN graph, generated context cache model (detached mode) only has 1 EPContext node // Create another Onnx model which also reference to the bin file, // but the node name is not same with the QNN graph name inside the bin file. -// This is to support backward compitable for the models generated before the PR that +// This is to support backward compatible for the models generated before the PR that // make context generation support multi-partition TEST_F(QnnHTPBackendTests, QnnContextBinaryCache_SingleNodeNameNotMatchGraphNameInCtx) { ProviderOptions provider_options; @@ -732,7 +732,7 @@ TEST_F(QnnHTPBackendTests, QnnContextBinaryCache_SingleNodeNameNotMatchGraphName ASSERT_EQ(std::remove(context_bin.string().c_str()), 0); } -// Model has 2 EPContext nodes, both with main_context=1 and embeded context binary +// Model has 2 EPContext nodes, both with main_context=1 and embedded context binary TEST_F(QnnHTPBackendTests, QnnMultiContextEmbeded) { ProviderOptions provider_options; #if defined(_WIN32) diff --git a/onnxruntime/test/python/onnxruntime_test_python.py b/onnxruntime/test/python/onnxruntime_test_python.py index 892e7de8bb6ed..32eac6f7638c1 100644 --- a/onnxruntime/test/python/onnxruntime_test_python.py +++ b/onnxruntime/test/python/onnxruntime_test_python.py @@ -1783,7 +1783,7 @@ def test_multiple_devices(self): return # https://github.com/microsoft/onnxruntime/issues/18432. 
Make sure device Id is properly set - # Scenario 1, 3 sessions created with differnt device Id under IOBinding + # Scenario 1, 3 sessions created with different device Id under IOBinding sessions = [] for i in range(3): sessions.append( diff --git a/onnxruntime/test/python/onnxruntime_test_python_mlops.py b/onnxruntime/test/python/onnxruntime_test_python_mlops.py index 6cdf820c8a0e9..8b6b029c57752 100644 --- a/onnxruntime/test/python/onnxruntime_test_python_mlops.py +++ b/onnxruntime/test/python/onnxruntime_test_python_mlops.py @@ -173,7 +173,7 @@ def test_run_model_mlnet(self): # In memory, the size of each element is fixed and equal to the # longest element. We cannot use bytes because numpy is trimming # every final 0 for strings and bytes before creating the array - # (to save space). It does not have this behaviour for void + # (to save space). It does not have this behavior for void # but as a result, numpy does not know anymore the size # of each element, they all have the same size. c1 = np.array([b"A\0A\0\0", b"B\0B\0\0", b"C\0C\0\0"], np.void).reshape(1, 3) diff --git a/onnxruntime/test/python/onnxruntime_test_python_sparse_matmul.py b/onnxruntime/test/python/onnxruntime_test_python_sparse_matmul.py index 22a09ef565d59..fe64aac54951b 100644 --- a/onnxruntime/test/python/onnxruntime_test_python_sparse_matmul.py +++ b/onnxruntime/test/python/onnxruntime_test_python_sparse_matmul.py @@ -54,7 +54,7 @@ def test_run_sparse_output_only(self): def test_run_contrib_sparse_mat_mul(self): """ - Mutliple sparse COO tensor to dense + Multiple sparse COO tensor to dense """ common_shape = [9, 9] # inputs and oputputs same shape A_values = np.array( # noqa: N806 diff --git a/onnxruntime/test/python/quantization/test_quantize_static_resnet.py b/onnxruntime/test/python/quantization/test_quantize_static_resnet.py index 1efa283af6881..d105f647c813b 100644 --- a/onnxruntime/test/python/quantization/test_quantize_static_resnet.py +++ b/onnxruntime/test/python/quantization/test_quantize_static_resnet.py @@ -87,7 +87,7 @@ def test_quantize_static_resnet(self): # * uint8([128, 128, ..., 127, ...]) if per_channel is True # QLinearConv : zero point of per-channel filter must be same. # That's why the quantization forces a symmetric quantization into INT8. - # zero_point is guaranted to be zero whatever the channel is. + # zero_point is guaranteed to be zero whatever the channel is. with open(qdq_file, "rb") as f: onx = onnx.load(f) diff --git a/onnxruntime/test/python/transformers/test_generation.py b/onnxruntime/test/python/transformers/test_generation.py index 33ec1bd7728fe..88f870e92d558 100644 --- a/onnxruntime/test/python/transformers/test_generation.py +++ b/onnxruntime/test/python/transformers/test_generation.py @@ -47,7 +47,7 @@ def setUp(self): "Test best way to invest", # "The AI community building the future", # "The selloff in tech shares deepened", - # "Abortion rights take centre stage", + # "Abortion rights take center stage", ] self.enable_cuda = torch.cuda.is_available() and "CUDAExecutionProvider" in get_available_providers() self.remove_onnx_files() diff --git a/onnxruntime/test/shared_lib/test_inference.cc b/onnxruntime/test/shared_lib/test_inference.cc index eacd41e6b9c6d..52491a179c2ce 100644 --- a/onnxruntime/test/shared_lib/test_inference.cc +++ b/onnxruntime/test/shared_lib/test_inference.cc @@ -1620,7 +1620,7 @@ TEST(CApiTest, test_custom_op_openvino_wrapper_library) { // It has memory leak. 
The OrtCustomOpDomain created in custom_op_library.cc:RegisterCustomOps function was not freed #if defined(__ANDROID__) TEST(CApiTest, DISABLED_test_custom_op_library) { -// To accomodate a reduced op build pipeline +// To accommodate a reduced op build pipeline #elif defined(REDUCED_OPS_BUILD) && defined(USE_CUDA) TEST(CApiTest, DISABLED_test_custom_op_library) { #else @@ -1674,7 +1674,7 @@ TestInference(*ort_env, CUSTOM_OP_LIBRARY_TEST_MODEL_URI, inputs, "outp // Has memory leak #if defined(__ANDROID__) || defined(ABSL_HAVE_ADDRESS_SANITIZER) TEST(CApiTest, DISABLED_test_custom_op_shape_infer_attr) { -// To accomodate a reduced op build pipeline +// To accommodate a reduced op build pipeline #elif defined(REDUCED_OPS_BUILD) && defined(USE_CUDA) TEST(CApiTest, DISABLED_test_custom_op_shape_infer_attr) { #else @@ -1705,7 +1705,7 @@ TEST(CApiTest, test_custom_op_shape_infer_attr) { // It has memory leak. The OrtCustomOpDomain created in custom_op_library.cc:RegisterCustomOps function was not freed #if defined(__ANDROID__) TEST(CApiTest, test_custom_op_library_copy_variadic) { -// To accomodate a reduced op build pipeline +// To accommodate a reduced op build pipeline #elif defined(REDUCED_OPS_BUILD) && defined(USE_CUDA) TEST(CApiTest, test_custom_op_library_copy_variadic) { #else diff --git a/onnxruntime/test/testdata/transform/model_creation_for_testing.ipynb b/onnxruntime/test/testdata/transform/model_creation_for_testing.ipynb index f8af2d8a9f6e8..e6118e3b53b1d 100644 --- a/onnxruntime/test/testdata/transform/model_creation_for_testing.ipynb +++ b/onnxruntime/test/testdata/transform/model_creation_for_testing.ipynb @@ -309,7 +309,7 @@ " helper.make_node('Slice', ['E', 'startsE', 'endsE', 'axesE', 'stepsE'], ['F']),\n", " # Will be removed.\n", " helper.make_node('Slice', ['F', 'startsF', 'endsF', 'axesF'], ['G']),\n", - " # Will not be removed because of endsG appearing in graph inputs (can be overriden).\n", + " # Will not be removed because of endsG appearing in graph inputs (can be overridden).\n", " helper.make_node('Slice', ['G', 'startsG', 'endsG'], ['H']),\n", " helper.make_node('Max', ['H'], ['I']),\n", " # Will not be removed because node output participates in graph output.\n", diff --git a/onnxruntime/test/testdata/transform/model_parallel/self_attention_megatron_basic_test.py b/onnxruntime/test/testdata/transform/model_parallel/self_attention_megatron_basic_test.py index c57024538f5b2..306ad7d37403a 100644 --- a/onnxruntime/test/testdata/transform/model_parallel/self_attention_megatron_basic_test.py +++ b/onnxruntime/test/testdata/transform/model_parallel/self_attention_megatron_basic_test.py @@ -7,7 +7,7 @@ hidden_per_attention = 2 # Self-attention. -# Handle self-attension. +# Handle self-attention. # MatMul->Add->Split->Reshape->Transpose->MatMul->Div->Mul->Sub->Softmax->Dropout->MatMul->Transpose->Reshape->MatMul->Add # |->Reshape->Transpose->| | # |->Reshape->Transpose------------------------------------------>| diff --git a/onnxruntime/test/testdata/transpose_optimizer_shared_initializers.py b/onnxruntime/test/testdata/transpose_optimizer_shared_initializers.py index d710c796fb0ad..293c5aafe7f0c 100644 --- a/onnxruntime/test/testdata/transpose_optimizer_shared_initializers.py +++ b/onnxruntime/test/testdata/transpose_optimizer_shared_initializers.py @@ -59,7 +59,7 @@ def create_model_with_Where(): # noqa 'Where' is the operator name initializer and other usage. We need to use Where as we require more than 2 inputs. 
The `condition` input will be having a Transpose pushed through it will have a negative cost. The `X` input will have a positive cost which cancels out the negative value. - The `Y` input will be a shared initializer that is braodcast. If we don't find the Transpose to make the cost of it + The `Y` input will be a shared initializer that is broadcast. If we don't find the Transpose to make the cost of it negative we will not push the Transpose though. If we only have 2 inputs, the broadcast initializer will always cost less due to its smaller rank, meaning we don't diff --git a/onnxruntime/wasm/api.h b/onnxruntime/wasm/api.h index 2cd1515d191c8..0730559c4375b 100644 --- a/onnxruntime/wasm/api.h +++ b/onnxruntime/wasm/api.h @@ -3,7 +3,7 @@ // NOTE: This file contains declarations of exported functions as WebAssembly API. // Unlike a normal C-API, the purpose of this API is to make emcc to generate correct exports for the WebAssembly. The -// macro "EMSCRIPTEN_KEEPALIVE" helps the compiler to mark the function as an exported funtion of the WebAssembly +// macro "EMSCRIPTEN_KEEPALIVE" helps the compiler to mark the function as an exported function of the WebAssembly // module. Users are expected to consume those functions from JavaScript side. #pragma once diff --git a/orttraining/orttraining/core/framework/adasum/adasum_interface.h b/orttraining/orttraining/core/framework/adasum/adasum_interface.h index e872da78fdcf5..d7dc62336421c 100644 --- a/orttraining/orttraining/core/framework/adasum/adasum_interface.h +++ b/orttraining/orttraining/core/framework/adasum/adasum_interface.h @@ -138,7 +138,7 @@ class AdasumInterface { // first n-1 levels are skipped. This is useful when the // communication inside the node is implemented using another // reduce-scatter algorithm, e.g. the one in NCCL, which may be - // desireable on some hardware configurations. When + // desirable on some hardware configurations. When // start_level>1, tensor_counts must be set according to the // slices owned by this rank. // communicator: the communicator to reduce with. diff --git a/orttraining/orttraining/core/framework/ortmodule_graph_builder.cc b/orttraining/orttraining/core/framework/ortmodule_graph_builder.cc index e01456ee3d769..593a8be399bd6 100644 --- a/orttraining/orttraining/core/framework/ortmodule_graph_builder.cc +++ b/orttraining/orttraining/core/framework/ortmodule_graph_builder.cc @@ -223,7 +223,7 @@ void OrtModuleGraphBuilder::GetFrontierTensors() { for (const auto& param : graph_info_.initializer_names_to_train) { std::vector consumer_nodes = graph.GetConsumerNodes(param); // Initial support is limited to caching Cast output. This can - // be extended to accomodate more ops whose result depends only + // be extended to accommodate more ops whose result depends only // on the weight tensor which is a WIP. for (const Node* node : consumer_nodes) { if (node != nullptr && node->OpType() == "Cast") { diff --git a/orttraining/orttraining/core/framework/pipeline.cc b/orttraining/orttraining/core/framework/pipeline.cc index 3b0a63bb2a71a..3614637ca0987 100644 --- a/orttraining/orttraining/core/framework/pipeline.cc +++ b/orttraining/orttraining/core/framework/pipeline.cc @@ -193,7 +193,7 @@ std::vector PipelineScheduler::FindForwardComputeTime(const std::vector 0 && t <= forward_time.at(s - 1)) { - // Foward of the s-th stage must happen after forward of (s-1)-th stage. + // Forward of the s-th stage must happen after forward of (s-1)-th stage. // Note that forward_time[s] is the time slot of the s-th stage. 
continue; } diff --git a/orttraining/orttraining/core/graph/mixed_precision_transformer.cc b/orttraining/orttraining/core/graph/mixed_precision_transformer.cc index 1bed983cde64d..a4143e7c817fd 100644 --- a/orttraining/orttraining/core/graph/mixed_precision_transformer.cc +++ b/orttraining/orttraining/core/graph/mixed_precision_transformer.cc @@ -46,7 +46,7 @@ static const std::unordered_map> stage1_fp32_node_ }; // Currently the list here is same as stage1 above due to empty FP32_Nodes. -// It's possibile we will have more FP32 nodes added, this map will also be extended. +// It's possible we will have more FP32 nodes added, this map will also be extended. static const std::unordered_map> stage2_fp32_node_args = { {"Dropout", {1}}, {"DropoutGrad", {2}}, diff --git a/orttraining/orttraining/core/graph/optimizer_graph_builder.h b/orttraining/orttraining/core/graph/optimizer_graph_builder.h index b79bde28c0d9c..d33902379cb5e 100644 --- a/orttraining/orttraining/core/graph/optimizer_graph_builder.h +++ b/orttraining/orttraining/core/graph/optimizer_graph_builder.h @@ -125,7 +125,7 @@ class OptimizerGraphBuilder { GraphAugmenter::GraphDefs& graph_defs, std::unordered_map>& weight_to_opt_mapping); - // This function can be overriden by child classes to have different logic + // This function can be overridden by child classes to have different logic // for building optimizers. virtual Status BuildOptimizerNode( const std::unique_ptr& opt_builder, diff --git a/orttraining/orttraining/core/graph/pipeline_transformer.cc b/orttraining/orttraining/core/graph/pipeline_transformer.cc index a58cca0acd014..f989d53aa85d5 100644 --- a/orttraining/orttraining/core/graph/pipeline_transformer.cc +++ b/orttraining/orttraining/core/graph/pipeline_transformer.cc @@ -446,7 +446,7 @@ void FindPipelineLandmarks( // // The input graph is a pipeline's stage, which contains some Send's and Recv's. // -// For diferent pipeline stages, they have different communication patterns as +// For different pipeline stages, they have different communication patterns as // shown below. // // 1. First stage: @@ -1615,7 +1615,7 @@ Status ApplyPipelinePartitionToMainGraph(Graph& graph, send_nodes, recv_nodes, stage_to_rank)); - // Take care of weights that are shared accross stages. + // Take care of weights that are shared across stages. 
ORT_RETURN_IF_ERROR(HandleSharedInitializer(graph, send_nodes, recv_nodes)); std::set visited_outputs; diff --git a/orttraining/orttraining/core/graph/training_op_defs.cc b/orttraining/orttraining/core/graph/training_op_defs.cc index 20122d378a246..2a8d2de982e79 100644 --- a/orttraining/orttraining/core/graph/training_op_defs.cc +++ b/orttraining/orttraining/core/graph/training_op_defs.cc @@ -1737,7 +1737,7 @@ void RegisterTrainingOpSchemas() { propagateShapeAndTypeFromFirstInput(ctx); }); - // TODO: Depreacate this schema when training support is udpated to opset-12 + // TODO: Depreacate this schema when training support is updated to opset-12 ONNX_CONTRIB_OPERATOR_SCHEMA(GatherND) .SetDomain(kOnnxDomain) .SinceVersion(1) @@ -1820,7 +1820,7 @@ Example 4: .Input(0, "shape", "The shape of source data input of GatherND.", "T1") .Input(1, "indices", "Tensor of rank q >= 1.", "Tind") .Input(2, "update", "The gradient of the output.", "T") - .Output(0, "output", "Tensor graident of the input.", "T") + .Output(0, "output", "Tensor gradient of the input.", "T") .TypeConstraint( "T", {"tensor(float16)", "tensor(float)", "tensor(double)", "tensor(bfloat16)"}, @@ -2493,7 +2493,7 @@ Example 4: .SetSupportLevel(OpSchema::SupportType::EXPERIMENTAL) .SetDoc( "Returns the reduction axes for computing gradients of s0 op s1 with broadcast." - "The ouput axes are deterministic from last to first. " + "The output axes are deterministic from last to first. " "Output is an empty vector when no reduction is necessary for the corresponding input.") .Input(0, "a_shape", "The 1st input shape as Tensor.", "T") .Input(1, "b_shape", "The 2nd input shape as Tensor.", "T") @@ -2530,7 +2530,7 @@ Example 4: ONNX_CONTRIB_OPERATOR_SCHEMA(GistBinarizeDecoder) .SetDomain(kMSDomain) .SinceVersion(1) - .Input(0, "X", "compresssed input", "T1") + .Input(0, "X", "compressed input", "T1") .Output(0, "Y", "uncompressed output", "T") .Attr("to", "The data type to which the elements of the input tensor are cast. " @@ -2568,7 +2568,7 @@ Example 4: ONNX_CONTRIB_OPERATOR_SCHEMA(GistPack1Decoder) .SetDomain(kMSDomain) .SinceVersion(1) - .Input(0, "X", "1 bit compresssed input", "T1") + .Input(0, "X", "1 bit compressed input", "T1") .Output(0, "Y", "uncompressed output", "T") .Attr("to", "The data type to which the elements of the input tensor are cast. " @@ -2606,7 +2606,7 @@ Example 4: ONNX_CONTRIB_OPERATOR_SCHEMA(GistPack8Decoder) .SetDomain(kMSDomain) .SinceVersion(1) - .Input(0, "X", "compresssed input", "T1") + .Input(0, "X", "compressed input", "T1") .Output(0, "Y", "uncompressed output", "T") .Attr("to", "The data type to which the elements of the input tensor are cast. " @@ -2682,7 +2682,7 @@ Example 4: ONNX_CONTRIB_OPERATOR_SCHEMA(GistPackMsfp15Decoder) .SetDomain(kMSDomain) .SinceVersion(1) - .Input(0, "X", "compresssed input", "T1") + .Input(0, "X", "compressed input", "T1") .Output(0, "Y", "uncompressed output", "T") .Attr("to", "The data type to which the elements of the input tensor are cast. " @@ -3191,7 +3191,7 @@ Return true if all elements are true and false otherwise. "Strictly must be one of the types from DataType enum in TensorProto", AttributeProto::INT) .Attr("fuse_outputs", - "If true, fuse all outputs into one continous buffer.", + "If true, fuse all outputs into one continuous buffer.", AttributeProto::INT, static_cast(0)) .TypeConstraint( @@ -3240,7 +3240,7 @@ Return true if all elements are true and false otherwise. 
.Input(1, "scale", "Scale scalar tensor.", "ScaleT") .Output(0, "output", "The scaled output tensor.", "T") .Attr("scale_down", - "If true, the output tensor is input tensor devided by scale, " + "If true, the output tensor is input tensor divided by scale, " "otherwise, it's input tensor multiplied by scale. " "The default value is false.", AttributeProto::INT, @@ -3636,7 +3636,7 @@ Return true if all elements are true and false otherwise. fail_shape_inference("RecordEvent must have at least (num_outputs + 1) inputs."); // note: if num_input > num_output + 1, - // the additional inputs (idx >= num_ouput + 1) are regarded as dependencies + // the additional inputs (idx >= num_output + 1) are regarded as dependencies // which are only used for maintain topological order for (size_t i = 0; i < ctx.getNumOutputs(); ++i) { propagateElemTypeFromInputToOutput(ctx, i + 1, i); @@ -3689,7 +3689,7 @@ Return true if all elements are true and false otherwise. fail_shape_inference("WaitEvent must have at least 1 output."); // note: if num_input > num_output + 1, - // the additional inputs (idx >= num_ouput + 1) are regarded as dependencies + // the additional inputs (idx >= num_output + 1) are regarded as dependencies // which are only used for maintain topological order for (size_t i = 0; i < ctx.getNumOutputs(); ++i) { propagateElemTypeFromInputToOutput(ctx, i + 1, i); diff --git a/orttraining/orttraining/core/optimizer/graph_transformer_config.h b/orttraining/orttraining/core/optimizer/graph_transformer_config.h index c496e36689de1..a2b44689f9ef0 100644 --- a/orttraining/orttraining/core/optimizer/graph_transformer_config.h +++ b/orttraining/orttraining/core/optimizer/graph_transformer_config.h @@ -17,7 +17,7 @@ struct TrainingGraphTransformerConfiguration : public GraphTransformerConfigurat bool attn_dropout_recompute{false}; // Enable recompute of Gelu activation output to save memory bool gelu_recompute{false}; - // Enable recompute of transformer layer ouput to save memory + // Enable recompute of transformer layer output to save memory bool transformer_layer_recompute{false}; // Number of layers to apply recompute int number_recompute_layers{0}; diff --git a/orttraining/orttraining/core/session/training_session.cc b/orttraining/orttraining/core/session/training_session.cc index 1bf08fa55ca88..87a7cbc0375a4 100644 --- a/orttraining/orttraining/core/session/training_session.cc +++ b/orttraining/orttraining/core/session/training_session.cc @@ -1425,7 +1425,7 @@ std::unordered_set TrainingSession::GetTrainableModelInitializers( #if defined(USE_CUDA) && defined(ORT_USE_NCCL) && defined(USE_NCCL_P2P) // Create NCCL's communication plan. In runtime, we will provide details such -// as pointer to sent/recieved data and the size of the data in byte. See how +// as pointer to sent/received data and the size of the data in byte. See how // Send and Recv call SubmitSendAndWait and SubmitRecvAndWait, respectively. void PipelineTrainingSession::LaunchNcclService(const int pipeline_stage_id) { ORT_ENFORCE(pipeline_stage_id >= 0, "Pipeline stage ID cannot be negative."); @@ -1444,7 +1444,7 @@ void PipelineTrainingSession::LaunchNcclService(const int pipeline_stage_id) { // In this time slot, stage "pipeline_stage_id" sendss data to "task.peer_rank". nccl_service.PlanSend(task.peer_rank); } else if (task.type == pipeline::PipelineTask::Type::Recv) { - // In this time slot, stage "pipeline_stage_id" recieves data from "task.peer_rank". 
+ // In this time slot, stage "pipeline_stage_id" receives data from "task.peer_rank". nccl_service.PlanRecv(task.peer_rank); } } diff --git a/orttraining/orttraining/models/bert/main.cc b/orttraining/orttraining/models/bert/main.cc index 33d0d0346a48a..22cdd9351a206 100644 --- a/orttraining/orttraining/models/bert/main.cc +++ b/orttraining/orttraining/models/bert/main.cc @@ -204,12 +204,14 @@ Status ParseArguments(int argc, char* argv[], BertParameters& params, OrtParamet ("data_parallel_size", "Data parallel group size.", cxxopts::value()->default_value("1")) ("horizontal_parallel_size", "Horizontal model parallel group size.", cxxopts::value()->default_value("1")) ("pipeline_parallel_size", "Number of pipeline stages.", cxxopts::value()->default_value("1")) - ("pipeline_stage_paths", "Specify the forward ONNX files for pipeline evaluation.", cxxopts::value>()->default_value("")) - ("cut_group_info", "Specify the cutting info for graph partition (pipeline only). An example of a cut_group_info of " - "size two is: 1393:407-1463/1585/1707,2369:407-2439/2561/2683. Here, the cut info is split by ',', with the first " - "cut_info equal to 1393:407-1463/1585/1707, and second cut_info equal to 2369:407-2439/2561/2683. Each CutEdge is " - "seperated by ':'. If consumer nodes need to be specified, specify them after producer node with a '-' delimiter and " - "separate each consumer node with a '/'. ", cxxopts::value>()->default_value("")) + ("pipeline_stage_paths", "Specify the forward ONNX files for pipeline evaluation.", + cxxopts::value>()->default_value("")) + ("cut_group_info", "Specify the cutting info for graph partition (pipeline only). An example of a cut_group_info " + "of size two is: 1393:407-1463/1585/1707,2369:407-2439/2561/2683. Here, the cut info is split by ',', with the " + "first cut_info equal to 1393:407-1463/1585/1707, and second cut_info equal to 2369:407-2439/2561/2683. Each " + "CutEdge is separated by ':'. If consumer nodes need to be specified, specify them after producer node with a " + "'-' delimiter and separate each consumer node with a '/'. ", + cxxopts::value>()->default_value("")) ("enable_grad_norm_clip", "Specify whether to enable gradient clipping for optimizers.", cxxopts::value()->default_value("true")) ("enable_gelu_approximation", "Specify whether to enable GELU approximation.", @@ -572,7 +574,7 @@ float GetLossValue(const Tensor& loss_tensor) { // use this table mapping to define what to be stored in mapped_dimensions, and ultimately in json structure // Be mindful on the position, if it's invalid or out of bound, the property population process will be -// either incorrect or aborted. Also make sure to substract the index position by 1 to get valid correspondent value +// either incorrect or aborted. 
Also make sure to subtract the index position by 1 to get valid correspondent value // namely, in the graph, sequence is at position 1, but in initial tensor shape vector loaded from training data is at position 0, // batch is not part of the initial tensor shape vector till later // see GetTensorDimensionsFromInputs() in training_util.h and training_runner.cc for more details diff --git a/orttraining/orttraining/models/mnist/main.cc b/orttraining/orttraining/models/mnist/main.cc index a2fc6909a86a6..8aaa6b1ebf7f2 100644 --- a/orttraining/orttraining/models/mnist/main.cc +++ b/orttraining/orttraining/models/mnist/main.cc @@ -51,7 +51,8 @@ Status ParseArguments(int argc, char* argv[], MnistParameters& params) { cxxopts::value()->default_value("mnist_data")) ("log_dir", "The directory to write tensorboard events.", cxxopts::value()->default_value("")) - ("use_profiler", "Collect runtime profile data during this training run.", cxxopts::value()->default_value("false")) + ("use_profiler", "Collect runtime profile data during this training run.", + cxxopts::value()->default_value("false")) ("use_gist", "Whether to use GIST encoding/decoding.") ("gist_op", "Opearator type(s) to which GIST is applied.", cxxopts::value()->default_value("0")) ("gist_compr", "Compression type used for GIST", cxxopts::value()->default_value("GistPack8")) @@ -66,11 +67,12 @@ Status ParseArguments(int argc, char* argv[], MnistParameters& params) { ("data_parallel_size", "Data parallel group size.", cxxopts::value()->default_value("1")) ("horizontal_parallel_size", "Horizontal model parallel group size.", cxxopts::value()->default_value("1")) ("pipeline_parallel_size", "Number of pipeline stages.", cxxopts::value()->default_value("1")) - ("cut_group_info", "Specify the cutting info for graph partition (pipeline only). An example of a cut_group_info of " - "size two is: 1393:407-1463/1585/1707,2369:407-2439/2561/2683. Here, the cut info is split by ',', with the first " - "cut_info equal to 1393:407-1463/1585/1707, and second cut_info equal to 2369:407-2439/2561/2683. Each CutEdge is " - "seperated by ':'. If consumer nodes need to be specified, specify them after producer node with a '-' delimiter and " - "separate each consumer node with a '/'. ", cxxopts::value>()->default_value("")) + ("cut_group_info", "Specify the cutting info for graph partition (pipeline only). An example of a cut_group_info " + "of size two is: 1393:407-1463/1585/1707,2369:407-2439/2561/2683. Here, the cut info is split by ',', with the " + "first cut_info equal to 1393:407-1463/1585/1707, and second cut_info equal to 2369:407-2439/2561/2683. Each " + "CutEdge is separated by ':'. If consumer nodes need to be specified, specify them after producer node with a " + "'-' delimiter and separate each consumer node with a '/'. 
", + cxxopts::value>()->default_value("")) ("evaluation_period", "How many training steps to make before making an evaluation.", cxxopts::value()->default_value("1")); // clang-format on @@ -301,7 +303,7 @@ int main(int argc, char* args[]) { } if (testData->NumSamples() == 0) { - printf("Warning: No data loaded - run cancelled.\n"); + printf("Warning: No data loaded - run canceled.\n"); return -1; } diff --git a/orttraining/orttraining/models/runner/training_runner.cc b/orttraining/orttraining/models/runner/training_runner.cc index 6421f7c81f7fb..dae6f613f4329 100644 --- a/orttraining/orttraining/models/runner/training_runner.cc +++ b/orttraining/orttraining/models/runner/training_runner.cc @@ -1188,7 +1188,7 @@ Status TrainingRunner::Evaluate(TrainingSession& session, IDataLoader& data_load fetch_names, &fetches)); - // Assume that user-specified fetches are avaliable only on the last pipeline stage. + // Assume that user-specified fetches are available only on the last pipeline stage. // When there is no pipeline, all pipeline_context_.pipeline_stage_id should be 0 and // params_.pipeline_parallel_size is 1. Thus, the following condition is always true if there // is no pipeline. diff --git a/orttraining/orttraining/python/training/onnxblock/blocks.py b/orttraining/orttraining/python/training/onnxblock/blocks.py index 80f07c3738a7e..ed68171cc6f9c 100644 --- a/orttraining/orttraining/python/training/onnxblock/blocks.py +++ b/orttraining/orttraining/python/training/onnxblock/blocks.py @@ -403,12 +403,12 @@ def __init__(self, like: str): def build(self, input_name: Optional[str] = None): cloned_input = None with contextlib.suppress(LookupError): - # Supress LookupError because we want to try to get the input from the output if it's not found in the inputs + # Suppress LookupError because we want to try to get the input from the output if it's not found in the inputs cloned_input = copy.deepcopy(_graph_utils.get_input_from_input_name(self.base, self._like)) if cloned_input is None: with contextlib.suppress(LookupError): - # Supress LookupError because we deal with the case where no input or output was found later. + # Suppress LookupError because we deal with the case where no input or output was found later. 
cloned_input = copy.deepcopy(_graph_utils.get_output_from_output_name(self.base, self._like)) if cloned_input is None: diff --git a/orttraining/orttraining/python/training/ortmodule/__init__.py b/orttraining/orttraining/python/training/ortmodule/__init__.py index 20e3493395b3d..4bc470c633437 100644 --- a/orttraining/orttraining/python/training/ortmodule/__init__.py +++ b/orttraining/orttraining/python/training/ortmodule/__init__.py @@ -194,7 +194,7 @@ def export_context(): ), ) -# Initalized ORT's random seed with pytorch's initial seed +# Initialized ORT's random seed with pytorch's initial seed # in case user has set pytorch seed before importing ORTModule set_seed(torch.initial_seed() % sys.maxsize) diff --git a/orttraining/orttraining/python/training/ortmodule/torch_cpp_extensions/cuda/fused_ops/fused_ops_frontend.cpp b/orttraining/orttraining/python/training/ortmodule/torch_cpp_extensions/cuda/fused_ops/fused_ops_frontend.cpp index 19ba6b17aba02..4e9db732b5385 100644 --- a/orttraining/orttraining/python/training/ortmodule/torch_cpp_extensions/cuda/fused_ops/fused_ops_frontend.cpp +++ b/orttraining/orttraining/python/training/ortmodule/torch_cpp_extensions/cuda/fused_ops/fused_ops_frontend.cpp @@ -10,7 +10,7 @@ const size_t EMIT_NUM = 4; -// This will avoid the copies when doing implict Python list <==> C++ std::vector<> conversion. +// This will avoid the copies when doing implicit Python list <==> C++ std::vector<> conversion. PYBIND11_MAKE_OPAQUE(std::vector); // This function is adapted from microsoft/DeepSpeed fused_adam_frontend.cpp @@ -150,7 +150,7 @@ void unscale_fp16_grads_into_fp32_grads(std::vector& all_fp16_params if (idx_to_fp32_from_fp16_params.size() > 0) { auto mem_buffer = MemoryBuffer(memory_buffer_size, idx_to_fp32_from_fp16_params.begin()->second); - const size_t emit_threshhold = memory_buffer_size / EMIT_NUM; + const size_t emit_threshold = memory_buffer_size / EMIT_NUM; size_t acc_size = 0; std::vector partial_new_fp32_grads; @@ -167,7 +167,7 @@ void unscale_fp16_grads_into_fp32_grads(std::vector& all_fp16_params partial_new_fp32_grads.emplace_back(idx_to_fp32_from_fp16_params[idx].grad()); partial_fp16_grads_needing_unscale.emplace_back(fp16_grads_needing_unscale[fp32_from_fp16_param_idx]); - if (acc_size > emit_threshhold || fp32_from_fp16_param_idx == idx_to_fp32_from_fp16_params.size() - 1) { + if (acc_size > emit_threshold || fp32_from_fp16_param_idx == idx_to_fp32_from_fp16_params.size() - 1) { if (partial_fp16_grads_needing_unscale.size() > 0) { std::vector> tensor_lists; tensor_lists.emplace_back(partial_fp16_grads_needing_unscale); diff --git a/orttraining/orttraining/test/distributed/partition_utils.h b/orttraining/orttraining/test/distributed/partition_utils.h index 1369b493655b6..c22d0a3eb2f93 100644 --- a/orttraining/orttraining/test/distributed/partition_utils.h +++ b/orttraining/orttraining/test/distributed/partition_utils.h @@ -338,7 +338,7 @@ common::Status SplitGraph(Graph& graph, // but nodeA, nodeB belong to parition0, nodeC belongs to parition1, and nodeD belongs to parition2. // This means we need to cut edge nodeA->nodeC for the first partition and nodeA->nodeD for the second partition. // - // During the first cut, we identify the edge nodeA->nodeC, for this edge, based on the origional node_arg, + // During the first cut, we identify the edge nodeA->nodeC, for this edge, based on the original node_arg, // we create a new node_arg, called updated_node_arg. 
The inserted send node will take the original node_arg // as input and the inserted recv node will take the updated_node_arg as the output. // And we update updated_node_args with updated_node_args[original_node_arg] = updated_node_arg @@ -414,7 +414,7 @@ common::Status SplitGraph(Graph& graph, auto producer_node = graph.GetMutableProducerNode(id.node_arg_name); if (!producer_node) { return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "Cannot find producer node of node_arg with name: ", id.node_arg_name, - ". Wrong cutting infomation."); + ". Wrong cutting information."); } // once we find out the producer node for id.node_arg_name, find which output index that leads @@ -606,7 +606,7 @@ Status CutBasedApplyPipelinePartitionToMainGraph( ORT_RETURN_IF_ERROR(GenerateSubgraph(graph, recv_nodes.back())); } - // Post check to ensure the curent partition is correct and matches with Send/Recv nodes inserted during split. + // Post check to ensure the current partition is correct and matches with Send/Recv nodes inserted during split. Node* send_node{nullptr}; Node* recv_node{nullptr}; for (auto& node : graph.Nodes()) { diff --git a/orttraining/orttraining/test/graph/bert_toy_fetches.h b/orttraining/orttraining/test/graph/bert_toy_fetches.h index 5bfc5da742cd4..71465c142f127 100644 --- a/orttraining/orttraining/test/graph/bert_toy_fetches.h +++ b/orttraining/orttraining/test/graph/bert_toy_fetches.h @@ -8,7 +8,7 @@ namespace onnxruntime { namespace test { -// Avoid this arrary being initialized on stack. +// Avoid this array being initialized on stack. // Limit the number of arguments to compile with clang. constexpr std::array bert_embeddings_position_embeddings_weight_grad = {-0.009673337, 0.015859816, -0.0060598925, 0.0061725015, 0.0686829, 0.031034196, -0.041214723, 0.04238321, -0.045230567, -0.03455956, 0.037526406, 0.019020742, -0.008562718, -0.030574083, -0.012788322, -0.0008712788, -0.041134313, 0.027024698, -0.012437805, 0.059991226, -0.026614683, -0.06257652, -0.020100333, -0.03510955, 0.05741506, 0.068152145, -0.065179504, 0.038520053, 0.019393224, 0.03954512, 0.006873767, -0.084907904, -0.0050477944, 0.0012708178, 0.0030560307, -0.032130327, -0.0144646885, -0.016298112, -0.042901997, 0.07588, 0.01613088, -0.018301323, -0.010611727, 0.005544794, -0.014955264, -0.016850606, 0.022336477, -0.0030460241, -0.014482946, 0.00859436, -0.014712406, 0.03867981, -0.022954227, 0.015440098, -0.005059921, 0.0035975706, 0.01880927, 0.062380753, 0.02279159, 0.0036130734, 0.029864375, -0.022658946, -0.0069784625, -0.06653513, -0.01116233, 0.021000436, -0.028701056, -0.024398895, 0.011476517, 0.032129377, -0.04200533, 0.05585559, 0.027091827, -0.03708192, -0.029153917, 0.014818583, -0.03863439, -0.03299714, 0.026062695, 0.027578063, -0.033457935, 0.023994414, -0.00042527216, 0.020991987, -0.043016825, 0.03330429, -0.0051043453, -0.061040144, 0.02476727, 0.07664442, -0.0109203905, 0.046167813, 0.05265824, -0.009806289, -0.032828216, -0.053807136, -0.018357445, -0.0060726395, 0.012883636, -0.03604291, -0.020931121, -0.017016709, -0.06521842, 0.09689566, 0.010757825, -0.014480298, -0.011673617, 0.014982184, -0.011422393, -0.015741495, 0.021494215, -0.013776923, -0.017716365, 0.02294489, -0.00073889084, 0.036582764, -0.013822639, 0.0075510093, -0.015371518, 0.012141101, 0.009292599, 0.0632079, 0.023068016, -0.0034772623, 0.033849746, -0.009428004, -0.0021826755, -0.07218023, -0.00040298235, 0.008162888, -0.009084097, -0.025772562, 0.01697198, 0.0096272295, -0.05384024, 0.054271728, 0.0061686123, -0.012313863, 
-0.010857888, 0.011092398, -0.017863888, -0.023245087, 0.0147367595, 0.0022649313, -0.0307159, 0.004318953, 0.0035282676, 0.026500994, -0.029873395, 0.0049419748, -0.007642911, -0.02280794, 0.016169535, 0.059451614, 0.015289053, 0.021232026, 0.042667653, -0.0034166733, -0.014750072, -0.05480911, 0.0012827339, -0.00061177486, 0.008855328, -0.014449824, -0.008173137, -0.033359475, -0.06602954, 0.074186556, -0.0031156093, 0.0009635263, -0.0151721025, 0.007254398, 0.015830085, 0.009578684, -0.0053947777, -0.020233134, -0.016644966, 0.002484738, -0.019542504, 0.026349604, -0.017563643, -0.005398605, 0.0013201954, 0.034780584, 0.007976923, 0.054721735, 0.015226502, -0.001414868, 0.030154174, 0.011785319, 0.0033271122, -0.07897424, 0.01796715, -0.00018319988, 0.006205301, -0.019297902, 0.03912447, 0.0022418862, -0.048669476, 0.031012537, -0.0155599145, -0.01757, -0.0011392199, 0.016611777, 0.008555129, -0.017760677, -0.02604977, 0.014489464, -0.041648414, -0.017570462, 0.005586198, 0.03271513, -0.04649407, -0.038035538, 2.2510882e-05, -0.006990753, 0.043797504, 0.0970251, 0.0041649155, 0.020328937, 0.058848612, -0.008414367, -0.026458042, -0.06685481}; static std::unordered_map> BERT_TOY_FETCHES = { diff --git a/orttraining/orttraining/test/python/orttraining_test_ortmodule_api.py b/orttraining/orttraining/test/python/orttraining_test_ortmodule_api.py index 541473b1561db..6f5b03685e801 100644 --- a/orttraining/orttraining/test/python/orttraining_test_ortmodule_api.py +++ b/orttraining/orttraining/test/python/orttraining_test_ortmodule_api.py @@ -264,7 +264,7 @@ class UnusedBeginParameterNet(torch.nn.Module): def __init__(self, input_size, hidden_size1, hidden_size2, num_classes): super().__init__() - # fc1 is an unused initializer (which is in the begining of initializer list) + # fc1 is an unused initializer (which is in the beginning of initializer list) # which will be dropped after export self.fc1 = torch.nn.Linear(input_size, hidden_size1) self.relu = torch.nn.ReLU() diff --git a/orttraining/orttraining/test/python/qat_poc_example/qat.py b/orttraining/orttraining/test/python/qat_poc_example/qat.py index dcc9e116fda7d..4378118b71b9f 100644 --- a/orttraining/orttraining/test/python/qat_poc_example/qat.py +++ b/orttraining/orttraining/test/python/qat_poc_example/qat.py @@ -24,7 +24,7 @@ onnx.save(onnx_model, os.path.join(model_dir, f"{model_name}.onnx")) logging.info( - "Begining Quantization process for model saved at: %s", + "Beginning Quantization process for model saved at: %s", os.path.join(model_dir, f"{model_name}.onnx"), ) logging.info("Skipping model preprocessing step. As QAT requires a un preprocessed model.") diff --git a/orttraining/orttraining/test/training_ops/cuda/cross_entropy_test.cc b/orttraining/orttraining/test/training_ops/cuda/cross_entropy_test.cc index d36f9b307ec70..61bd9c19f3541 100644 --- a/orttraining/orttraining/test/training_ops/cuda/cross_entropy_test.cc +++ b/orttraining/orttraining/test/training_ops/cuda/cross_entropy_test.cc @@ -1036,7 +1036,7 @@ TEST(CrossEntropyTest, SoftmaxCrossEntropyLossInternalGrad_TinySizeTensorFloatIn std::vector index_dims{8}; std::vector weight_dims{2}; std::vector dX_dims{8, 2}; - // Set run_cpu_baseline_seperately = True because CPU kernel did not support multiple type support + // Set run_cpu_baseline_separately = True because CPU kernel did not support multiple type support // for input and output. 
TestSoftmaxCrossEntropyLossInternalGrad(dY_dims, log_prob_dims, index_dims, weight_dims, dX_dims, "mean", -1, 5e-2, false /*has_bias*/); diff --git a/orttraining/orttraining/training_api/optimizer.cc b/orttraining/orttraining/training_api/optimizer.cc index 4647f890729f4..e42752b3a2d55 100644 --- a/orttraining/orttraining/training_api/optimizer.cc +++ b/orttraining/orttraining/training_api/optimizer.cc @@ -205,7 +205,7 @@ Optimizer::Optimizer(const ModelIdentifiers& model_identifiers, // by invoking ConstructOptimizerStateAndInputs(). ORT_THROW_IF_ERROR(ConstructOptimizerStateAndInputs()); } else { - delay_optimizer_state_contruction_ = true; + delay_optimizer_state_construction_ = true; } } else { ORT_THROW_IF_ERROR(LoadStateDict(state_->optimizer_checkpoint_state)); @@ -256,7 +256,7 @@ void Optimizer::Initialize(const ModelIdentifiers& model_identifiers, } Status Optimizer::Step() { - if (delay_optimizer_state_contruction_) { + if (delay_optimizer_state_construction_) { ORT_RETURN_IF_ERROR(ConstructOptimizerStateAndInputs()); } @@ -343,7 +343,7 @@ Status Optimizer::ConstructOptimizerStateAndInputs() { ORT_RETURN_IF_ERROR(GenerateMomentumNamedStates(state_->optimizer_checkpoint_state)); ORT_RETURN_IF_ERROR(ConstructInputs()); - delay_optimizer_state_contruction_ = false; + delay_optimizer_state_construction_ = false; return Status::OK(); } diff --git a/orttraining/orttraining/training_api/optimizer.h b/orttraining/orttraining/training_api/optimizer.h index 5b908acf7c9e3..a0717563a8bd0 100644 --- a/orttraining/orttraining/training_api/optimizer.h +++ b/orttraining/orttraining/training_api/optimizer.h @@ -166,7 +166,7 @@ struct Optimizer { int32_t group_count_{0}; - bool delay_optimizer_state_contruction_{false}; + bool delay_optimizer_state_construction_{false}; }; } // namespace api diff --git a/orttraining/orttraining/training_ops/cpu/activation/activations_grad.cc b/orttraining/orttraining/training_ops/cpu/activation/activations_grad.cc index d3f2f9c7a8767..40497467a31a5 100644 --- a/orttraining/orttraining/training_ops/cpu/activation/activations_grad.cc +++ b/orttraining/orttraining/training_ops/cpu/activation/activations_grad.cc @@ -82,7 +82,7 @@ Status ComputeGeluGradDX(gsl::span dY, gsl::span X, gsl::span< static constexpr T kBeta = static_cast(kGamma * kAlpha * 3.0f); // - // Commented out EIGEN implentation due to EIGEN bug. + // Commented out EIGEN implementation due to EIGEN bug. // On Windows Release build with GPU enabled, kAlpha * EIGEN_X below would produce pure 0 // result, even though neither kAlpha nor EIGEN_X is zero. // Given that CPU kernel is mostly for conformance check, where performance is not of high diff --git a/orttraining/orttraining/training_ops/cuda/math/div_grad_impl.cu b/orttraining/orttraining/training_ops/cuda/math/div_grad_impl.cu index 56520337fe683..a468c756ef74d 100644 --- a/orttraining/orttraining/training_ops/cuda/math/div_grad_impl.cu +++ b/orttraining/orttraining/training_ops/cuda/math/div_grad_impl.cu @@ -10,7 +10,7 @@ namespace onnxruntime { namespace cuda { -// for now this operator classes are no different than a funciton. +// for now this operator classes are no different than a function. // Eventually once multiple binary gradient ops are needed, we will pass // its instance from API instead of direct function call. 
template diff --git a/orttraining/orttraining/training_ops/cuda/optimizer/lamb.cc b/orttraining/orttraining/training_ops/cuda/optimizer/lamb.cc index 501c48e687e98..1152c98447444 100644 --- a/orttraining/orttraining/training_ops/cuda/optimizer/lamb.cc +++ b/orttraining/orttraining/training_ops/cuda/optimizer/lamb.cc @@ -582,7 +582,7 @@ Status LambOptimizer::Compute // Allocate a buffer in byte for reduction API calls. size_t rbs = compute_reduction_buffer_size(max_tensor_size); - // Enlarge reduction buffer to accomodate multi-tensor reduction kernel as well + // Enlarge reduction buffer to accommodate multi-tensor reduction kernel as well constexpr int tensor_group_size = 4; // w, d, w_norm, d_norm constexpr int max_blocks = ChunkGroup::max_block_count; constexpr size_t multitensor_block_reduce_buffer_size = 2 * max_blocks * sizeof(CudaT2); diff --git a/orttraining/orttraining/training_ops/cuda/optimizer/lamb_impl.cu b/orttraining/orttraining/training_ops/cuda/optimizer/lamb_impl.cu index fd55f7c30ff75..f59f5f7dc9c33 100644 --- a/orttraining/orttraining/training_ops/cuda/optimizer/lamb_impl.cu +++ b/orttraining/orttraining/training_ops/cuda/optimizer/lamb_impl.cu @@ -192,7 +192,7 @@ __device__ __forceinline__ void _LambUpdateRule( T2* w_new, T3* g_new, T_MIXED_PRECISION_FP* w_mixed_precision_new) { - // Confidence coefficeint of this update. + // Confidence coefficient of this update. const T2 ratio = (w_norm != T2(0.0f) && r_norm != T2(0.0f)) ? T2(eta) * _Max(T2(ratio_min), _Min(T2(ratio_max), _Sqrt(w_norm / r_norm))) : T2(eta); // Compute delta using the saved update direction. diff --git a/orttraining/tools/scripts/layer_norm_transform.py b/orttraining/tools/scripts/layer_norm_transform.py index b397d1d26a456..bc6fe0eaf8b29 100644 --- a/orttraining/tools/scripts/layer_norm_transform.py +++ b/orttraining/tools/scripts/layer_norm_transform.py @@ -164,7 +164,7 @@ def main(): vocab_size = 30528 # Create a fake data point. - vocab_size = 30528 # It shoudl match the value from BERT config file. + vocab_size = 30528 # It should match the value from BERT config file. input_ids = np.random.randint(low=0, high=vocab_size, size=(batch, sq_length), dtype=np.int64) segment_ids = np.random.randint(low=0, high=2, size=(batch, sq_length), dtype=np.int64) input_mask = np.ones((batch, sq_length), dtype=np.int64) diff --git a/orttraining/tools/scripts/model_transform.py b/orttraining/tools/scripts/model_transform.py index f0cf53990eac3..2fb1936ff2184 100644 --- a/orttraining/tools/scripts/model_transform.py +++ b/orttraining/tools/scripts/model_transform.py @@ -269,7 +269,7 @@ def process_dropout(model): del model.graph.node[d] -# Also need to set following line differently for differnt verison of bert +# Also need to set following line differently for different version of bert # expand_out.name = '412' def add_expand_shape(model): expand_out = model.graph.value_info.add() diff --git a/orttraining/tools/scripts/opset12_model_transform.py b/orttraining/tools/scripts/opset12_model_transform.py index e8c2263a39c32..790bdc34e1ff7 100644 --- a/orttraining/tools/scripts/opset12_model_transform.py +++ b/orttraining/tools/scripts/opset12_model_transform.py @@ -2,7 +2,7 @@ # Licensed under the MIT License. 
# # This converter is an internal util to upgrade existing bert/gpt-2 models, -# which were previously transformed/optimized from orginal model, to Opset 12 +# which were previously transformed/optimized from original model, to Opset 12 # version as well as replacing deprecated node, i.e., TrainableDropout with # the "Dropout" node matching the Opset 12 Spec. Typically, a model to # be run by this scripts would have "_optimized" substring in its model name, diff --git a/rust/onnxruntime-sys/examples/c_api_sample.rs b/rust/onnxruntime-sys/examples/c_api_sample.rs index e8c9ca8f09a5a..3cfb9d76029a0 100644 --- a/rust/onnxruntime-sys/examples/c_api_sample.rs +++ b/rust/onnxruntime-sys/examples/c_api_sample.rs @@ -31,8 +31,8 @@ fn main() { assert_ne!(g_ort, std::ptr::null_mut()); //************************************************************************* - // initialize enviroment...one enviroment per process - // enviroment maintains thread pools and other state info + // initialize environment...one environment per process + // environment maintains thread pools and other state info let mut env_ptr: *mut OrtEnv = std::ptr::null_mut(); let env_name = std::ffi::CString::new("test").unwrap(); let status = unsafe { diff --git a/rust/onnxruntime/src/tensor/ort_output_tensor.rs b/rust/onnxruntime/src/tensor/ort_output_tensor.rs index 006fbdba6cdb8..83663c0d303f8 100644 --- a/rust/onnxruntime/src/tensor/ort_output_tensor.rs +++ b/rust/onnxruntime/src/tensor/ort_output_tensor.rs @@ -70,7 +70,7 @@ impl Drop for OrtOutputTensor { } } -/// An Ouput tensor with the ptr and the item that will copy from the ptr. +/// An Output tensor with the ptr and the item that will copy from the ptr. #[derive(Debug)] pub struct WithOutputTensor<'a, T> { #[allow(dead_code)] diff --git a/tools/ci_build/build.py b/tools/ci_build/build.py index 54f7b6c3a8fa7..98d9ba22b7190 100644 --- a/tools/ci_build/build.py +++ b/tools/ci_build/build.py @@ -718,7 +718,7 @@ def convert_arg_line_to_args(self, arg_line): # Code coverage parser.add_argument( - "--code_coverage", action="store_true", help="Generate code coverage when targetting Android (only)." + "--code_coverage", action="store_true", help="Generate code coverage when targeting Android (only)." ) # lazy tensor support. 
@@ -2749,7 +2749,7 @@ def main(): cmake_extra_args += ["-D", "BUILD_AS_ARM64X=ARM64EC"] cmake_extra_args += ["-G", args.cmake_generator] # Cannot test on host build machine for cross-compiled - # builds (Override any user-defined behaviour for test if any) + # builds (Override any user-defined behavior for test if any) if args.test: log.warning( "Cannot test on host build machine for cross-compiled " diff --git a/tools/ci_build/github/azure-pipelines/web-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/web-ci-pipeline.yml index 24809ccfdec1f..036becb7df077 100644 --- a/tools/ci_build/github/azure-pipelines/web-ci-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/web-ci-pipeline.yml @@ -35,7 +35,7 @@ parameters: default: 'nightly (@dev)' variables: - # pipeline should define the following varaibles + # pipeline should define the following variables # ExtraBuildArgs # VersionSuffix diff --git a/tools/python/util/android/android.py b/tools/python/util/android/android.py index fd25d8bc147cd..dd2dcce01bf4a 100644 --- a/tools/python/util/android/android.py +++ b/tools/python/util/android/android.py @@ -30,7 +30,7 @@ def filename(name, windows_extension): sdk_root = Path(sdk_root).resolve(strict=True) return SdkToolPaths( - # do not use sdk_root/tools/emulator as that is superceeded by sdk_root/emulator/emulator + # do not use sdk_root/tools/emulator as that is superseded by sdk_root/emulator/emulator emulator=str((sdk_root / "emulator" / filename("emulator", "exe")).resolve(strict=True)), adb=str((sdk_root / "platform-tools" / filename("adb", "exe")).resolve(strict=True)), sdkmanager=str( diff --git a/winml/adapter/winml_adapter_session.cpp b/winml/adapter/winml_adapter_session.cpp index fa91978b564ba..5e27d8fb9a985 100644 --- a/winml/adapter/winml_adapter_session.cpp +++ b/winml/adapter/winml_adapter_session.cpp @@ -310,7 +310,7 @@ ORT_API_STATUS_IMPL( winrt::Windows::Foundation::Collections::IMap override_map = winrt::single_threaded_map(); for (auto freeDimOverride : session_options.free_dimension_overrides) { - if (freeDimOverride.dim_identifer_type == onnxruntime::FreeDimensionOverrideType::Name) { + if (freeDimOverride.dim_identifier_type == onnxruntime::FreeDimensionOverrideType::Name) { override_map.Insert( winrt::to_hstring(freeDimOverride.dim_identifier), static_cast(freeDimOverride.dim_value) ); diff --git a/winml/api/Microsoft.AI.MachineLearning.Experimental.idl b/winml/api/Microsoft.AI.MachineLearning.Experimental.idl index ad39a1ed7e684..3322c76f6eef2 100644 --- a/winml/api/Microsoft.AI.MachineLearning.Experimental.idl +++ b/winml/api/Microsoft.AI.MachineLearning.Experimental.idl @@ -128,7 +128,7 @@ namespace ROOT_NS.AI.MachineLearning.Experimental { Boolean CloseModelOnJoin { get; set; }; //! The JoinedNodePrefix property specifies whether the nodes of the second model should have a specific prefixed in the joined model. - //! Node names must be unique or empty. By enabling this, the engine can specifiy the prefix, or eliminate it entirely in cases + //! Node names must be unique or empty. By enabling this, the engine can specify the prefix, or eliminate it entirely in cases //! where the model is known to contain no duplicate node names. //! The default value for CloseModelOnJoin is a new random GUID. 
String JoinedNodePrefix { get; set; }; diff --git a/winml/api/Windows.AI.MachineLearning.idl b/winml/api/Windows.AI.MachineLearning.idl index 2b55fa8c7a95c..59c58ba80efca 100644 --- a/winml/api/Windows.AI.MachineLearning.idl +++ b/winml/api/Windows.AI.MachineLearning.idl @@ -9,7 +9,7 @@ import "windows.media.idl"; #ifndef WINDOWSAI_RAZZLE_BUILD // Pull in definition for DualApiPartitionAttribute, because the WinML IDL // does not build in the OS Repo, and needs to access internal definitions for -// various custom attirbute definitions. +// various custom attribute definitions. import "dualapipartitionattribute.idl"; import "windows.graphics.directx.direct3d11.idl"; import "windows.graphics.imaging.idl"; diff --git a/winml/lib/Api/LearningModelBinding.cpp b/winml/lib/Api/LearningModelBinding.cpp index 17440f6f0a561..222fdba986dcb 100644 --- a/winml/lib/Api/LearningModelBinding.cpp +++ b/winml/lib/Api/LearningModelBinding.cpp @@ -30,7 +30,7 @@ static winml::ILearningModelFeatureDescriptor FindValidBinding( uint32_t size; WINML_THROW_IF_FAILED(descriptor_native->GetName(&feature_name, &size)); - // Case insensetive comparison of onnx name in feature descriptor, and passed in name + // Case insensitive comparison of onnx name in feature descriptor, and passed in name if (_wcsicmp(feature_name, name.c_str()) == 0) { return descriptor; } diff --git a/winml/lib/Api/impl/NumericData.h b/winml/lib/Api/impl/NumericData.h index 71c61b3c29f6f..129c7cbf1f294 100644 --- a/winml/lib/Api/impl/NumericData.h +++ b/winml/lib/Api/impl/NumericData.h @@ -15,7 +15,7 @@ class numeric_data : public _winml::idata { size_t num_elements, size_t element_size_in_bytes, wfc::IIterable const& buffers ); - // Privte constructor as this type should be created as a shared_ptr + // Private constructor as this type should be created as a shared_ptr numeric_data(size_t num_elements, size_t element_size_in_bytes, wfc::IIterable const& buffers); gsl::span buffer_at(size_t index); gsl::span combined_buffer(); diff --git a/winml/test/api/LearningModelSessionAPITest.cpp b/winml/test/api/LearningModelSessionAPITest.cpp index d6e70e35e3a6d..587f3e28928ae 100644 --- a/winml/test/api/LearningModelSessionAPITest.cpp +++ b/winml/test/api/LearningModelSessionAPITest.cpp @@ -315,7 +315,7 @@ static void NamedDimensionOverride() { LearningModelDevice device(nullptr); WINML_EXPECT_NO_THROW(device = LearningModelDevice(LearningModelDeviceKind::Cpu)); - // the model input shape. the batch size, n, is overriden to 5 + // the model input shape. 
the batch size, n, is overridden to 5 uint32_t n = 5; int64_t c = 3, h = 720, w = 720; diff --git a/winml/test/common/googleTestMacros.h b/winml/test/common/googleTestMacros.h index 2f493c9b6d6b9..111abd1c3914e 100644 --- a/winml/test/common/googleTestMacros.h +++ b/winml/test/common/googleTestMacros.h @@ -64,7 +64,7 @@ #define INSTANTIATE_TEST_SUITE_P INSTANTIATE_TEST_CASE_P #endif -#define WINML_SKIP_TEST(message) WINML_SUPRESS_UNREACHABLE_BELOW(GTEST_SKIP() << message) +#define WINML_SKIP_TEST(message) WINML_SUPPRESS_UNREACHABLE_BELOW(GTEST_SKIP() << message) #define WINML_EXPECT_NO_THROW(statement) EXPECT_NO_THROW(statement) #define WINML_EXPECT_TRUE(statement) EXPECT_TRUE(statement) diff --git a/winml/test/common/taefTestMacros.h b/winml/test/common/taefTestMacros.h index 48119ff293fc8..3f6377c0a56b2 100644 --- a/winml/test/common/taefTestMacros.h +++ b/winml/test/common/taefTestMacros.h @@ -48,7 +48,7 @@ using namespace WEX::TestExecution; } #define WINML_SKIP_TEST(message) \ - WINML_SUPRESS_UNREACHABLE_BELOW( \ + WINML_SUPPRESS_UNREACHABLE_BELOW( \ Log::Result(TestResults::Skipped, std::wstring_convert>().from_bytes(message).c_str()); \ return; \ ) diff --git a/winml/test/common/test.h b/winml/test/common/test.h index f5adce2b40602..b7afa5dbb5f21 100644 --- a/winml/test/common/test.h +++ b/winml/test/common/test.h @@ -18,9 +18,9 @@ constexpr bool alwaysTrue() { constexpr bool alwaysFalse() { return false; } -#define WINML_SUPRESS_UNREACHABLE_BELOW(statement) \ - if (alwaysTrue()) { \ - statement; \ +#define WINML_SUPPRESS_UNREACHABLE_BELOW(statement) \ + if (alwaysTrue()) { \ + statement; \ } #ifdef BUILD_TAEF_TEST diff --git a/winml/test/image/imagetests.cpp b/winml/test/image/imagetests.cpp index 2251954c59e4c..b408c0315f94a 100644 --- a/winml/test/image/imagetests.cpp +++ b/winml/test/image/imagetests.cpp @@ -211,12 +211,12 @@ class ImageTests : public ::testing::Test { bool ShouldSkip( const std::wstring& model_file_name, const std::wstring& image_file_name, const InputImageSource input_image_source ) { - // Case that the tensor's shape doesn't match model's shape should be skiped + // Case that the tensor's shape doesn't match model's shape should be skipped if ((L"1080.jpg" == image_file_name || L"kitten_224.png" == image_file_name) && (InputImageSource::FromGPUResource == input_image_source || InputImageSource::FromCPUResource == input_image_source)) { return true; } - // Case that the images's shape doesn't match model's shape which expects free dimension should be skiped. + // Case that the images's shape doesn't match model's shape which expects free dimension should be skipped. 
// Because the fns-candy is not real model that can handle free dimensional input if ((L"1080.jpg" == image_file_name || L"kitten_224.png" == image_file_name) && L"fns-candy_Bgr8_freeDimInput.onnx" == model_file_name) { return true; diff --git a/winml/test/model/model_tests.cpp b/winml/test/model/model_tests.cpp index 27d74d7d6b034..859914014b8bb 100644 --- a/winml/test/model/model_tests.cpp +++ b/winml/test/model/model_tests.cpp @@ -170,7 +170,7 @@ std::string GetTestDataPath() { testDataPath.replace(environmentVariableFetchSuceeded, testDataPathFolderName.length(), testDataPathFolderName); } else { throw std::exception( - "WINML_TEST_DATA_PATH environment variable path needs to be shorter to accomodate the maximum path size of %d\n", + "WINML_TEST_DATA_PATH environment variable path needs to be shorter to accommodate the maximum path size of %d\n", MAX_PATH ); } From 11ad29945125d3f12f5935570fdd1f48bb0285d1 Mon Sep 17 00:00:00 2001 From: Prathik Rao Date: Mon, 22 Jul 2024 16:37:04 -0700 Subject: [PATCH 32/35] Adds ATen fallback for scaled_dot_product_attention (#21107) ### Description Introduces an ATen fallback for `torch.nn.functional.scaled_dot_product_attention`. This operator was introduced in torch 2.0 and, since then, has had many updates including the implementation of memory efficient attention for V100 machines. The current torchscript exporter exports a subgraph for attention which does not provide the same memory savings that PyTorch's memory efficient attention kernel provides. Allowing fallback to PyTorch ATen op for attention helps mitigate memory spike issues for models leveraging memory efficient attention. ### Motivation and Context Memory issues arose when integrating ONNX Runtime Training with AML Stable Diffusion. --------- Co-authored-by: root --- docs/ORTModule_Training_Guidelines.md | 10 +++ .../core/graph/gradient_builder.cc | 15 +++- .../ortmodule/_custom_gradient_registry.py | 37 ++++++++++ .../ortmodule/_custom_op_symbolic_registry.py | 24 +++++++ .../python/orttraining_test_ortmodule_api.py | 71 +++++++++++++++++++ 5 files changed, 156 insertions(+), 1 deletion(-) diff --git a/docs/ORTModule_Training_Guidelines.md b/docs/ORTModule_Training_Guidelines.md index 8d5472ba30601..c79ba59a07ee9 100644 --- a/docs/ORTModule_Training_Guidelines.md +++ b/docs/ORTModule_Training_Guidelines.md @@ -304,6 +304,16 @@ A classical usage of disabling the deep copy: when the deep copy before module e export ORTMODULE_ENABLE_MEM_EFFICIENT_GRAD_MGMT=0 # Disable ``` +#### ORTMODULE_ATEN_SDPA_FALLBACK + +- **Feature Area**: *ORTMODULE/Optimizations* +- **Description**: By default, this is disabled. This env var can be used for enabling pre-export attention fall back to PyTorch's [_scaled_dot_product_efficient_attention](https://github.com/pytorch/pytorch/blob/c12a4f2e65ad41b739aab1a261e2336b4a79fcfb/aten/src/ATen/native/native_functions.yaml#L14778) ATen kernel for execution when calling torch.nn.functional.scaled_dot_product_attention. NOTE: only use this feature if user model leverages memory efficient attention WITHOUT masking (ie. attn_mask=None). Utilize GPU profiling looks like NVIDIA Nsight Systems to identify if user model leverages memory efficient attention. 
+ + ```bash + export ORTMODULE_ATEN_SDPA_FALLBACK=1 # ENABLE + unset ORTMODULE_ATEN_SDPA_FALLBACK # DISABLE + ``` + ### 2.2 Memory Optimization Q: *Want to run a bigger batch size?* diff --git a/orttraining/orttraining/core/graph/gradient_builder.cc b/orttraining/orttraining/core/graph/gradient_builder.cc index 22dcf4eb92411..76fe0ee91d4c6 100755 --- a/orttraining/orttraining/core/graph/gradient_builder.cc +++ b/orttraining/orttraining/core/graph/gradient_builder.cc @@ -1794,7 +1794,20 @@ IMPLEMENT_GRADIENT_BUILDER(GetExternalGradient) { } std::vector output_args; - for (const auto& output : node_def.outputs) { + for (size_t output_index = 0; output_index < node_def.outputs.size(); ++output_index) { + // If the input is not used in the forward computation, we don't need it for gradient computation + // Required for ORTMODULE_ATEN_SDPA_FALLBACK + if (static_cast(output_index) >= GetSrcNodeInputSize()) { + continue; + } + + if (!IsGradientRequiredForSrcNodeInput(static_cast(output_index))) { + output_args.emplace_back(ArgDef()); + continue; + } + + const auto& output = node_def.outputs[output_index]; + if (output.find("GI(") == 0) { size_t index = static_cast(std::stoi(output.substr(3, output.length() - 4))); output_args.emplace_back(GI(index)); diff --git a/orttraining/orttraining/python/training/ortmodule/_custom_gradient_registry.py b/orttraining/orttraining/python/training/ortmodule/_custom_gradient_registry.py index a8590cea22887..97650f509ac88 100644 --- a/orttraining/orttraining/python/training/ortmodule/_custom_gradient_registry.py +++ b/orttraining/orttraining/python/training/ortmodule/_custom_gradient_registry.py @@ -25,6 +25,7 @@ # 'is_tensor' is optional, if not present, the default is False. import json +import os from onnxruntime.capi import _pybind_state as C @@ -276,3 +277,39 @@ def upsample_nearest3d_gradient(): @register_gradient("org.pytorch.aten", "ATen", "upsample_bicubic2d", "vec") def upsample_bicubic2d_gradient(): return _upsample_gradient("upsample_bicubic2d_backward", 2) + + +ATEN_SDPA_FALLBACK = os.getenv("ORTMODULE_ATEN_SDPA_FALLBACK", None) +if ATEN_SDPA_FALLBACK: + # based on the following internal PyTorch kernel for efficient attention: + # https://github.com/pytorch/pytorch/blob/c12a4f2e65ad41b739aab1a261e2336b4a79fcfb/aten/src/ATen/native/native_functions.yaml#L14784 + @register_gradient("org.pytorch.aten", "ATen", "_scaled_dot_product_efficient_attention", "") + def scaled_dot_product_attention_gradient(): + return [ + ( + "Constant", + [], + ["grad_input_mask"], + {"value": {"value": [1, 1, 1, 0], "dtype": "int", "is_tensor": True}}, + ), + ( + ("ATen", "org.pytorch.aten"), + [ + "GO(0)", + "I(0)", + "I(1)", + "I(2)", + "I(3)", + "O(0)", + "O(1)", + "O(2)", + "O(3)", + "I(5)", + "grad_input_mask", + "I(6)", + "I(7)", + ], + ["GI(0)", "GI(1)", "GI(2)", ""], + {"operator": {"value": "_scaled_dot_product_efficient_attention_backward", "dtype": "string"}}, + ), + ] diff --git a/orttraining/orttraining/python/training/ortmodule/_custom_op_symbolic_registry.py b/orttraining/orttraining/python/training/ortmodule/_custom_op_symbolic_registry.py index 10e7f60b7da0f..c48968efbb262 100644 --- a/orttraining/orttraining/python/training/ortmodule/_custom_op_symbolic_registry.py +++ b/orttraining/orttraining/python/training/ortmodule/_custom_op_symbolic_registry.py @@ -3,6 +3,7 @@ # Licensed under the MIT License. 
# -------------------------------------------------------------------------- +import os from typing import Callable import torch @@ -969,3 +970,26 @@ def softmax(g, input, dim, dtype=None): softmax = g.op("Softmax", casted_input, axis_i=dim) return softmax + + +ATEN_SDPA_FALLBACK = os.getenv("ORTMODULE_ATEN_SDPA_FALLBACK", None) +if ATEN_SDPA_FALLBACK: + # based on the following internal PyTorch kernel for efficient attention: + # https://github.com/pytorch/pytorch/blob/c12a4f2e65ad41b739aab1a261e2336b4a79fcfb/aten/src/ATen/native/native_functions.yaml#L14778 + @register_symbolic("scaled_dot_product_attention") + def scaled_dot_product_attention(g, query, key, value, attn_mask=None, dropout_p=0.0, is_causal=False, scale=None): + dropout_p_f = g.op("Cast", dropout_p, to_i=torch.onnx.TensorProtoDataType.FLOAT) + compute_logsumexp = g.op("Constant", value_t=torch.tensor([1], dtype=torch.bool)) + return g.op( + "org.pytorch.aten::ATen", + query, + key, + value, + attn_mask, + compute_logsumexp, + dropout_p_f, + is_causal, + scale, + operator_s="_scaled_dot_product_efficient_attention", + outputs=4, + )[0] diff --git a/orttraining/orttraining/test/python/orttraining_test_ortmodule_api.py b/orttraining/orttraining/test/python/orttraining_test_ortmodule_api.py index 6f5b03685e801..fe59c398d7abb 100644 --- a/orttraining/orttraining/test/python/orttraining_test_ortmodule_api.py +++ b/orttraining/orttraining/test/python/orttraining_test_ortmodule_api.py @@ -6953,3 +6953,74 @@ def generate_inputs(batch_size, max_seq_length, vocab_size): else: if "ORTMODULE_MEMORY_OPT_LEVEL" in os.environ: del os.environ["ORTMODULE_MEMORY_OPT_LEVEL"] + + +@pytest.mark.skipif( + Version(torch.__version__) < Version("2.3.0"), + reason="torch.nn.attention module was introduced in PyTorch 2.3.0", +) +def test_aten_attention(): + from torch.nn.attention import SDPBackend, sdpa_kernel + + class _NeuralNetAttention(torch.nn.Module): + def __init__(self): + super().__init__() + + def forward(self, q, k, v, attn_mask=None): + with sdpa_kernel(SDPBackend.EFFICIENT_ATTENTION): + return torch.nn.functional.scaled_dot_product_attention(q, k, v, attn_mask) + + def gen_inputs(device, dtype): + return [ + torch.randn(32, 8, 128, 64, dtype=dtype, device=device, requires_grad=True), + torch.randn(32, 8, 128, 64, dtype=dtype, device=device, requires_grad=True), + torch.randn(32, 8, 128, 64, dtype=dtype, device=device, requires_grad=True), + ] + + def run_step(model, inputs, attn_mask=None): + prediction = model(*inputs, attn_mask) + prediction.sum().backward() + return prediction + + device = "cuda" + + os.environ["ORTMODULE_ATEN_SDPA_FALLBACK"] = "1" # TESTING WITHOUT ATTN_MASK + + pt_model = _NeuralNetAttention().to(device) + ort_model = ORTModule(copy.deepcopy(pt_model), DebugOptions(save_onnx=True, onnx_prefix="mem_eff_attn")) + + # reset manual seed to reset the generator + torch.manual_seed(2333) + pt_input = gen_inputs(device=device, dtype=torch.float32) + ort_input = copy.deepcopy(pt_input) + pt_prediction = run_step(pt_model, pt_input) + ort_prediction = run_step(ort_model, ort_input) + + _test_helpers.assert_values_are_close(ort_prediction, pt_prediction) + _test_helpers.assert_values_are_close(ort_input[0].grad, pt_input[0].grad) + _test_helpers.assert_values_are_close(ort_input[1].grad, pt_input[1].grad) + _test_helpers.assert_values_are_close(ort_input[2].grad, pt_input[2].grad) + + execution_mgr = ort_model._torch_module._execution_manager._training_manager + from onnxruntime.training.ortmodule._onnx_models import 
_get_onnx_file_name + + path = os.path.join( + execution_mgr._debug_options.save_onnx_models.path, + _get_onnx_file_name( + execution_mgr._debug_options.save_onnx_models.name_prefix, "execution_model", execution_mgr._export_mode + ), + ) + + onnx_model = onnx.load(path) + onnx_nodes = onnx_model.graph.node + + mem_eff_attn_nodes = 0 + for node in onnx_nodes: + if "ATen" in node.name: + for attr in node.attribute: + if b"_scaled_dot_product_efficient_attention" in attr.s: + mem_eff_attn_nodes += 1 + + assert mem_eff_attn_nodes > 0, "No mem_eff_attn nodes are found" + + del os.environ["ORTMODULE_ATEN_SDPA_FALLBACK"] From dd010edb37c8c7b34ba5d40cdfb3f6ce0a0fa789 Mon Sep 17 00:00:00 2001 From: Sheil Kumar Date: Mon, 22 Jul 2024 16:59:03 -0700 Subject: [PATCH 33/35] Update DirectML from 1.14.1 to 1.15.0 (#21323) Update DirectML from 1.14.1 to 1.15.0 --------- Co-authored-by: Sheil Kumar Co-authored-by: Dwayne Robinson --- .pipelines/nuget_config/x64/packages.config | 2 +- .pipelines/nuget_config/x86/packages.config | 2 +- cmake/external/dml.cmake | 2 +- docs/OperatorKernels.md | 8 ++- .../DmlExecutionProvider/src/ApiTraits.cpp | 8 ++- .../src/External/DirectMLHelpers/ApiHelpers.h | 8 ++- .../src/External/DirectMLHelpers/ApiTraits.h | 57 ++++++++++++++++-- .../External/DirectMLHelpers/DirectMLSchema.h | 58 ++++++++++++++++++ .../DirectMLHelpers/DmlGraphDeserialization.h | 2 +- .../DirectMLHelpers/GeneratedSchemaHelpers.h | 60 ++++++++++++++++++- .../DirectMLHelpers/GeneratedSchemaTypes.h | 55 +++++++++-------- .../src/Operators/DmlOperatorResize.cpp | 21 ++----- .../OperatorAuthorHelper/OperatorHelper.cpp | 11 ++-- packages.config | 2 +- .../nuget/generate_nuspec_for_native_nuget.py | 2 +- 15 files changed, 231 insertions(+), 67 deletions(-) diff --git a/.pipelines/nuget_config/x64/packages.config b/.pipelines/nuget_config/x64/packages.config index 9066e13ee1c8d..7bf8181b1f838 100644 --- a/.pipelines/nuget_config/x64/packages.config +++ b/.pipelines/nuget_config/x64/packages.config @@ -1,6 +1,6 @@  - + diff --git a/.pipelines/nuget_config/x86/packages.config b/.pipelines/nuget_config/x86/packages.config index a8e5b35b28b36..30f7862a11078 100644 --- a/.pipelines/nuget_config/x86/packages.config +++ b/.pipelines/nuget_config/x86/packages.config @@ -1,6 +1,6 @@  - + diff --git a/cmake/external/dml.cmake b/cmake/external/dml.cmake index f74b694471203..54e361ffdb3ae 100644 --- a/cmake/external/dml.cmake +++ b/cmake/external/dml.cmake @@ -41,7 +41,7 @@ if (NOT onnxruntime_USE_CUSTOM_DIRECTML) set(NUGET_CONFIG ${PROJECT_SOURCE_DIR}/../NuGet.config) set(PACKAGES_CONFIG ${PROJECT_SOURCE_DIR}/../packages.config) get_filename_component(PACKAGES_DIR ${CMAKE_CURRENT_BINARY_DIR}/../packages ABSOLUTE) - set(DML_PACKAGE_DIR ${PACKAGES_DIR}/Microsoft.AI.DirectML.1.14.1) + set(DML_PACKAGE_DIR ${PACKAGES_DIR}/Microsoft.AI.DirectML.1.15.0) # Restore nuget packages, which will pull down the DirectML redist package. add_custom_command( diff --git a/docs/OperatorKernels.md b/docs/OperatorKernels.md index df5897529baae..ed944b5a6df79 100644 --- a/docs/OperatorKernels.md +++ b/docs/OperatorKernels.md @@ -970,6 +970,7 @@ Do not modify directly.* |||12+|**T** = tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| |||11+|**T** = tensor(float), tensor(float16)| |||6+|**T** = tensor(float), tensor(float16)| +|Col2Im|*in* input:**T**
*in* image_shape:**tensor(int64)**<br> *in* block_shape:**tensor(int64)**<br> *out* output:**T**|18+|**T** = tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)|
|Concat|*in* inputs:**T**<br> *out* concat_result:**T**|13+|**T** = tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)|
|||11+|**T** = tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)|
|||4+|**T** = tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)|
@@ -1131,7 +1132,8 @@ Do not modify directly.*
|PRelu|*in* X:**T**<br> *in* slope:**T**<br> *out* Y:**T**|16+|**T** = tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int8)|
|||9+|**T** = tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int8)|
|||7+|**T** = tensor(float), tensor(float16)|
-|Pad|*in* data:**T**<br> *in* pads:**tensor(int64)**<br> *in* constant_value:**T**<br> *in* axes:**Tind**<br> *out* output:**T**<br><br>or<br><br>*in* data:**T**<br> *in* pads:**tensor(int64)**<br> *in* constant_value:**T**<br> *out* output:**T**<br><br>or<br><br>*in* data:**T**<br> *out* output:**T**|18+|**T** = tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)|
+|Pad|*in* data:**T**<br> *in* pads:**tensor(int64)**<br> *in* constant_value:**T**<br> *in* axes:**Tind**<br> *out* output:**T**<br><br>or<br><br>*in* data:**T**<br> *in* pads:**tensor(int64)**<br> *in* constant_value:**T**<br> *out* output:**T**<br><br>or<br><br>*in* data:**T**<br> *out* output:**T**|19+|**T** = tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)|
+|||18+|**T** = tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)|
|||13+|**T** = tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)|
|||11+|**T** = tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)|
|||2+|**T** = tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)|
@@ -1199,7 +1201,9 @@ Do not modify directly.*
|||14+|**T** = tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)|
|||13+|**T** = tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)|
|||5+|**T** = tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)|
-|Resize|*in* X:**T**<br> *in* scales:**tensor(float)**<br> *out* Y:**T**<br><br>or<br><br>*in* X:**T1**<br> *in* roi:**T2**<br> *in* scales:**tensor(float)**<br> *in* sizes:**tensor(int64)**<br> *out* Y:**T1**|13+|**T1** = tensor(float), tensor(float16), tensor(int8), tensor(uint8)<br> **T2** = tensor(float), tensor(float16)|
+|Resize|*in* X:**T**<br> *in* scales:**tensor(float)**<br> *out* Y:**T**<br><br>or<br><br>*in* X:**T1**<br> *in* roi:**T2**<br> *in* scales:**tensor(float)**<br> *in* sizes:**tensor(int64)**<br> *out* Y:**T1**|19+|**T1** = tensor(float), tensor(float16), tensor(int8), tensor(uint8)<br> **T2** = tensor(float), tensor(float16)|
+|||18+|**T1** = tensor(float), tensor(float16), tensor(int8), tensor(uint8)<br> **T2** = tensor(float), tensor(float16)|
+|||13+|**T1** = tensor(float), tensor(float16), tensor(int8), tensor(uint8)<br> **T2** = tensor(float), tensor(float16)|
|||11+|**T1** = tensor(float), tensor(float16), tensor(int8), tensor(uint8)<br> **T2** = tensor(float), tensor(float16)|
|||10+|**T** = tensor(float), tensor(float16)|
|ReverseSequence|*in* input:**T**<br> *in* sequence_lens:**tensor(int64)**<br>
*out* Y:**T**|10+|**T** = tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ApiTraits.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ApiTraits.cpp index ccc2bfd872231..a10ba8099f39a 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ApiTraits.cpp +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ApiTraits.cpp @@ -1,4 +1,4 @@ -//--------------------------------------------------------------------------- +//--------------------------------------------------------------------------- // Copyright (c) Microsoft Corporation. All rights reserved. // // This file is automatically generated. Please do not edit it directly. @@ -241,6 +241,7 @@ DML_OPERATOR_TYPE ApiTraits::StringifyHelpers::FromString(std::string_view value {"DML_OPERATOR_ACTIVATION_SWISH", DML_OPERATOR_ACTIVATION_SWISH}, {"DML_OPERATOR_ACTIVATION_HARD_SWISH", DML_OPERATOR_ACTIVATION_HARD_SWISH}, {"DML_OPERATOR_RESAMPLE2", DML_OPERATOR_RESAMPLE2}, + {"DML_OPERATOR_RESAMPLE3", DML_OPERATOR_RESAMPLE3}, {"DML_OPERATOR_RESAMPLE_GRAD1", DML_OPERATOR_RESAMPLE_GRAD1}, {"DML_OPERATOR_DIAGONAL_MATRIX1", DML_OPERATOR_DIAGONAL_MATRIX1}, {"DML_OPERATOR_MULTIHEAD_ATTENTION", DML_OPERATOR_MULTIHEAD_ATTENTION}, @@ -250,6 +251,9 @@ DML_OPERATOR_TYPE ApiTraits::StringifyHelpers::FromString(std::string_view value {"DML_OPERATOR_MULTIHEAD_ATTENTION1", DML_OPERATOR_MULTIHEAD_ATTENTION1}, {"DML_OPERATOR_QUANTIZE", DML_OPERATOR_QUANTIZE}, {"DML_OPERATOR_DEQUANTIZE", DML_OPERATOR_DEQUANTIZE}, + {"DML_OPERATOR_ROI_ALIGN_GRAD", DML_OPERATOR_ROI_ALIGN_GRAD}, + {"DML_OPERATOR_FOLD", DML_OPERATOR_FOLD}, + {"DML_OPERATOR_UNFOLD", DML_OPERATOR_UNFOLD}, }; auto index = StringUtil::MapToIndex(value, mapping); if (!index) @@ -369,6 +373,7 @@ DML_PADDING_MODE ApiTraits::StringifyHelpers::FromString(std::string_view value) {"DML_PADDING_MODE_EDGE", DML_PADDING_MODE_EDGE}, {"DML_PADDING_MODE_REFLECTION", DML_PADDING_MODE_REFLECTION}, {"DML_PADDING_MODE_SYMMETRIC", DML_PADDING_MODE_SYMMETRIC}, + {"DML_PADDING_MODE_WRAP", DML_PADDING_MODE_WRAP}, }; auto index = StringUtil::MapToIndex(value, mapping); if (!index) @@ -454,6 +459,7 @@ DML_FEATURE_LEVEL ApiTraits::StringifyHelpers::FromString(std::string_view value {"DML_FEATURE_LEVEL_6_1", DML_FEATURE_LEVEL_6_1}, {"DML_FEATURE_LEVEL_6_2", DML_FEATURE_LEVEL_6_2}, {"DML_FEATURE_LEVEL_6_3", DML_FEATURE_LEVEL_6_3}, + {"DML_FEATURE_LEVEL_6_4", DML_FEATURE_LEVEL_6_4}, }; auto index = StringUtil::MapToIndex(value, mapping); if (!index) diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/External/DirectMLHelpers/ApiHelpers.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/External/DirectMLHelpers/ApiHelpers.h index 9a1c23093f9b9..431a3fdef5a9a 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/External/DirectMLHelpers/ApiHelpers.h +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/External/DirectMLHelpers/ApiHelpers.h @@ -29,6 +29,9 @@ union ActivationOperatorDescUnion DML_ACTIVATION_THRESHOLDED_RELU_OPERATOR_DESC thresholdedRelu; DML_ACTIVATION_SHRINK_OPERATOR_DESC shrink; DML_ACTIVATION_GELU_OPERATOR_DESC gelu; + DML_ACTIVATION_SWISH_OPERATOR_DESC swish; + DML_ACTIVATION_HARD_SWISH_OPERATOR_DESC hardSwish; + DML_ELEMENT_WISE_CLIP_OPERATOR_DESC clip; }; struct ActivationOperatorDesc @@ -46,7 +49,7 @@ struct ActivationOperatorDesc 
case DML_OPERATOR_ACTIVATION_CELU: return { activationType, &params.celu }; case DML_OPERATOR_ACTIVATION_HARDMAX: return { activationType, &params.hardmax }; case DML_OPERATOR_ACTIVATION_HARDMAX1: return { activationType, &params.hardmax1 }; - case DML_OPERATOR_ACTIVATION_HARD_SIGMOID: return { activationType, &params.sigmoid }; + case DML_OPERATOR_ACTIVATION_HARD_SIGMOID: return { activationType, &params.hardSigmoid }; case DML_OPERATOR_ACTIVATION_IDENTITY: return { activationType, &params.identity }; case DML_OPERATOR_ACTIVATION_LEAKY_RELU: return { activationType, &params.leakyRelu }; case DML_OPERATOR_ACTIVATION_LINEAR: return { activationType, &params.linear }; @@ -66,6 +69,9 @@ struct ActivationOperatorDesc case DML_OPERATOR_ACTIVATION_THRESHOLDED_RELU: return { activationType, &params.thresholdedRelu }; case DML_OPERATOR_ACTIVATION_SHRINK: return { activationType, &params.shrink }; case DML_OPERATOR_ACTIVATION_GELU: return { activationType, &params.gelu }; + case DML_OPERATOR_ACTIVATION_SWISH: return { activationType, &params.swish }; + case DML_OPERATOR_ACTIVATION_HARD_SWISH: return { activationType, &params.hardSwish }; + case DML_OPERATOR_ELEMENT_WISE_CLIP: return { activationType, &params.clip }; default: ORT_THROW_HR(E_INVALIDARG); return { activationType, &params.relu }; diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/External/DirectMLHelpers/ApiTraits.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/External/DirectMLHelpers/ApiTraits.h index 6a4354feb2e2e..ccd4d4c76e744 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/External/DirectMLHelpers/ApiTraits.h +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/External/DirectMLHelpers/ApiTraits.h @@ -1,4 +1,4 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. +// Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. 
#pragma once @@ -24,7 +24,7 @@ struct EnumTraits template <> struct EnumTraits { - static constexpr auto ValueCount = 174; + static constexpr auto ValueCount = 178; static constexpr size_t ActivationFunctionCount = 26; }; @@ -62,7 +62,7 @@ struct EnumTraits template <> struct EnumTraits { - static constexpr auto ValueCount = 4; + static constexpr auto ValueCount = 5; }; template <> @@ -86,7 +86,7 @@ struct EnumTraits template <> struct EnumTraits { - static constexpr auto ValueCount = 14; + static constexpr auto ValueCount = 15; }; template <> @@ -1023,6 +1023,12 @@ struct OperatorDescTraits static constexpr DML_OPERATOR_TYPE Type = DML_OPERATOR_RESAMPLE2; }; +template <> +struct OperatorDescTraits +{ + static constexpr DML_OPERATOR_TYPE Type = DML_OPERATOR_RESAMPLE3; +}; + template <> struct OperatorDescTraits { @@ -1053,6 +1059,18 @@ struct OperatorDescTraits static constexpr DML_OPERATOR_TYPE Type = DML_OPERATOR_MATRIX_MULTIPLY_INTEGER_TO_FLOAT; }; +template <> +struct OperatorDescTraits +{ + static constexpr DML_OPERATOR_TYPE Type = DML_OPERATOR_FOLD; +}; + +template <> +struct OperatorDescTraits +{ + static constexpr DML_OPERATOR_TYPE Type = DML_OPERATOR_UNFOLD; +}; + template <> struct OperatorDescTraits { @@ -2073,6 +2091,12 @@ struct OperatorTypeTraits<(DML_OPERATOR_TYPE)DML_OPERATOR_RESAMPLE2> using DescType = DML_RESAMPLE2_OPERATOR_DESC; }; +template <> +struct OperatorTypeTraits<(DML_OPERATOR_TYPE)DML_OPERATOR_RESAMPLE3> +{ + using DescType = DML_RESAMPLE3_OPERATOR_DESC; +}; + template <> struct OperatorTypeTraits<(DML_OPERATOR_TYPE)DML_OPERATOR_RESAMPLE_GRAD1> { @@ -2103,6 +2127,18 @@ struct OperatorTypeTraits<(DML_OPERATOR_TYPE)DML_OPERATOR_MATRIX_MULTIPLY_INTEGE using DescType = DML_MATRIX_MULTIPLY_INTEGER_TO_FLOAT_OPERATOR_DESC; }; +template <> +struct OperatorTypeTraits<(DML_OPERATOR_TYPE)DML_OPERATOR_FOLD> +{ + using DescType = DML_FOLD_OPERATOR_DESC; +}; + +template <> +struct OperatorTypeTraits<(DML_OPERATOR_TYPE)DML_OPERATOR_UNFOLD> +{ + using DescType = DML_UNFOLD_OPERATOR_DESC; +}; + template <> struct OperatorTypeTraits<(DML_OPERATOR_TYPE)DML_OPERATOR_MEAN_VARIANCE_NORMALIZATION2> { @@ -2575,6 +2611,8 @@ auto OperatorTypeVisitor(DML_OPERATOR_TYPE type, Visitor&& visitor, Ts&&... args return std::invoke(std::forward(visitor), DML_BATCH_NORMALIZATION_TRAINING_OPERATOR_DESC{}, std::forward(args)...); case DML_OPERATOR_RESAMPLE2: return std::invoke(std::forward(visitor), DML_RESAMPLE2_OPERATOR_DESC{}, std::forward(args)...); + case DML_OPERATOR_RESAMPLE3: + return std::invoke(std::forward(visitor), DML_RESAMPLE3_OPERATOR_DESC{}, std::forward(args)...); case DML_OPERATOR_RESAMPLE_GRAD1: return std::invoke(std::forward(visitor), DML_RESAMPLE_GRAD1_OPERATOR_DESC{}, std::forward(args)...); case DML_OPERATOR_DIAGONAL_MATRIX1: @@ -2585,6 +2623,10 @@ auto OperatorTypeVisitor(DML_OPERATOR_TYPE type, Visitor&& visitor, Ts&&... 
args return std::invoke(std::forward(visitor), DML_QUANTIZED_LINEAR_AVERAGE_POOLING_OPERATOR_DESC{}, std::forward(args)...); case DML_OPERATOR_MATRIX_MULTIPLY_INTEGER_TO_FLOAT: return std::invoke(std::forward(visitor), DML_MATRIX_MULTIPLY_INTEGER_TO_FLOAT_OPERATOR_DESC{}, std::forward(args)...); + case DML_OPERATOR_FOLD: + return std::invoke(std::forward(visitor), DML_FOLD_OPERATOR_DESC{}, std::forward(args)...); + case DML_OPERATOR_UNFOLD: + return std::invoke(std::forward(visitor), DML_UNFOLD_OPERATOR_DESC{}, std::forward(args)...); case DML_OPERATOR_MEAN_VARIANCE_NORMALIZATION2: return std::invoke(std::forward(visitor), DML_MEAN_VARIANCE_NORMALIZATION2_OPERATOR_DESC{}, std::forward(args)...); case DML_OPERATOR_MULTIHEAD_ATTENTION1: @@ -2650,7 +2692,6 @@ auto OperatorTypeVisitor(DML_OPERATOR_TYPE type, Visitor&& visitor, Ts&&... args } } - namespace StringifyHelpers { template @@ -2871,6 +2912,7 @@ inline gsl::czstring ToString(DML_OPERATOR_TYPE value) case DML_OPERATOR_ACTIVATION_SWISH: return "DML_OPERATOR_ACTIVATION_SWISH"; case DML_OPERATOR_ACTIVATION_HARD_SWISH: return "DML_OPERATOR_ACTIVATION_HARD_SWISH"; case DML_OPERATOR_RESAMPLE2: return "DML_OPERATOR_RESAMPLE2"; + case DML_OPERATOR_RESAMPLE3: return "DML_OPERATOR_RESAMPLE3"; case DML_OPERATOR_RESAMPLE_GRAD1: return "DML_OPERATOR_RESAMPLE_GRAD1"; case DML_OPERATOR_DIAGONAL_MATRIX1: return "DML_OPERATOR_DIAGONAL_MATRIX1"; case DML_OPERATOR_MULTIHEAD_ATTENTION: return "DML_OPERATOR_MULTIHEAD_ATTENTION"; @@ -2880,6 +2922,9 @@ inline gsl::czstring ToString(DML_OPERATOR_TYPE value) case DML_OPERATOR_MULTIHEAD_ATTENTION1: return "DML_OPERATOR_MULTIHEAD_ATTENTION1"; case DML_OPERATOR_QUANTIZE: return "DML_OPERATOR_QUANTIZE"; case DML_OPERATOR_DEQUANTIZE: return "DML_OPERATOR_DEQUANTIZE"; + case DML_OPERATOR_ROI_ALIGN_GRAD: return "DML_OPERATOR_ROI_ALIGN_GRAD"; + case DML_OPERATOR_FOLD: return "DML_OPERATOR_FOLD"; + case DML_OPERATOR_UNFOLD: return "DML_OPERATOR_UNFOLD"; default: assert(false); return ""; @@ -2971,6 +3016,7 @@ inline gsl::czstring ToString(DML_PADDING_MODE value) case DML_PADDING_MODE_EDGE: return "DML_PADDING_MODE_EDGE"; case DML_PADDING_MODE_REFLECTION: return "DML_PADDING_MODE_REFLECTION"; case DML_PADDING_MODE_SYMMETRIC: return "DML_PADDING_MODE_SYMMETRIC"; + case DML_PADDING_MODE_WRAP: return "DML_PADDING_MODE_WRAP"; default: assert(false); return ""; @@ -3036,6 +3082,7 @@ inline gsl::czstring ToString(DML_FEATURE_LEVEL value) case DML_FEATURE_LEVEL_6_1: return "DML_FEATURE_LEVEL_6_1"; case DML_FEATURE_LEVEL_6_2: return "DML_FEATURE_LEVEL_6_2"; case DML_FEATURE_LEVEL_6_3: return "DML_FEATURE_LEVEL_6_3"; + case DML_FEATURE_LEVEL_6_4: return "DML_FEATURE_LEVEL_6_4"; default: assert(false); return ""; diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/External/DirectMLHelpers/DirectMLSchema.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/External/DirectMLHelpers/DirectMLSchema.h index e0ccb2f51f109..14a7383e67897 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/External/DirectMLHelpers/DirectMLSchema.h +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/External/DirectMLHelpers/DirectMLSchema.h @@ -2306,6 +2306,26 @@ constexpr DML_OPERATOR_SCHEMA DML_RESAMPLE2_OPERATOR_SCHEMA { DML_RESAMPLE2_OPERATOR_SCHEMA_FIELDS, }; +constexpr DML_SCHEMA_FIELD DML_RESAMPLE3_OPERATOR_SCHEMA_FIELDS[9] { + DML_SCHEMA_FIELD { DML_SCHEMA_FIELD_KIND_INPUT_TENSOR, DML_SCHEMA_FIELD_TYPE_TENSOR_DESC, "InputTensor", false }, + DML_SCHEMA_FIELD { DML_SCHEMA_FIELD_KIND_OUTPUT_TENSOR, 
DML_SCHEMA_FIELD_TYPE_TENSOR_DESC, "OutputTensor", false }, + DML_SCHEMA_FIELD { DML_SCHEMA_FIELD_KIND_ATTRIBUTE, DML_SCHEMA_FIELD_TYPE_UINT, "InterpolationMode", false }, + DML_SCHEMA_FIELD { DML_SCHEMA_FIELD_KIND_ATTRIBUTE, DML_SCHEMA_FIELD_TYPE_UINT, "RoundingDirection", false }, + DML_SCHEMA_FIELD { DML_SCHEMA_FIELD_KIND_ATTRIBUTE, DML_SCHEMA_FIELD_TYPE_UINT, "DimensionCount", false }, + DML_SCHEMA_FIELD { DML_SCHEMA_FIELD_KIND_ATTRIBUTE, DML_SCHEMA_FIELD_TYPE_FLOAT_ARRAY, "Scales", false }, + DML_SCHEMA_FIELD { DML_SCHEMA_FIELD_KIND_ATTRIBUTE, DML_SCHEMA_FIELD_TYPE_FLOAT_ARRAY, "InputPixelOffsets", false }, + DML_SCHEMA_FIELD { DML_SCHEMA_FIELD_KIND_ATTRIBUTE, DML_SCHEMA_FIELD_TYPE_FLOAT_ARRAY, "OutputPixelOffsets", false }, + DML_SCHEMA_FIELD { DML_SCHEMA_FIELD_KIND_ATTRIBUTE, DML_SCHEMA_FIELD_TYPE_UINT, "Antialiased", false }, +}; + +constexpr DML_OPERATOR_SCHEMA DML_RESAMPLE3_OPERATOR_SCHEMA { + "DML_OPERATOR_RESAMPLE3", + DML_OPERATOR_RESAMPLE3, + DML_SCHEMA_OPERATOR_SUPPORT_FLAG_NONE, + 9, + DML_RESAMPLE3_OPERATOR_SCHEMA_FIELDS, +}; + constexpr DML_SCHEMA_FIELD DML_RESAMPLE_GRAD1_OPERATOR_SCHEMA_FIELDS[8] { DML_SCHEMA_FIELD { DML_SCHEMA_FIELD_KIND_INPUT_TENSOR, DML_SCHEMA_FIELD_TYPE_TENSOR_DESC, "InputGradientTensor", false }, DML_SCHEMA_FIELD { DML_SCHEMA_FIELD_KIND_OUTPUT_TENSOR, DML_SCHEMA_FIELD_TYPE_TENSOR_DESC, "OutputGradientTensor", false }, @@ -2414,6 +2434,44 @@ constexpr DML_OPERATOR_SCHEMA DML_MATRIX_MULTIPLY_INTEGER_TO_FLOAT_OPERATOR_SCHE DML_MATRIX_MULTIPLY_INTEGER_TO_FLOAT_OPERATOR_SCHEMA_FIELDS, }; +constexpr DML_SCHEMA_FIELD DML_FOLD_OPERATOR_SCHEMA_FIELDS[8] { + DML_SCHEMA_FIELD { DML_SCHEMA_FIELD_KIND_INPUT_TENSOR, DML_SCHEMA_FIELD_TYPE_TENSOR_DESC, "InputTensor", false }, + DML_SCHEMA_FIELD { DML_SCHEMA_FIELD_KIND_OUTPUT_TENSOR, DML_SCHEMA_FIELD_TYPE_TENSOR_DESC, "OutputTensor", false }, + DML_SCHEMA_FIELD { DML_SCHEMA_FIELD_KIND_ATTRIBUTE, DML_SCHEMA_FIELD_TYPE_UINT, "DimensionCount", false }, + DML_SCHEMA_FIELD { DML_SCHEMA_FIELD_KIND_ATTRIBUTE, DML_SCHEMA_FIELD_TYPE_UINT_ARRAY, "WindowSizes", false }, + DML_SCHEMA_FIELD { DML_SCHEMA_FIELD_KIND_ATTRIBUTE, DML_SCHEMA_FIELD_TYPE_UINT_ARRAY, "Strides", false }, + DML_SCHEMA_FIELD { DML_SCHEMA_FIELD_KIND_ATTRIBUTE, DML_SCHEMA_FIELD_TYPE_UINT_ARRAY, "Dilations", false }, + DML_SCHEMA_FIELD { DML_SCHEMA_FIELD_KIND_ATTRIBUTE, DML_SCHEMA_FIELD_TYPE_UINT_ARRAY, "StartPadding", false }, + DML_SCHEMA_FIELD { DML_SCHEMA_FIELD_KIND_ATTRIBUTE, DML_SCHEMA_FIELD_TYPE_UINT_ARRAY, "EndPadding", false }, +}; + +constexpr DML_OPERATOR_SCHEMA DML_FOLD_OPERATOR_SCHEMA { + "DML_OPERATOR_FOLD", + DML_OPERATOR_FOLD, + DML_SCHEMA_OPERATOR_SUPPORT_FLAG_NONE, + 8, + DML_FOLD_OPERATOR_SCHEMA_FIELDS, +}; + +constexpr DML_SCHEMA_FIELD DML_UNFOLD_OPERATOR_SCHEMA_FIELDS[8] { + DML_SCHEMA_FIELD { DML_SCHEMA_FIELD_KIND_INPUT_TENSOR, DML_SCHEMA_FIELD_TYPE_TENSOR_DESC, "InputTensor", false }, + DML_SCHEMA_FIELD { DML_SCHEMA_FIELD_KIND_OUTPUT_TENSOR, DML_SCHEMA_FIELD_TYPE_TENSOR_DESC, "OutputTensor", false }, + DML_SCHEMA_FIELD { DML_SCHEMA_FIELD_KIND_ATTRIBUTE, DML_SCHEMA_FIELD_TYPE_UINT, "DimensionCount", false }, + DML_SCHEMA_FIELD { DML_SCHEMA_FIELD_KIND_ATTRIBUTE, DML_SCHEMA_FIELD_TYPE_UINT_ARRAY, "WindowSizes", false }, + DML_SCHEMA_FIELD { DML_SCHEMA_FIELD_KIND_ATTRIBUTE, DML_SCHEMA_FIELD_TYPE_UINT_ARRAY, "Strides", false }, + DML_SCHEMA_FIELD { DML_SCHEMA_FIELD_KIND_ATTRIBUTE, DML_SCHEMA_FIELD_TYPE_UINT_ARRAY, "Dilations", false }, + DML_SCHEMA_FIELD { DML_SCHEMA_FIELD_KIND_ATTRIBUTE, DML_SCHEMA_FIELD_TYPE_UINT_ARRAY, "StartPadding", 
false }, + DML_SCHEMA_FIELD { DML_SCHEMA_FIELD_KIND_ATTRIBUTE, DML_SCHEMA_FIELD_TYPE_UINT_ARRAY, "EndPadding", false }, +}; + +constexpr DML_OPERATOR_SCHEMA DML_UNFOLD_OPERATOR_SCHEMA { + "DML_OPERATOR_UNFOLD", + DML_OPERATOR_UNFOLD, + DML_SCHEMA_OPERATOR_SUPPORT_FLAG_NONE, + 8, + DML_UNFOLD_OPERATOR_SCHEMA_FIELDS, +}; + constexpr DML_SCHEMA_FIELD DML_MEAN_VARIANCE_NORMALIZATION2_OPERATOR_SCHEMA_FIELDS[10] { DML_SCHEMA_FIELD { DML_SCHEMA_FIELD_KIND_INPUT_TENSOR, DML_SCHEMA_FIELD_TYPE_TENSOR_DESC, "InputTensor", false }, DML_SCHEMA_FIELD { DML_SCHEMA_FIELD_KIND_INPUT_TENSOR, DML_SCHEMA_FIELD_TYPE_TENSOR_DESC, "ScaleTensor", true }, diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/External/DirectMLHelpers/DmlGraphDeserialization.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/External/DirectMLHelpers/DmlGraphDeserialization.h index 9decf0dce1bb2..203df0b3b8371 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/External/DirectMLHelpers/DmlGraphDeserialization.h +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/External/DirectMLHelpers/DmlGraphDeserialization.h @@ -11,4 +11,4 @@ struct NodeIndex DmlSerializedGraphDesc DeserializeDmlGraph( const uint8_t* flatbufferGraphDescBlob, - /*out*/ std::vector>& rawData); \ No newline at end of file + /*out*/ std::vector>& rawData); diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/External/DirectMLHelpers/GeneratedSchemaHelpers.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/External/DirectMLHelpers/GeneratedSchemaHelpers.h index 298ecd657635e..23b5a491c7d96 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/External/DirectMLHelpers/GeneratedSchemaHelpers.h +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/External/DirectMLHelpers/GeneratedSchemaHelpers.h @@ -1,4 +1,4 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. +// Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. 
#pragma once @@ -1422,6 +1422,20 @@ inline std::vector GetFields(const DML_RESAMPLE2_OPERATOR_DESC& d OperatorField(&DML_RESAMPLE2_OPERATOR_SCHEMA.Fields[7], ToOperatorFieldType(static_cast(desc.OutputPixelOffsets), desc.DimensionCount)), }; } +inline std::vector GetFields(const DML_RESAMPLE3_OPERATOR_DESC& desc) +{ + return { + OperatorField(&DML_RESAMPLE3_OPERATOR_SCHEMA.Fields[0], ToOperatorFieldType(static_cast(desc.InputTensor))), + OperatorField(&DML_RESAMPLE3_OPERATOR_SCHEMA.Fields[1], ToOperatorFieldType(static_cast(desc.OutputTensor))), + OperatorField(&DML_RESAMPLE3_OPERATOR_SCHEMA.Fields[2], ToOperatorFieldType(static_cast(desc.InterpolationMode))), + OperatorField(&DML_RESAMPLE3_OPERATOR_SCHEMA.Fields[3], ToOperatorFieldType(static_cast(desc.RoundingDirection))), + OperatorField(&DML_RESAMPLE3_OPERATOR_SCHEMA.Fields[4], ToOperatorFieldType(static_cast(desc.DimensionCount))), + OperatorField(&DML_RESAMPLE3_OPERATOR_SCHEMA.Fields[5], ToOperatorFieldType(static_cast(desc.Scales), desc.DimensionCount)), + OperatorField(&DML_RESAMPLE3_OPERATOR_SCHEMA.Fields[6], ToOperatorFieldType(static_cast(desc.InputPixelOffsets), desc.DimensionCount)), + OperatorField(&DML_RESAMPLE3_OPERATOR_SCHEMA.Fields[7], ToOperatorFieldType(static_cast(desc.OutputPixelOffsets), desc.DimensionCount)), + OperatorField(&DML_RESAMPLE3_OPERATOR_SCHEMA.Fields[8], ToOperatorFieldType(static_cast(desc.Antialiased))), + }; +} inline std::vector GetFields(const DML_RESAMPLE_GRAD1_OPERATOR_DESC& desc) { return { @@ -1500,6 +1514,32 @@ inline std::vector GetFields(const DML_MATRIX_MULTIPLY_INTEGER_TO OperatorField(&DML_MATRIX_MULTIPLY_INTEGER_TO_FLOAT_OPERATOR_SCHEMA.Fields[7], ToOperatorFieldType(static_cast(desc.OutputTensor))), }; } +inline std::vector GetFields(const DML_FOLD_OPERATOR_DESC& desc) +{ + return { + OperatorField(&DML_FOLD_OPERATOR_SCHEMA.Fields[0], ToOperatorFieldType(static_cast(desc.InputTensor))), + OperatorField(&DML_FOLD_OPERATOR_SCHEMA.Fields[1], ToOperatorFieldType(static_cast(desc.OutputTensor))), + OperatorField(&DML_FOLD_OPERATOR_SCHEMA.Fields[2], ToOperatorFieldType(static_cast(desc.DimensionCount))), + OperatorField(&DML_FOLD_OPERATOR_SCHEMA.Fields[3], ToOperatorFieldType(static_cast(desc.WindowSizes), desc.DimensionCount)), + OperatorField(&DML_FOLD_OPERATOR_SCHEMA.Fields[4], ToOperatorFieldType(static_cast(desc.Strides), desc.DimensionCount)), + OperatorField(&DML_FOLD_OPERATOR_SCHEMA.Fields[5], ToOperatorFieldType(static_cast(desc.Dilations), desc.DimensionCount)), + OperatorField(&DML_FOLD_OPERATOR_SCHEMA.Fields[6], ToOperatorFieldType(static_cast(desc.StartPadding), desc.DimensionCount)), + OperatorField(&DML_FOLD_OPERATOR_SCHEMA.Fields[7], ToOperatorFieldType(static_cast(desc.EndPadding), desc.DimensionCount)), + }; +} +inline std::vector GetFields(const DML_UNFOLD_OPERATOR_DESC& desc) +{ + return { + OperatorField(&DML_UNFOLD_OPERATOR_SCHEMA.Fields[0], ToOperatorFieldType(static_cast(desc.InputTensor))), + OperatorField(&DML_UNFOLD_OPERATOR_SCHEMA.Fields[1], ToOperatorFieldType(static_cast(desc.OutputTensor))), + OperatorField(&DML_UNFOLD_OPERATOR_SCHEMA.Fields[2], ToOperatorFieldType(static_cast(desc.DimensionCount))), + OperatorField(&DML_UNFOLD_OPERATOR_SCHEMA.Fields[3], ToOperatorFieldType(static_cast(desc.WindowSizes), desc.DimensionCount)), + OperatorField(&DML_UNFOLD_OPERATOR_SCHEMA.Fields[4], ToOperatorFieldType(static_cast(desc.Strides), desc.DimensionCount)), + OperatorField(&DML_UNFOLD_OPERATOR_SCHEMA.Fields[5], ToOperatorFieldType(static_cast(desc.Dilations), 
desc.DimensionCount)), + OperatorField(&DML_UNFOLD_OPERATOR_SCHEMA.Fields[6], ToOperatorFieldType(static_cast(desc.StartPadding), desc.DimensionCount)), + OperatorField(&DML_UNFOLD_OPERATOR_SCHEMA.Fields[7], ToOperatorFieldType(static_cast(desc.EndPadding), desc.DimensionCount)), + }; +} inline std::vector GetFields(const DML_MEAN_VARIANCE_NORMALIZATION2_OPERATOR_DESC& desc) { return { @@ -1912,11 +1952,14 @@ inline const DML_OPERATOR_SCHEMA& GetSchema(DML_OPERATOR_TYPE operatorType) case DML_OPERATOR_ROI_ALIGN_GRAD: return DML_ROI_ALIGN_GRAD_OPERATOR_SCHEMA; case DML_OPERATOR_BATCH_NORMALIZATION_TRAINING: return DML_BATCH_NORMALIZATION_TRAINING_OPERATOR_SCHEMA; case DML_OPERATOR_RESAMPLE2: return DML_RESAMPLE2_OPERATOR_SCHEMA; + case DML_OPERATOR_RESAMPLE3: return DML_RESAMPLE3_OPERATOR_SCHEMA; case DML_OPERATOR_RESAMPLE_GRAD1: return DML_RESAMPLE_GRAD1_OPERATOR_SCHEMA; case DML_OPERATOR_DIAGONAL_MATRIX1: return DML_DIAGONAL_MATRIX1_OPERATOR_SCHEMA; case DML_OPERATOR_MULTIHEAD_ATTENTION: return DML_MULTIHEAD_ATTENTION_OPERATOR_SCHEMA; case DML_OPERATOR_QUANTIZED_LINEAR_AVERAGE_POOLING: return DML_QUANTIZED_LINEAR_AVERAGE_POOLING_OPERATOR_SCHEMA; case DML_OPERATOR_MATRIX_MULTIPLY_INTEGER_TO_FLOAT: return DML_MATRIX_MULTIPLY_INTEGER_TO_FLOAT_OPERATOR_SCHEMA; + case DML_OPERATOR_FOLD: return DML_FOLD_OPERATOR_SCHEMA; + case DML_OPERATOR_UNFOLD: return DML_UNFOLD_OPERATOR_SCHEMA; case DML_OPERATOR_MEAN_VARIANCE_NORMALIZATION2: return DML_MEAN_VARIANCE_NORMALIZATION2_OPERATOR_SCHEMA; case DML_OPERATOR_MULTIHEAD_ATTENTION1: return DML_MULTIHEAD_ATTENTION1_OPERATOR_SCHEMA; case DML_OPERATOR_QUANTIZE: return DML_QUANTIZE_OPERATOR_SCHEMA; @@ -2095,11 +2138,14 @@ inline const bool IsValidOperator(DML_OPERATOR_TYPE operatorType) case DML_OPERATOR_ROI_ALIGN_GRAD: case DML_OPERATOR_BATCH_NORMALIZATION_TRAINING: case DML_OPERATOR_RESAMPLE2: + case DML_OPERATOR_RESAMPLE3: case DML_OPERATOR_RESAMPLE_GRAD1: case DML_OPERATOR_DIAGONAL_MATRIX1: case DML_OPERATOR_MULTIHEAD_ATTENTION: case DML_OPERATOR_QUANTIZED_LINEAR_AVERAGE_POOLING: case DML_OPERATOR_MATRIX_MULTIPLY_INTEGER_TO_FLOAT: + case DML_OPERATOR_FOLD: + case DML_OPERATOR_UNFOLD: case DML_OPERATOR_MEAN_VARIANCE_NORMALIZATION2: case DML_OPERATOR_MULTIHEAD_ATTENTION1: case DML_OPERATOR_QUANTIZE: @@ -2695,6 +2741,10 @@ inline AbstractOperatorDesc ConvertOperatorDesc(const DML_OPERATOR_DESC& opDesc) return AbstractOperatorDesc( &DML_RESAMPLE2_OPERATOR_SCHEMA, GetFields(*static_cast(opDesc.Desc))); + case DML_OPERATOR_RESAMPLE3: + return AbstractOperatorDesc( + &DML_RESAMPLE3_OPERATOR_SCHEMA, + GetFields(*static_cast(opDesc.Desc))); case DML_OPERATOR_RESAMPLE_GRAD1: return AbstractOperatorDesc( &DML_RESAMPLE_GRAD1_OPERATOR_SCHEMA, @@ -2715,6 +2765,14 @@ inline AbstractOperatorDesc ConvertOperatorDesc(const DML_OPERATOR_DESC& opDesc) return AbstractOperatorDesc( &DML_MATRIX_MULTIPLY_INTEGER_TO_FLOAT_OPERATOR_SCHEMA, GetFields(*static_cast(opDesc.Desc))); + case DML_OPERATOR_FOLD: + return AbstractOperatorDesc( + &DML_FOLD_OPERATOR_SCHEMA, + GetFields(*static_cast(opDesc.Desc))); + case DML_OPERATOR_UNFOLD: + return AbstractOperatorDesc( + &DML_UNFOLD_OPERATOR_SCHEMA, + GetFields(*static_cast(opDesc.Desc))); case DML_OPERATOR_MEAN_VARIANCE_NORMALIZATION2: return AbstractOperatorDesc( &DML_MEAN_VARIANCE_NORMALIZATION2_OPERATOR_SCHEMA, diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/External/DirectMLHelpers/GeneratedSchemaTypes.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/External/DirectMLHelpers/GeneratedSchemaTypes.h 
index a94bb67b68d36..5ea0d470b20ce 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/External/DirectMLHelpers/GeneratedSchemaTypes.h +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/External/DirectMLHelpers/GeneratedSchemaTypes.h @@ -1,21 +1,21 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. +// Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. #pragma once using ApiAttributeVariant = std::variant< - const DML_TENSOR_DESC*, - const DML_OPERATOR_DESC*, - UINT, - UINT64, - INT, - FLOAT, - const UINT*, - const INT*, - const FLOAT*, - const DML_SCALE_BIAS*, - DML_SIZE_2D, - DML_SCALAR_UNION, + const DML_TENSOR_DESC*, + const DML_OPERATOR_DESC*, + UINT, + UINT64, + INT, + FLOAT, + const UINT*, + const INT*, + const FLOAT*, + const DML_SCALE_BIAS*, + DML_SIZE_2D, + DML_SCALAR_UNION, BOOL >; @@ -39,20 +39,20 @@ namespace OperatorFieldTypes } using OperatorFieldVariant = std::variant< - OperatorFieldTypes::TensorDesc, - OperatorFieldTypes::TensorDescArray, - OperatorFieldTypes::FusedActivationOperatorDesc, - OperatorFieldTypes::FusedActivationOperatorDescArray, - OperatorFieldTypes::UInt, - OperatorFieldTypes::UInt64, - OperatorFieldTypes::Int, - OperatorFieldTypes::Float, - OperatorFieldTypes::UIntArray, - OperatorFieldTypes::IntArray, - OperatorFieldTypes::FloatArray, - OperatorFieldTypes::ScaleBias, - OperatorFieldTypes::Size2D, - OperatorFieldTypes::ScalarUnion, + OperatorFieldTypes::TensorDesc, + OperatorFieldTypes::TensorDescArray, + OperatorFieldTypes::FusedActivationOperatorDesc, + OperatorFieldTypes::FusedActivationOperatorDescArray, + OperatorFieldTypes::UInt, + OperatorFieldTypes::UInt64, + OperatorFieldTypes::Int, + OperatorFieldTypes::Float, + OperatorFieldTypes::UIntArray, + OperatorFieldTypes::IntArray, + OperatorFieldTypes::FloatArray, + OperatorFieldTypes::ScaleBias, + OperatorFieldTypes::Size2D, + OperatorFieldTypes::ScalarUnion, OperatorFieldTypes::Bool >; @@ -126,4 +126,3 @@ class OperatorField const DML_SCHEMA_FIELD* m_schema; OperatorFieldVariant m_data; }; - diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperatorResize.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperatorResize.cpp index 5256e01f86fb6..d31203308aef7 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperatorResize.cpp +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperatorResize.cpp @@ -263,11 +263,6 @@ class DmlOperatorResize : public DmlOperator, public ResizeHelper std::string mode = kernelCreationContext.GetOptionalAttribute(AttrName::Mode, "NEAREST"); DML_INTERPOLATION_MODE interpolationMode = Dml::MapStringToInteropolationMode(mode); - -#if DML_TARGET_VERSION >= 0x6400 - const int antialiased = kernelCreationContext.GetOptionalAttribute(AttrName::Antialiased, 0); -#endif - // Map ONNX to DML's mode using offsets and rounding direction. // These offsets are in addition to the coordinate transform offsets. 
DML_AXIS_DIRECTION roundingDirection = DML_AXIS_DIRECTION_DECREASING; @@ -307,12 +302,11 @@ class DmlOperatorResize : public DmlOperator, public ResizeHelper std::vector inputDescs = GetDmlInputDescs(); std::vector outputDescs = GetDmlOutputDescs(); -#if DML_TARGET_VERSION >= 0x6400 + DML_OPERATOR_DESC opDesc = {}; + const int antialiased = kernelCreationContext.GetOptionalAttribute(AttrName::Antialiased, 0); + DML_RESAMPLE3_OPERATOR_DESC operatorDesc = {}; operatorDesc.Antialiased = static_cast(antialiased); -#else - DML_RESAMPLE2_OPERATOR_DESC operatorDesc = {}; -#endif operatorDesc.InputTensor = inputDescs.data(); operatorDesc.OutputTensor = outputDescs.data(); operatorDesc.InterpolationMode = interpolationMode; @@ -321,11 +315,8 @@ class DmlOperatorResize : public DmlOperator, public ResizeHelper operatorDesc.DimensionCount = gsl::narrow_cast(paddedScales.size()); operatorDesc.InputPixelOffsets = inputPixelOffsets.data(); operatorDesc.OutputPixelOffsets = outputPixelOffsets.data(); -#if DML_TARGET_VERSION >= 0x6400 - DML_OPERATOR_DESC opDesc = { DML_OPERATOR_RESAMPLE3, &operatorDesc }; -#else - DML_OPERATOR_DESC opDesc = { DML_OPERATOR_RESAMPLE2, &operatorDesc }; -#endif + opDesc = { DML_OPERATOR_RESAMPLE3, &operatorDesc }; + SetDmlOperatorDesc(opDesc, kernelCreationContext); } }; @@ -368,10 +359,8 @@ void CALLBACK QueryResize(IMLOperatorSupportQueryContextPrivate* context, bool* DML_OP_DEFINE_CREATION_FUNCTION(Resize10, VersionedKernel); DML_OP_DEFINE_CREATION_FUNCTION(Resize11, VersionedKernel); DML_OP_DEFINE_CREATION_FUNCTION(Resize13, VersionedKernel); -#if DML_TARGET_VERSION >= 0x6400 DML_OP_DEFINE_CREATION_FUNCTION(Resize18, VersionedKernel); DML_OP_DEFINE_CREATION_FUNCTION(Resize19, VersionedKernel); -#endif DML_OP_DEFINE_CREATION_FUNCTION(Upsample7, VersionedKernel); DML_OP_DEFINE_CREATION_FUNCTION(Upsample9, VersionedKernel); DML_OP_DEFINE_CREATION_FUNCTION(Upsample10, VersionedKernel); diff --git a/onnxruntime/core/providers/dml/OperatorAuthorHelper/OperatorHelper.cpp b/onnxruntime/core/providers/dml/OperatorAuthorHelper/OperatorHelper.cpp index 3a7cf28ef903e..deed62901dfb0 100644 --- a/onnxruntime/core/providers/dml/OperatorAuthorHelper/OperatorHelper.cpp +++ b/onnxruntime/core/providers/dml/OperatorAuthorHelper/OperatorHelper.cpp @@ -852,7 +852,7 @@ namespace OperatorHelper { ML_CHECK_VALID_ARGUMENT(outputShape[C] == gsl::narrow_cast(m_outputShapes[0].GetShape()[C]), "Output channel must be equivalent to filter channel."); - } + } for (size_t i = 0; i < m_kernel.spatialDimensionCount; ++i) { @@ -1857,14 +1857,13 @@ namespace OperatorHelper DowncastDimensions(gsl::span(shapeData), /*out*/ m_blockShape); const uint32_t dimCount = gsl::narrow_cast(m_blockShape.size()); - m_dilations = {dimCount, 1}; - m_pads = {dimCount * 2, 0}; - m_strides = {dimCount, 1}; + m_dilations.assign(dimCount, 1); + m_pads.assign(dimCount, 0); + m_strides.assign(dimCount, 1); if (kernelInformation.HasAttribute(AttrName::Dilations, MLOperatorAttributeType::IntArray)) { shapeData = kernelInformation.GetAttributes().GetOptionalAttributeVectorInt32(AttrName::Dilations); - m_dilations.resize(shapeData.size()); DowncastDimensions(gsl::span(shapeData), /*out*/ m_dilations); ML_CHECK_VALID_ARGUMENT(m_dilations.size() == dimCount); } @@ -1872,7 +1871,6 @@ namespace OperatorHelper if (kernelInformation.HasAttribute(AttrName::Pads, MLOperatorAttributeType::IntArray)) { shapeData = kernelInformation.GetAttributes().GetOptionalAttributeVectorInt32(AttrName::Pads); - m_pads.resize(shapeData.size()); 
DowncastDimensions(gsl::span(shapeData), /*out*/ m_pads); ML_CHECK_VALID_ARGUMENT(m_pads.size() == dimCount * 2); } @@ -1880,7 +1878,6 @@ namespace OperatorHelper if (kernelInformation.HasAttribute(AttrName::Strides, MLOperatorAttributeType::IntArray)) { shapeData = kernelInformation.GetAttributes().GetOptionalAttributeVectorInt32(AttrName::Strides); - m_strides.resize(shapeData.size()); DowncastDimensions(gsl::span(shapeData), /*out*/ m_strides); ML_CHECK_VALID_ARGUMENT(m_strides.size() == dimCount); } diff --git a/packages.config b/packages.config index 3f3e4f5298881..f69e5b4f27956 100644 --- a/packages.config +++ b/packages.config @@ -1,6 +1,6 @@  - + diff --git a/tools/nuget/generate_nuspec_for_native_nuget.py b/tools/nuget/generate_nuspec_for_native_nuget.py index 88d1cebc84f8d..60d1884a9591f 100644 --- a/tools/nuget/generate_nuspec_for_native_nuget.py +++ b/tools/nuget/generate_nuspec_for_native_nuget.py @@ -219,7 +219,7 @@ def add_common_dependencies(xml_text, package_name, version): def generate_dependencies(xml_text, package_name, version): - dml_dependency = '' + dml_dependency = '' if package_name == "Microsoft.AI.MachineLearning": xml_text.append("") From 0f1f3b7705ddc2fe4f371f78a8a8b6a0428a68de Mon Sep 17 00:00:00 2001 From: Scott McKay Date: Tue, 23 Jul 2024 20:21:55 +1000 Subject: [PATCH 34/35] CoreML: ML Program Slice (#21433) ### Description Add support for Slice ### Motivation and Context High priority models. --- .../coreml/builders/impl/builder_utils.cc | 10 +- .../coreml/builders/impl/builder_utils.h | 7 +- .../coreml/builders/impl/slice_op_builder.cc | 130 +++++++++++++----- .../coreml/builders/model_builder.cc | 7 + .../providers/coreml/builders/model_builder.h | 7 + .../providers/cpu/tensor/slice_op.test.cc | 2 +- .../apple/coreml_supported_mlprogram_ops.md | 1 + 7 files changed, 127 insertions(+), 37 deletions(-) diff --git a/onnxruntime/core/providers/coreml/builders/impl/builder_utils.cc b/onnxruntime/core/providers/coreml/builders/impl/builder_utils.cc index 2fcf9a1d7d9ba..ebb3f97895f06 100644 --- a/onnxruntime/core/providers/coreml/builders/impl/builder_utils.cc +++ b/onnxruntime/core/providers/coreml/builders/impl/builder_utils.cc @@ -164,6 +164,7 @@ void SetTensorTypeInfo(MILSpec::TensorType& tensor_type, MILSpec::DataType data_ void SetTensorTypeInfo(MILSpec::TensorType& tensor_type, MILSpec::DataType data_type, const ONNX_NAMESPACE::TensorShapeProto* shape, bool convert_scalar = false) { tensor_type.set_datatype(data_type); + if (shape) { auto rank = shape->dim_size(); if (convert_scalar && rank == 0) { @@ -313,7 +314,8 @@ void AddOperationInput(MILSpec::Operation& op, std::string_view input_name, std: (*op.mutable_inputs())[input_name] = std::move(arg); } -void AddOperationOutput(COREML_SPEC::MILSpec::Operation& op, const NodeArg& output) { +void AddOperationOutput(COREML_SPEC::MILSpec::Operation& op, const NodeArg& output, + std::optional override_element_type) { auto& outputs = *op.mutable_outputs(); auto& output_arg = *outputs.Add(); output_arg.set_name(output.Name()); @@ -321,8 +323,10 @@ void AddOperationOutput(COREML_SPEC::MILSpec::Operation& op, const NodeArg& outp MILSpec::ValueType& value = *output_arg.mutable_type(); MILSpec::TensorType& tensor_type = *value.mutable_tensortype(); - SetTensorTypeInfo(tensor_type, OnnxDataTypeToMILSpec(output.TypeAsProto()->tensor_type().elem_type()), - output.Shape(), /*convert_scalar*/ true); + auto elem_type = override_element_type ? 
*override_element_type + : output.TypeAsProto()->tensor_type().elem_type(); + + SetTensorTypeInfo(tensor_type, OnnxDataTypeToMILSpec(elem_type), output.Shape(), /*convert_scalar*/ true); } void AddPadTypeAndPads(COREML_SPEC::MILSpec::Operation& op, ModelBuilder& model_builder, std::string_view op_type, diff --git a/onnxruntime/core/providers/coreml/builders/impl/builder_utils.h b/onnxruntime/core/providers/coreml/builders/impl/builder_utils.h index 97fb83b6dc482..f012e6af0d718 100644 --- a/onnxruntime/core/providers/coreml/builders/impl/builder_utils.h +++ b/onnxruntime/core/providers/coreml/builders/impl/builder_utils.h @@ -134,7 +134,12 @@ void AddOperationInput(COREML_SPEC::MILSpec::Operation& op, /// /// Operation to update. /// NodeArg with details of output to add. -void AddOperationOutput(COREML_SPEC::MILSpec::Operation& op, const NodeArg& output); +/// +/// Override the element type. Only set to handle cases where we believe the data at runtime will be int32 but +/// the original ONNX node has type int64. +/// +void AddOperationOutput(COREML_SPEC::MILSpec::Operation& op, const NodeArg& output, + std::optional override_element_type = std::nullopt); /// /// Add pad_type and pad values. diff --git a/onnxruntime/core/providers/coreml/builders/impl/slice_op_builder.cc b/onnxruntime/core/providers/coreml/builders/impl/slice_op_builder.cc index 39bfbfe5bba1f..51fc3f2c11c73 100644 --- a/onnxruntime/core/providers/coreml/builders/impl/slice_op_builder.cc +++ b/onnxruntime/core/providers/coreml/builders/impl/slice_op_builder.cc @@ -4,6 +4,7 @@ #include "core/optimizer/initializer.h" #include "core/providers/coreml/builders/helper.h" #include "core/providers/coreml/builders/impl/base_op_builder.h" +#include "core/providers/coreml/builders/impl/builder_utils.h" #include "core/providers/coreml/builders/model_builder.h" #include "core/providers/coreml/builders/op_builder_factory.h" #include "core/providers/coreml/shape_utils.h" @@ -28,12 +29,14 @@ class SliceOpBuilder : public BaseOpBuilder { bool IsOpSupportedImpl(const Node& node, const OpBuilderInputParams& builder_params, const logging::Logger& logger) const override; + + bool SupportsMLProgram() const override { return true; } }; namespace { -Status PrepareSliceComputeMetadataFromConstantInitializers(const Node& slice_node, - const GraphViewer& graph_viewer, - SliceOp::PrepareForComputeMetadata& compute_metadata) { +Status PrepareSliceComputeMetadata(const Node& slice_node, + const GraphViewer& graph_viewer, + SliceOp::PrepareForComputeMetadata& compute_metadata) { // TODO largely copied from nnapi::SliceOpBuilder::AddToModelBuilderImpl. put it somewhere where it can be reused? 
const auto input_defs = slice_node.InputDefs(); @@ -114,55 +117,113 @@ void SliceOpBuilder::AddInitializersToSkip(ModelBuilder& model_builder, const No Status SliceOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, const Node& node, const logging::Logger& logger) const { + const auto& input_defs = node.InputDefs(); + const auto& output_defs = node.OutputDefs(); + std::vector data_shape; ORT_RETURN_IF_NOT(GetStaticShape(*node.InputDefs()[0], data_shape, logger), "Failed to get input shape."); + auto rank = data_shape.size(); SliceOp::PrepareForComputeMetadata compute_metadata{data_shape}; - ORT_RETURN_IF_ERROR(PrepareSliceComputeMetadataFromConstantInitializers(node, model_builder.GetGraphViewer(), - compute_metadata)); + ORT_RETURN_IF_ERROR(PrepareSliceComputeMetadata(node, model_builder.GetGraphViewer(), compute_metadata)); + +#if defined(COREML_ENABLE_MLPROGRAM) + if (model_builder.CreateMLProgram()) { + using namespace CoreML::Specification::MILSpec; // NOLINT + // https://apple.github.io/coremltools/source/coremltools.converters.mil.mil.ops.defs.html#coremltools.converters.mil.mil.ops.defs.iOS15.tensor_transformation.slice_by_index + + const InlinedVector begin_mask_values(rank, false); + InlinedVector end_mask_values(rank, false); + + // Special case - stepping backwards up to and including the first index in the dimension. + // In ONNX Slice, we use end <= -(rank + 1) to represent this. In CoreML, setting endids like that doesn't work, + // so use endmasks to specify the rest of the dimension instead. + for (size_t i = 0; i < rank; ++i) { + if (compute_metadata.steps_[i] < 0 && compute_metadata.ends_[i] == -1) { + end_mask_values[i] = true; + } + } - auto layer = model_builder.CreateNNLayer(node); - *layer->mutable_input()->Add() = node.InputDefs()[0]->Name(); - *layer->mutable_output()->Add() = node.OutputDefs()[0]->Name(); - auto* slice_static = layer->mutable_slicestatic(); + // Only int32 and float are supported by CoreML slice_by_index. + // We convert any int64 model input to int32 when running the CoreML model for the partition. + // Any other integer data created at runtime is the output from CoreML operations, and should int32 not int64. + // Based on that, we assume that the actual input when running will be int32, so we override the output data + // type to reflect this. + // If we were to leave it as TensorProto_DataType_INT64 the CoreML model would be invalid. + std::optional output_datatype; - for (size_t i = 0; i < compute_metadata.starts_.size(); ++i) { - const auto step = compute_metadata.steps_[i], - start = compute_metadata.starts_[i], - end = compute_metadata.ends_[i]; + int32_t input_type; + ORT_RETURN_IF_NOT(GetType(*node.InputDefs()[0], input_type, logger), "Failed to get input type"); - slice_static->add_beginids(start); - slice_static->add_beginmasks(false); + if (input_type == ONNX_NAMESPACE::TensorProto_DataType_INT64) { + output_datatype = ONNX_NAMESPACE::TensorProto_DataType_INT32; + } - if (step < 0 && end == -1) { - // Special case - stepping backwards up to and including the first index in the dimension. - // In ONNX Slice, we use end <= -(rank + 1) to represent this. In CoreML, setting endids like that doesn't work, - // so use endmasks to specify the rest of the dimension instead. 
- slice_static->add_endids(-1); // ignored - slice_static->add_endmasks(true); - } else { - slice_static->add_endids(end); - slice_static->add_endmasks(false); + auto op = model_builder.CreateOperation(node, "slice_by_index"); + + auto begin = model_builder.AddConstant(op->type(), "begin", AsSpan(compute_metadata.starts_)); + auto end = model_builder.AddConstant(op->type(), "end", AsSpan(compute_metadata.ends_)); + auto stride = model_builder.AddConstant(op->type(), "stride", AsSpan(compute_metadata.steps_)); + auto begin_mask = model_builder.AddConstant(op->type(), "begin_mask", AsSpan(begin_mask_values)); + auto end_mask = model_builder.AddConstant(op->type(), "end_mask", AsSpan(end_mask_values)); + + AddOperationInput(*op, "x", input_defs[0]->Name()); + AddOperationInput(*op, "begin", begin); + AddOperationInput(*op, "end", end); + AddOperationInput(*op, "stride", stride); + AddOperationInput(*op, "begin_mask", begin_mask); + AddOperationInput(*op, "end_mask", end_mask); + + AddOperationOutput(*op, *output_defs[0], output_datatype); + + model_builder.AddOperation(std::move(op)); + + } else // NOLINT +#endif // defined(COREML_ENABLE_MLPROGRAM) + { + auto layer = model_builder.CreateNNLayer(node); + *layer->mutable_input()->Add() = input_defs[0]->Name(); + *layer->mutable_output()->Add() = output_defs[0]->Name(); + auto* slice_static = layer->mutable_slicestatic(); + + for (size_t i = 0; i < rank; ++i) { + const auto step = compute_metadata.steps_[i], + start = compute_metadata.starts_[i], + end = compute_metadata.ends_[i]; + + slice_static->add_beginids(start); + slice_static->add_beginmasks(false); + + if (step < 0 && end == -1) { + // Special case - stepping backwards up to and including the first index in the dimension. + // In ONNX Slice, we use end <= -(rank + 1) to represent this. In CoreML, setting endids like that doesn't work, + // so use endmasks to specify the rest of the dimension instead. 
+ slice_static->add_endids(-1); // ignored + slice_static->add_endmasks(true); + } else { + slice_static->add_endids(end); + slice_static->add_endmasks(false); + } + + slice_static->add_strides(step); } - slice_static->add_strides(step); + model_builder.AddLayer(std::move(layer)); } - model_builder.AddLayer(std::move(layer)); return Status::OK(); } bool SliceOpBuilder::HasSupportedInputsImpl(const Node& node, const OpBuilderInputParams& /*input_params*/, const logging::Logger& logger) const { int32_t input_type; - if (!GetType(*node.InputDefs()[0], input_type, logger)) + if (!GetType(*node.InputDefs()[0], input_type, logger)) { return false; + } if (input_type != ONNX_NAMESPACE::TensorProto_DataType_FLOAT && input_type != ONNX_NAMESPACE::TensorProto_DataType_INT64) { - LOGS(logger, VERBOSE) << "[" << node.OpType() - << "] Input type: [" << input_type - << "] is not supported for now"; + LOGS(logger, VERBOSE) << "[" << node.OpType() << "] Input type: [" << input_type << "] is not supported"; return false; } @@ -197,9 +258,14 @@ bool SliceOpBuilder::IsOpSupportedImpl(const Node& node, const OpBuilderInputPar } SliceOp::PrepareForComputeMetadata compute_metadata{data_shape}; - ORT_THROW_IF_ERROR(PrepareSliceComputeMetadataFromConstantInitializers(node, builder_params.graph_viewer, - compute_metadata)); + auto status = PrepareSliceComputeMetadata(node, builder_params.graph_viewer, compute_metadata); + if (status != Status::OK()) { + LOGS(logger, VERBOSE) << "PrepareSliceComputeMetadata failed:" << status.ErrorMessage(); + return false; + } + if (!ValidateSliceComputeMetadataForCoreML(compute_metadata, logger)) { + // error logged in ValidateSliceComputeMetadataForCoreML return false; } diff --git a/onnxruntime/core/providers/coreml/builders/model_builder.cc b/onnxruntime/core/providers/coreml/builders/model_builder.cc index eec0fcce51dbc..9668bfcd09adf 100644 --- a/onnxruntime/core/providers/coreml/builders/model_builder.cc +++ b/onnxruntime/core/providers/coreml/builders/model_builder.cc @@ -839,6 +839,13 @@ Status ModelBuilder::RegisterModelInputOutput(const NodeArg& node_arg, bool is_i if (is_input) { // the model inputs need to be wired up as args to the 'main' function. auto tensor_value_type = CreateNamedTensorValueType(node_arg, /*convert_scalar*/ true); + + // we need to convert int64 to int32 here as well + if (data_type == ONNX_NAMESPACE::TensorProto_DataType_INT64) { + tensor_value_type.mutable_type()->mutable_tensortype()->set_datatype( + OnnxDataTypeToMILSpec(ONNX_NAMESPACE::TensorProto_DataType_INT32)); + } + tensor_value_type.set_name(name); mlprogram_main_fn_->mutable_inputs()->Add(std::move(tensor_value_type)); diff --git a/onnxruntime/core/providers/coreml/builders/model_builder.h b/onnxruntime/core/providers/coreml/builders/model_builder.h index 385588dbfdcb8..bb791fb902908 100644 --- a/onnxruntime/core/providers/coreml/builders/model_builder.h +++ b/onnxruntime/core/providers/coreml/builders/model_builder.h @@ -121,6 +121,13 @@ class ModelBuilder { return AddConstant(op_type, value_type, AsSpan(value), shape); } + // helper to convert a span of non-const data to const + template + std::string_view AddConstant(std::string_view op_type, std::string_view value_type, gsl::span value, + std::optional> shape = std::nullopt) { + return AddConstant(op_type, value_type, gsl::span(value), shape); + } + /// /// Add a scalar value as a 'const' operation. See AddConstant for details. 
/// diff --git a/onnxruntime/test/providers/cpu/tensor/slice_op.test.cc b/onnxruntime/test/providers/cpu/tensor/slice_op.test.cc index af54ae96ef86b..83b308b57f26b 100644 --- a/onnxruntime/test/providers/cpu/tensor/slice_op.test.cc +++ b/onnxruntime/test/providers/cpu/tensor/slice_op.test.cc @@ -90,7 +90,7 @@ void RunSliceTest(const std::vector& input_dims, run_test(false); - // NNAPI EP requires the starts/ends/axes/steps be initializers + // EPs like NNAPI and CoreML require the starts/ends/axes/steps be initializers run_test(true); } diff --git a/tools/ci_build/github/apple/coreml_supported_mlprogram_ops.md b/tools/ci_build/github/apple/coreml_supported_mlprogram_ops.md index 3b3790ba06599..c33184686c932 100644 --- a/tools/ci_build/github/apple/coreml_supported_mlprogram_ops.md +++ b/tools/ci_build/github/apple/coreml_supported_mlprogram_ops.md @@ -18,6 +18,7 @@ Keep in sync with doco generated from /docs/execution-providers/CoreML-Execution |ai.onnx:Relu|| |ai.onnx:Reshape|| |ai.onnx:Resize|See [resize_op_builder.cc](https://github.com/microsoft/onnxruntime/blob/main/onnxruntime/core/providers/coreml/builders/impl/resize_op_builder.cc) implementation. There are too many permutations to describe the valid combinations.| +|ai.onnx.Slice|starts/ends/axes/steps must be constant initializers.| |ai.onnx:Sub|| |ai.onnx:Sigmoid|| |ai:onnx:Tanh|| From f70215d4e6af2a1b5a0cc232460e2f86125b055d Mon Sep 17 00:00:00 2001 From: Changming Sun Date: Tue, 23 Jul 2024 10:00:36 -0700 Subject: [PATCH 35/35] Update C++ dependencies (#21410) 1. Update google benchmark from 1.8.3 to 1.8.5 2. Update google test from commit in main branch to tag 1.15.0 3. Update pybind11 from 2.12.0 to 2.13.1 4. Update pytorch cpuinfo to include the support for Arm Neoverse V2, Cortex X4, A720 and A520. 5. Update re2 from 2024-05-01 to 2024-07-02 6. Update cmake to 3.30.1 7. Update Linux docker images 8. 
Fix a warning in test/perftest/ort_test_session.cc:826:37: error: implicit conversion loses integer precision: 'streamoff' (aka 'long long') to 'const std::streamsize' (aka 'const long') [-Werror,-Wshorten-64-to-32] --- cgmanifests/generated/cgmanifest.json | 10 +- cmake/deps.txt | 10 +- cmake/external/abseil-cpp.cmake | 114 +++++++++++++----- ...2d342fd9479679d505d93a478a6f9cd50a47.patch | 12 +- onnxruntime/test/perftest/ort_test_session.cc | 4 +- tools/android_custom_build/Dockerfile | 4 +- .../azure-pipelines/linux-gpu-ci-pipeline.yml | 2 +- .../linux-gpu-tensorrt-ci-pipeline.yml | 2 +- .../stages/java-cuda-packaging-stage.yml | 2 +- .../jobs/py-linux-cuda-package-test-job.yml | 2 +- .../stages/py-cuda-packaging-stage.yml | 2 +- .../templates/download-deps.yml | 4 +- ...Dockerfile.manylinux2_28_training_cuda12_2 | 2 +- .../Dockerfile.ubuntu_cuda11_8_tensorrt8_6 | 2 +- .../Dockerfile.ubuntu_cuda11_tensorrt10 | 2 +- .../Dockerfile.ubuntu_cuda12_3_tensorrt8_6 | 2 +- .../Dockerfile.ubuntu_cuda12_tensorrt10 | 2 +- .../linux/docker/Dockerfile.ubuntu_openvino | 4 +- .../docker/Dockerfile.ubuntu_tensorrt_bin | 2 +- .../default/cpu/scripts/install_deps.sh | 2 +- .../default/cpu/scripts/install_deps.sh | 4 +- .../default/cuda11/scripts/install_deps.sh | 4 +- .../x86_64/default/cuda12/Dockerfile | 2 +- .../default/cuda12/scripts/install_deps.sh | 4 +- .../migraphx-ci-pipeline-env.Dockerfile | 4 +- .../linux/docker/scripts/install_os_deps.sh | 10 +- .../pai/rocm-ci-pipeline-env.Dockerfile | 2 +- 27 files changed, 133 insertions(+), 83 deletions(-) diff --git a/cgmanifests/generated/cgmanifest.json b/cgmanifests/generated/cgmanifest.json index 29eb7045fc299..66b305a6d36de 100644 --- a/cgmanifests/generated/cgmanifest.json +++ b/cgmanifests/generated/cgmanifest.json @@ -116,7 +116,7 @@ "component": { "type": "git", "git": { - "commitHash": "344117638c8ff7e239044fd0fa7085839fc03021", + "commitHash": "a6ad7fbbdc2e14fab82bb8a6d27760d700198cbf", "repositoryUrl": "https://github.com/google/benchmark.git" }, "comments": "google_benchmark" @@ -136,7 +136,7 @@ "component": { "type": "git", "git": { - "commitHash": "530d5c8c84abd2a46f38583ee817743c9b3a42b4", + "commitHash": "e39786088138f2749d64e9e90e0f9902daa77c40", "repositoryUrl": "https://github.com/google/googletest.git" }, "comments": "googletest" @@ -256,7 +256,7 @@ "component": { "type": "git", "git": { - "commitHash": "3e9dfa2866941655c56877882565e7577de6fc7b", + "commitHash": "941f45bcb51457884fa1afd6e24a67377d70f75c", "repositoryUrl": "https://github.com/pybind/pybind11.git" }, "comments": "pybind11" @@ -266,7 +266,7 @@ "component": { "type": "git", "git": { - "commitHash": "959002f82d7962a473d8bf301845f2af720e0aa4", + "commitHash": "ca678952a9a8eaa6de112d154e8e104b22f9ab3f", "repositoryUrl": "https://github.com/pytorch/cpuinfo.git" }, "comments": "pytorch_cpuinfo" @@ -276,7 +276,7 @@ "component": { "type": "git", "git": { - "commitHash": "2b354c6ad0d0479dcff68dab23fb0d1143a482c2", + "commitHash": "6dcd83d60f7944926bfd308cc13979fc53dd69ca", "repositoryUrl": "https://github.com/google/re2.git" }, "comments": "re2" diff --git a/cmake/deps.txt b/cmake/deps.txt index 72469603a0889..9d206b6bb3aeb 100644 --- a/cmake/deps.txt +++ b/cmake/deps.txt @@ -26,9 +26,9 @@ eigen;https://gitlab.com/libeigen/eigen/-/archive/e7248b26a1ed53fa030c5c459f7ea0 flatbuffers;https://github.com/google/flatbuffers/archive/refs/tags/v23.5.26.zip;59422c3b5e573dd192fead2834d25951f1c1670c 
fp16;https://github.com/Maratyszcza/FP16/archive/0a92994d729ff76a58f692d3028ca1b64b145d91.zip;b985f6985a05a1c03ff1bb71190f66d8f98a1494 fxdiv;https://github.com/Maratyszcza/FXdiv/archive/63058eff77e11aa15bf531df5dd34395ec3017c8.zip;a5658f4036402dbca7cebee32be57fb8149811e1 -google_benchmark;https://github.com/google/benchmark/archive/refs/tags/v1.8.3.zip;bf9870756ee3f8d2d3b346b24ee3600a41c74d3d +google_benchmark;https://github.com/google/benchmark/archive/refs/tags/v1.8.5.zip;cd47d3d272faf353600c8cc2fdec2b52d6f69177 google_nsync;https://github.com/google/nsync/archive/refs/tags/1.26.0.zip;5e7c00ef6bf5b787386fc040067903ec774e2752 -googletest;https://github.com/google/googletest/archive/530d5c8c84abd2a46f38583ee817743c9b3a42b4.zip;5e3a61db2aa975cfd0f97ba92c818744e7fa7034 +googletest;https://github.com/google/googletest/archive/refs/tags/v1.15.0.zip;9d2d0af8d77ac726ea55d44a8fa727ec98311349 googlexnnpack;https://github.com/google/XNNPACK/archive/0da379fc4808f9601faef392352018c741c0f297.zip;663883491e380b628e0a5b162b5f2658032fae73 json;https://github.com/nlohmann/json/archive/refs/tags/v3.10.5.zip;f257f8dc27c5b8c085dc887b40cddd18ae1f725c microsoft_gsl;https://github.com/microsoft/GSL/archive/refs/tags/v4.0.0.zip;cf368104cd22a87b4dd0c80228919bb2df3e2a14 @@ -48,9 +48,9 @@ protoc_linux_aarch64;https://github.com/protocolbuffers/protobuf/releases/downlo protoc_mac_universal;https://github.com/protocolbuffers/protobuf/releases/download/v21.12/protoc-21.12-osx-universal_binary.zip;23710c3d1c2036d8d65a6a22234372fa2d7af9ef psimd;https://github.com/Maratyszcza/psimd/archive/072586a71b55b7f8c584153d223e95687148a900.zip;1f5454b01f06f9656b77e4a5e2e31d7422487013 pthreadpool;https://github.com/Maratyszcza/pthreadpool/archive/4fe0e1e183925bf8cfa6aae24237e724a96479b8.zip;07a0aa91dd9bf86f31b95497e00f31d8a261a4bd -pybind11;https://github.com/pybind/pybind11/archive/refs/tags/v2.12.0.zip;8482f57ed55c7b100672815a311d5450858723fb -pytorch_cpuinfo;https://github.com/pytorch/cpuinfo/archive/959002f82d7962a473d8bf301845f2af720e0aa4.zip;85da3caa60eb2b148613b443fbc2bfdc30689965 -re2;https://github.com/google/re2/archive/refs/tags/2024-05-01.tar.gz;206cfee5ee0b4c6844680ba66275e9e8faa77405 +pybind11;https://github.com/pybind/pybind11/archive/refs/tags/v2.13.1.zip;9255d5c8568debcc329dd42ed8f410ee139ac7b1 +pytorch_cpuinfo;https://github.com/pytorch/cpuinfo/archive/ca678952a9a8eaa6de112d154e8e104b22f9ab3f.zip;138bf57d2a110935330d1048dce6d7b82d17d377 +re2;https://github.com/google/re2/archive/refs/tags/2024-07-02.zip;646e1728269cde7fcef990bf4a8e87b047882e88 safeint;https://github.com/dcleblanc/SafeInt/archive/refs/tags/3.0.28.zip;23f252040ff6cb9f1fd18575b32fa8fb5928daac tensorboard;https://github.com/tensorflow/tensorboard/archive/373eb09e4c5d2b3cc2493f0949dc4be6b6a45e81.zip;67b833913605a4f3f499894ab11528a702c2b381 cutlass;https://github.com/NVIDIA/cutlass/archive/refs/tags/v3.5.0.zip;ae038931b9fc2c416c17d9cda91d9706b343f56d diff --git a/cmake/external/abseil-cpp.cmake b/cmake/external/abseil-cpp.cmake index 6c5c4b21f5c58..3223724693a49 100644 --- a/cmake/external/abseil-cpp.cmake +++ b/cmake/external/abseil-cpp.cmake @@ -50,46 +50,96 @@ endif() # TODO: since multiple ORT's dependencies depend on Abseil, the list below would vary from version to version. # We'd better to not manually manage the list. 
set(ABSEIL_LIBS -absl::city +absl::absl_log +absl::log_internal_log_impl +absl::log_internal_strip +absl::log_internal_message +absl::log_internal_format +absl::synchronization +absl::str_format absl::flags -absl::flat_hash_map -absl::flat_hash_set +absl::log_internal_globals +absl::kernel_timeout_internal +absl::str_format_internal absl::hash -absl::inlined_vector -absl::low_level_hash -absl::node_hash_map -absl::node_hash_set +absl::log_internal_append_truncated +absl::absl_vlog_is_on +absl::flags_commandlineflag +absl::time +absl::symbolize +absl::graphcycles_internal +absl::log_internal_conditions +absl::strings +absl::malloc_internal +absl::demangle_internal absl::optional -absl::raw_hash_set +absl::stacktrace +absl::base +absl::demangle_rust +absl::bad_optional_access +absl::strings_internal +absl::debugging_internal +absl::int128 +absl::spinlock_wait +absl::decode_rust_punycode absl::raw_logging_internal -absl::str_format -absl::str_format_internal +absl::flat_hash_set +absl::flat_hash_map +absl::node_hash_map +absl::node_hash_set +absl::compare +absl::base_internal +absl::nullability +absl::bounded_utf8_length_sequence +absl::log_severity +absl::type_traits +absl::atomic_hook absl::bits -absl::fixed_array +absl::flags_commandlineflag_internal +absl::hash_container_defaults absl::numeric_representation -absl::utility -absl::type_traits -absl::string_view +absl::node_slot_policy absl::core_headers -absl::nullability +absl::dynamic_annotations +absl::utf8_for_code_point +absl::errno_saver +absl::absl_check +absl::hash_function_defaults +absl::function_ref +absl::city +absl::low_level_hash +absl::fixed_array +absl::variant +absl::meta +absl::log_internal_voidify +absl::log_sink +absl::log_internal_log_sink_set +absl::log_sink_registry +absl::log_entry +absl::log_globals +absl::log_internal_nullguard +absl::examine_stack +absl::inlined_vector +absl::log_internal_proto +absl::strerror +absl::log_internal_config +absl::raw_hash_map +absl::raw_hash_set +absl::container_memory +absl::algorithm_container absl::span -absl::config -absl::synchronization -absl::base +absl::log_internal_nullstream +absl::vlog_config_internal +absl::flags_reflection +absl::flags_internal +absl::flags_config +absl::fast_type_id +absl::utility +absl::time_zone absl::civil_time -absl::debugging_internal -absl::demangle_internal -absl::graphcycles_internal -absl::int128 -absl::kernel_timeout_internal -absl::log_severity -absl::malloc_internal -absl::spinlock_wait -absl::stacktrace absl::string_view -absl::strings -absl::strings_internal -absl::symbolize absl::throw_delegate -absl::time -absl::time_zone) \ No newline at end of file +absl::memory +absl::charset +absl::endian +absl::config) \ No newline at end of file diff --git a/cmake/patches/cpuinfo/9bb12d342fd9479679d505d93a478a6f9cd50a47.patch b/cmake/patches/cpuinfo/9bb12d342fd9479679d505d93a478a6f9cd50a47.patch index afb19a45ce0f4..dc8580207945c 100644 --- a/cmake/patches/cpuinfo/9bb12d342fd9479679d505d93a478a6f9cd50a47.patch +++ b/cmake/patches/cpuinfo/9bb12d342fd9479679d505d93a478a6f9cd50a47.patch @@ -1,22 +1,22 @@ diff --git a/include/cpuinfo.h b/include/cpuinfo.h -index c46b65e..8b83a64 100644 +index 03f2776..eaf6497 100644 --- a/include/cpuinfo.h +++ b/include/cpuinfo.h @@ -18,7 +18,7 @@ - #define CPUINFO_ARCH_X86 1 + #define CPUINFO_ARCH_X86 1 #endif -#if defined(__x86_64__) || defined(__x86_64) || defined(_M_X64) || defined(_M_AMD64) -+#if defined(__x86_64__) || (defined(_M_X64) && !defined(_M_ARM64EC)) || (defined(_M_AMD64) && !defined(_M_ARM64EC)) - 
#define CPUINFO_ARCH_X86_64 1 ++#if defined(__x86_64__) || defined(__x86_64) || (defined(_M_X64) && !defined(_M_ARM64EC)) || (defined(_M_AMD64) && !defined(_M_ARM64EC)) + #define CPUINFO_ARCH_X86_64 1 #endif @@ -26,7 +26,7 @@ - #define CPUINFO_ARCH_ARM 1 + #define CPUINFO_ARCH_ARM 1 #endif -#if defined(__aarch64__) || defined(_M_ARM64) +#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC) - #define CPUINFO_ARCH_ARM64 1 + #define CPUINFO_ARCH_ARM64 1 #endif diff --git a/onnxruntime/test/perftest/ort_test_session.cc b/onnxruntime/test/perftest/ort_test_session.cc index 0e4f0d0cad3f4..72b5da7aaec9b 100644 --- a/onnxruntime/test/perftest/ort_test_session.cc +++ b/onnxruntime/test/perftest/ort_test_session.cc @@ -823,10 +823,10 @@ select from 'TF8', 'TF16', 'UINT8', 'FLOAT', 'ITENSOR'. \n)"); std::ifstream file(performance_test_config.model_info.model_file_path.c_str(), std::ios::binary | std::ios::in | std::ios::ate); if (file.is_open()) { - const std::streamsize fsize = file.tellg(); + const std::streampos fsize = file.tellg(); file.seekg(0, std::ios_base::beg); std::vector model_bytes(narrow(fsize)); - file.read(model_bytes.data(), fsize); + file.read(model_bytes.data(), narrow(fsize)); session_ = Ort::Session(env, model_bytes.data(), model_bytes.size(), session_options); } else { ORT_THROW("Model file could not be opened.\n"); diff --git a/tools/android_custom_build/Dockerfile b/tools/android_custom_build/Dockerfile index 754a6633b0c62..d69fea7c8c1c5 100644 --- a/tools/android_custom_build/Dockerfile +++ b/tools/android_custom_build/Dockerfile @@ -24,9 +24,9 @@ RUN apt-get update && apt-get install --yes --no-install-recommends \ unzip lsb-release # cmake -RUN CMAKE_VERSION=3.27.3 && \ +RUN CMAKE_VERSION=3.30.1 && \ aria2c -q -d /tmp -o cmake-${CMAKE_VERSION}-linux-x86_64.tar.gz \ - --checksum=sha-256=62e7819fe0867658b6ea765a711686d637dce76cdf6eb0a6b0f1b879e0344fa7 \ + --checksum=sha-256=ac31f077ef3378641fa25a3cb980d21b2f083982d3149a8f2eb9154f2b53696b \ https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSION}/cmake-${CMAKE_VERSION}-linux-x86_64.tar.gz && \ tar -zxf /tmp/cmake-${CMAKE_VERSION}-linux-x86_64.tar.gz --strip=1 -C /usr diff --git a/tools/ci_build/github/azure-pipelines/linux-gpu-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/linux-gpu-ci-pipeline.yml index 8890a9c4bf56b..30f56f4b18aec 100644 --- a/tools/ci_build/github/azure-pipelines/linux-gpu-ci-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/linux-gpu-ci-pipeline.yml @@ -50,7 +50,7 @@ variables: ${{ if eq(parameters.CudaVersion, '11.8') }}: value: onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda11_x64_almalinux8_gcc11:20240531.1 ${{ if eq(parameters.CudaVersion, '12.2') }}: - value: onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda12_x64_ubi8_gcc12:20240610.1 + value: onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda12_x64_ubi8_gcc12:20240719.1 - name: Repository ${{ if eq(parameters.CudaVersion, '11.8') }}: diff --git a/tools/ci_build/github/azure-pipelines/linux-gpu-tensorrt-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/linux-gpu-tensorrt-ci-pipeline.yml index 3f9707ff50519..78e3b166995ec 100644 --- a/tools/ci_build/github/azure-pipelines/linux-gpu-tensorrt-ci-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/linux-gpu-tensorrt-ci-pipeline.yml @@ -40,7 +40,7 @@ variables: ${{ if eq(parameters.CudaVersion, '11.8') }}: value: 
onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda11_x64_almalinux8_gcc11:20240531.1 ${{ if eq(parameters.CudaVersion, '12.2') }}: - value: onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda12_x64_ubi8_gcc12:20240610.1 + value: onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda12_x64_ubi8_gcc12:20240719.1 - name: linux_trt_version ${{ if eq(parameters.CudaVersion, '11.8') }}: value: 10.2.0.19-1.cuda11.8 diff --git a/tools/ci_build/github/azure-pipelines/stages/java-cuda-packaging-stage.yml b/tools/ci_build/github/azure-pipelines/stages/java-cuda-packaging-stage.yml index 22264fc670cf7..430dc89b5b097 100644 --- a/tools/ci_build/github/azure-pipelines/stages/java-cuda-packaging-stage.yml +++ b/tools/ci_build/github/azure-pipelines/stages/java-cuda-packaging-stage.yml @@ -141,7 +141,7 @@ stages: ${{ if eq(parameters.CudaVersion, '11.8') }}: value: onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda11_x64_almalinux8_gcc11:20240531.1 ${{ if eq(parameters.CudaVersion, '12.2') }}: - value: onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda12_x64_ubi8_gcc12:20240610.1 + value: onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda12_x64_ubi8_gcc12:20240719.1 timeoutInMinutes: 60 steps: diff --git a/tools/ci_build/github/azure-pipelines/stages/jobs/py-linux-cuda-package-test-job.yml b/tools/ci_build/github/azure-pipelines/stages/jobs/py-linux-cuda-package-test-job.yml index 7dfafeb67acf8..ad5e09e5a2f00 100644 --- a/tools/ci_build/github/azure-pipelines/stages/jobs/py-linux-cuda-package-test-job.yml +++ b/tools/ci_build/github/azure-pipelines/stages/jobs/py-linux-cuda-package-test-job.yml @@ -46,7 +46,7 @@ jobs: ${{ if eq(parameters.CudaVersion, '11.8') }}: value: nvidia/cuda:11.8.0-cudnn8-devel-ubi8 ${{ if eq(parameters.CudaVersion, '12.2') }}: - value: onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda12_x64_ubi8_gcc12:20240610.1 + value: onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda12_x64_ubi8_gcc12:20240719.1 - name: linux_trt_version ${{ if eq(parameters.CudaVersion, '11.8') }}: value: 10.2.0.19-1.cuda11.8 diff --git a/tools/ci_build/github/azure-pipelines/stages/py-cuda-packaging-stage.yml b/tools/ci_build/github/azure-pipelines/stages/py-cuda-packaging-stage.yml index dcd681bd4b915..098da375423d6 100644 --- a/tools/ci_build/github/azure-pipelines/stages/py-cuda-packaging-stage.yml +++ b/tools/ci_build/github/azure-pipelines/stages/py-cuda-packaging-stage.yml @@ -81,5 +81,5 @@ stages: docker_base_image: onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda11_x64_almalinux8_gcc11:20240531.1 trt_version: 10.2.0.19-1.cuda11.8 ${{ if eq(parameters.cuda_version, '12.2') }}: - docker_base_image: onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda12_x64_ubi8_gcc12:20240610.1 + docker_base_image: onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda12_x64_ubi8_gcc12:20240719.1 trt_version: 10.2.0.19-1.cuda12.5 diff --git a/tools/ci_build/github/azure-pipelines/templates/download-deps.yml b/tools/ci_build/github/azure-pipelines/templates/download-deps.yml index cf350704f8356..bf11730c2ce28 100644 --- a/tools/ci_build/github/azure-pipelines/templates/download-deps.yml +++ b/tools/ci_build/github/azure-pipelines/templates/download-deps.yml @@ -11,7 +11,7 @@ steps: packageType: upack feed: '/7424c8e4-5c62-490e-95c4-79446f31017c' definition: 
'517c4f6f-5437-4392-a70d-4f15ec5be2f0'
-    version: 1.0.165
+    version: 1.0.167
     downloadPath: $(Build.BinariesDirectory)/deps
 # The private ADO project
@@ -22,7 +22,7 @@ steps:
       packageType: upack
       feed: '/4c7631f5-24c0-4307-8822-1aa8f180c325'
       definition: 'fd9dd5ad-b73e-4678-890e-edcf680dbc1a'
-      version: 1.0.165
+      version: 1.0.167
       downloadPath: $(Build.BinariesDirectory)/deps
 # You can add more ADO accounts at here.
diff --git a/tools/ci_build/github/linux/docker/Dockerfile.manylinux2_28_training_cuda12_2 b/tools/ci_build/github/linux/docker/Dockerfile.manylinux2_28_training_cuda12_2
index 6886600417c8e..082446515a367 100644
--- a/tools/ci_build/github/linux/docker/Dockerfile.manylinux2_28_training_cuda12_2
+++ b/tools/ci_build/github/linux/docker/Dockerfile.manylinux2_28_training_cuda12_2
@@ -1,4 +1,4 @@
-FROM onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda12_x64_ubi8_gcc12:20240610.1
+FROM onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda12_x64_ubi8_gcc12:20240719.1
 ARG PYTHON_VERSION=3.9
 ARG TORCH_VERSION=2.1.0
diff --git a/tools/ci_build/github/linux/docker/Dockerfile.ubuntu_cuda11_8_tensorrt8_6 b/tools/ci_build/github/linux/docker/Dockerfile.ubuntu_cuda11_8_tensorrt8_6
index f1ffba3b3e1c9..dfc057b129f91 100644
--- a/tools/ci_build/github/linux/docker/Dockerfile.ubuntu_cuda11_8_tensorrt8_6
+++ b/tools/ci_build/github/linux/docker/Dockerfile.ubuntu_cuda11_8_tensorrt8_6
@@ -10,7 +10,7 @@ FROM nvidia/cuda:11.8.0-cudnn8-devel-ubuntu20.04 AS base
 # The local directory into which to build and install CMAKE
 ARG ONNXRUNTIME_LOCAL_CODE_DIR=/code
-ENV PATH /usr/local/nvidia/bin:/usr/local/cuda/bin:/usr/src/tensorrt/bin:${ONNXRUNTIME_LOCAL_CODE_DIR}/cmake-3.27.3-linux-x86_64/bin:/opt/miniconda/bin:${PATH}
+ENV PATH /usr/local/nvidia/bin:/usr/local/cuda/bin:/usr/src/tensorrt/bin:${ONNXRUNTIME_LOCAL_CODE_DIR}/cmake-3.30.1-linux-x86_64/bin:/opt/miniconda/bin:${PATH}
 ENV DEBIAN_FRONTEND=noninteractive
 RUN apt-get update &&\
diff --git a/tools/ci_build/github/linux/docker/Dockerfile.ubuntu_cuda11_tensorrt10 b/tools/ci_build/github/linux/docker/Dockerfile.ubuntu_cuda11_tensorrt10
index 0bd56a1a5873f..e24d225fa23f9 100644
--- a/tools/ci_build/github/linux/docker/Dockerfile.ubuntu_cuda11_tensorrt10
+++ b/tools/ci_build/github/linux/docker/Dockerfile.ubuntu_cuda11_tensorrt10
@@ -10,7 +10,7 @@ FROM nvidia/cuda:11.8.0-cudnn8-devel-ubuntu20.04 AS base
 # The local directory into which to build and install CMAKE
 ARG ONNXRUNTIME_LOCAL_CODE_DIR=/code
-ENV PATH /usr/local/nvidia/bin:/usr/local/cuda/bin:/usr/src/tensorrt/bin:${ONNXRUNTIME_LOCAL_CODE_DIR}/cmake-3.27.3-linux-x86_64/bin:/opt/miniconda/bin:${PATH}
+ENV PATH /usr/local/nvidia/bin:/usr/local/cuda/bin:/usr/src/tensorrt/bin:${ONNXRUNTIME_LOCAL_CODE_DIR}/cmake-3.30.1-linux-x86_64/bin:/opt/miniconda/bin:${PATH}
 ENV DEBIAN_FRONTEND=noninteractive
 RUN apt-get update &&\
diff --git a/tools/ci_build/github/linux/docker/Dockerfile.ubuntu_cuda12_3_tensorrt8_6 b/tools/ci_build/github/linux/docker/Dockerfile.ubuntu_cuda12_3_tensorrt8_6
index 9493480784e81..f63112039fe8e 100644
--- a/tools/ci_build/github/linux/docker/Dockerfile.ubuntu_cuda12_3_tensorrt8_6
+++ b/tools/ci_build/github/linux/docker/Dockerfile.ubuntu_cuda12_3_tensorrt8_6
@@ -10,7 +10,7 @@ FROM nvidia/cuda:12.3.1-devel-ubuntu20.04 AS base
 # The local directory into which to build and install CMAKE
 ARG ONNXRUNTIME_LOCAL_CODE_DIR=/code
-ENV PATH /usr/local/nvidia/bin:/usr/local/cuda/bin:/usr/src/tensorrt/bin:${ONNXRUNTIME_LOCAL_CODE_DIR}/cmake-3.27.3-linux-x86_64/bin:/opt/miniconda/bin:${PATH}
+ENV PATH /usr/local/nvidia/bin:/usr/local/cuda/bin:/usr/src/tensorrt/bin:${ONNXRUNTIME_LOCAL_CODE_DIR}/cmake-3.30.1-linux-x86_64/bin:/opt/miniconda/bin:${PATH}
 ENV DEBIAN_FRONTEND=noninteractive
 RUN apt-get update &&\
diff --git a/tools/ci_build/github/linux/docker/Dockerfile.ubuntu_cuda12_tensorrt10 b/tools/ci_build/github/linux/docker/Dockerfile.ubuntu_cuda12_tensorrt10
index 7f66943dd8745..da53b64199299 100644
--- a/tools/ci_build/github/linux/docker/Dockerfile.ubuntu_cuda12_tensorrt10
+++ b/tools/ci_build/github/linux/docker/Dockerfile.ubuntu_cuda12_tensorrt10
@@ -10,7 +10,7 @@ FROM nvidia/cuda:12.5.1-cudnn-devel-ubuntu20.04 AS base
 # The local directory into which to build and install CMAKE
 ARG ONNXRUNTIME_LOCAL_CODE_DIR=/code
-ENV PATH /usr/local/nvidia/bin:/usr/local/cuda/bin:/usr/src/tensorrt/bin:${ONNXRUNTIME_LOCAL_CODE_DIR}/cmake-3.27.3-linux-x86_64/bin:/opt/miniconda/bin:${PATH}
+ENV PATH /usr/local/nvidia/bin:/usr/local/cuda/bin:/usr/src/tensorrt/bin:${ONNXRUNTIME_LOCAL_CODE_DIR}/cmake-3.30.1-linux-x86_64/bin:/opt/miniconda/bin:${PATH}
 ENV DEBIAN_FRONTEND=noninteractive
 RUN apt-get update &&\
diff --git a/tools/ci_build/github/linux/docker/Dockerfile.ubuntu_openvino b/tools/ci_build/github/linux/docker/Dockerfile.ubuntu_openvino
index dbd2076041b94..4382e12a1cd6c 100644
--- a/tools/ci_build/github/linux/docker/Dockerfile.ubuntu_openvino
+++ b/tools/ci_build/github/linux/docker/Dockerfile.ubuntu_openvino
@@ -34,8 +34,8 @@ RUN wget "https://github.com/intel/compute-runtime/releases/download/21.48.21782
     sudo dpkg -i *.deb && rm -rf *.deb
 RUN mkdir -p /opt/cmake/bin && \
-    wget https://github.com/Kitware/CMake/releases/download/v3.27.3/cmake-3.27.3-linux-x86_64.tar.gz && \
-    tar -xf cmake-3.27.3-linux-x86_64.tar.gz --strip 1 -C /opt/cmake && rm -rf /cmake-3.27.3-linux-x86_64.tar.gz && \
+    wget https://github.com/Kitware/CMake/releases/download/v3.30.1/cmake-3.30.1-linux-x86_64.tar.gz && \
+    tar -xf cmake-3.30.1-linux-x86_64.tar.gz --strip 1 -C /opt/cmake && rm -rf /cmake-3.30.1-linux-x86_64.tar.gz && \
     ln -sf /opt/cmake/bin/* /usr/bin
 ARG BUILD_UID=1000
diff --git a/tools/ci_build/github/linux/docker/Dockerfile.ubuntu_tensorrt_bin b/tools/ci_build/github/linux/docker/Dockerfile.ubuntu_tensorrt_bin
index 0281c1c8fef25..e8d8dc0a64feb 100644
--- a/tools/ci_build/github/linux/docker/Dockerfile.ubuntu_tensorrt_bin
+++ b/tools/ci_build/github/linux/docker/Dockerfile.ubuntu_tensorrt_bin
@@ -10,7 +10,7 @@ FROM nvidia/cuda:12.5.1-cudnn-devel-ubuntu20.04 AS base
 # The local directory into which to build and install CMAKE
 ARG ONNXRUNTIME_LOCAL_CODE_DIR=/code
-ENV PATH /usr/local/nvidia/bin:/usr/local/cuda/bin:${ONNXRUNTIME_LOCAL_CODE_DIR}/cmake-3.27.3-linux-x86_64/bin:/opt/miniconda/bin:${PATH}
+ENV PATH /usr/local/nvidia/bin:/usr/local/cuda/bin:${ONNXRUNTIME_LOCAL_CODE_DIR}/cmake-3.30.1-linux-x86_64/bin:/opt/miniconda/bin:${PATH}
 ENV DEBIAN_FRONTEND=noninteractive
 RUN apt-get update &&\
diff --git a/tools/ci_build/github/linux/docker/inference/aarch64/default/cpu/scripts/install_deps.sh b/tools/ci_build/github/linux/docker/inference/aarch64/default/cpu/scripts/install_deps.sh
index 7598ab0a7a536..9c3017240f77f 100755
--- a/tools/ci_build/github/linux/docker/inference/aarch64/default/cpu/scripts/install_deps.sh
+++ b/tools/ci_build/github/linux/docker/inference/aarch64/default/cpu/scripts/install_deps.sh
@@ -40,7 +40,7 @@ cd /tmp/src
 CPU_ARCH=$(uname -m)
 echo "Installing cmake"
-GetFile "https://github.com/Kitware/CMake/releases/download/v3.27.3/cmake-3.27.3-linux-$CPU_ARCH.tar.gz" "/tmp/src/cmake.tar.gz"
+GetFile "https://github.com/Kitware/CMake/releases/download/v3.30.1/cmake-3.30.1-linux-$CPU_ARCH.tar.gz" "/tmp/src/cmake.tar.gz"
 tar -zxf /tmp/src/cmake.tar.gz --strip=1 -C /usr
 echo "Installing Ninja"
diff --git a/tools/ci_build/github/linux/docker/inference/x86_64/default/cpu/scripts/install_deps.sh b/tools/ci_build/github/linux/docker/inference/x86_64/default/cpu/scripts/install_deps.sh
index 3b05c6787ca3e..fbbf4cf71157c 100755
--- a/tools/ci_build/github/linux/docker/inference/x86_64/default/cpu/scripts/install_deps.sh
+++ b/tools/ci_build/github/linux/docker/inference/x86_64/default/cpu/scripts/install_deps.sh
@@ -39,8 +39,8 @@ mkdir -p /tmp/src
 cd /tmp/src
 echo "Installing cmake"
-GetFile https://github.com/Kitware/CMake/releases/download/v3.27.3/cmake-3.27.3-linux-`uname -m`.tar.gz /tmp/src/cmake-3.27.3-linux-`uname -m`.tar.gz
-tar -zxf /tmp/src/cmake-3.27.3-linux-`uname -m`.tar.gz --strip=1 -C /usr
+GetFile https://github.com/Kitware/CMake/releases/download/v3.30.1/cmake-3.30.1-linux-`uname -m`.tar.gz /tmp/src/cmake-3.30.1-linux-`uname -m`.tar.gz
+tar -zxf /tmp/src/cmake-3.30.1-linux-`uname -m`.tar.gz --strip=1 -C /usr
 echo "Installing Ninja"
 GetFile https://github.com/ninja-build/ninja/archive/v1.10.0.tar.gz /tmp/src/ninja-linux.tar.gz
diff --git a/tools/ci_build/github/linux/docker/inference/x86_64/default/cuda11/scripts/install_deps.sh b/tools/ci_build/github/linux/docker/inference/x86_64/default/cuda11/scripts/install_deps.sh
index 3c88c516bee4e..fbbf4cf71157c 100755
--- a/tools/ci_build/github/linux/docker/inference/x86_64/default/cuda11/scripts/install_deps.sh
+++ b/tools/ci_build/github/linux/docker/inference/x86_64/default/cuda11/scripts/install_deps.sh
@@ -39,8 +39,8 @@ mkdir -p /tmp/src
 cd /tmp/src
 echo "Installing cmake"
-GetFile https://github.com/Kitware/CMake/releases/download/v3.29.3/cmake-3.29.3-linux-`uname -m`.tar.gz /tmp/src/cmake-3.29.3-linux-`uname -m`.tar.gz
-tar -zxf /tmp/src/cmake-3.29.3-linux-`uname -m`.tar.gz --strip=1 -C /usr
+GetFile https://github.com/Kitware/CMake/releases/download/v3.30.1/cmake-3.30.1-linux-`uname -m`.tar.gz /tmp/src/cmake-3.30.1-linux-`uname -m`.tar.gz
+tar -zxf /tmp/src/cmake-3.30.1-linux-`uname -m`.tar.gz --strip=1 -C /usr
 echo "Installing Ninja"
 GetFile https://github.com/ninja-build/ninja/archive/v1.10.0.tar.gz /tmp/src/ninja-linux.tar.gz
diff --git a/tools/ci_build/github/linux/docker/inference/x86_64/default/cuda12/Dockerfile b/tools/ci_build/github/linux/docker/inference/x86_64/default/cuda12/Dockerfile
index ef69235ce5c14..245164f93fe43 100644
--- a/tools/ci_build/github/linux/docker/inference/x86_64/default/cuda12/Dockerfile
+++ b/tools/ci_build/github/linux/docker/inference/x86_64/default/cuda12/Dockerfile
@@ -2,7 +2,7 @@
 # Licensed under the MIT License.
 # This file is used by Zip-Nuget Packaging NoContribOps Pipeline,Zip-Nuget-Java Packaging Pipeline
-FROM onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda12_x64_ubi8_gcc12_dotnet:20240610.1
+FROM onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda12_x64_ubi8_gcc12_dotnet:20240719.1
 ARG TRT_VERSION
 #Install TensorRT only if TRT_VERSION is not empty
diff --git a/tools/ci_build/github/linux/docker/inference/x86_64/default/cuda12/scripts/install_deps.sh b/tools/ci_build/github/linux/docker/inference/x86_64/default/cuda12/scripts/install_deps.sh
index 3c88c516bee4e..fbbf4cf71157c 100755
--- a/tools/ci_build/github/linux/docker/inference/x86_64/default/cuda12/scripts/install_deps.sh
+++ b/tools/ci_build/github/linux/docker/inference/x86_64/default/cuda12/scripts/install_deps.sh
@@ -39,8 +39,8 @@ mkdir -p /tmp/src
 cd /tmp/src
 echo "Installing cmake"
-GetFile https://github.com/Kitware/CMake/releases/download/v3.29.3/cmake-3.29.3-linux-`uname -m`.tar.gz /tmp/src/cmake-3.29.3-linux-`uname -m`.tar.gz
-tar -zxf /tmp/src/cmake-3.29.3-linux-`uname -m`.tar.gz --strip=1 -C /usr
+GetFile https://github.com/Kitware/CMake/releases/download/v3.30.1/cmake-3.30.1-linux-`uname -m`.tar.gz /tmp/src/cmake-3.30.1-linux-`uname -m`.tar.gz
+tar -zxf /tmp/src/cmake-3.30.1-linux-`uname -m`.tar.gz --strip=1 -C /usr
 echo "Installing Ninja"
 GetFile https://github.com/ninja-build/ninja/archive/v1.10.0.tar.gz /tmp/src/ninja-linux.tar.gz
diff --git a/tools/ci_build/github/linux/docker/migraphx-ci-pipeline-env.Dockerfile b/tools/ci_build/github/linux/docker/migraphx-ci-pipeline-env.Dockerfile
index 6c71631368822..98ea5e119c319 100644
--- a/tools/ci_build/github/linux/docker/migraphx-ci-pipeline-env.Dockerfile
+++ b/tools/ci_build/github/linux/docker/migraphx-ci-pipeline-env.Dockerfile
@@ -45,10 +45,10 @@ ENV LANG C.UTF-8
 WORKDIR /stage
 # Cmake
-ENV CMAKE_VERSION=3.27.3
+ENV CMAKE_VERSION=3.30.1
 RUN cd /usr/local && \
     wget -q https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSION}/cmake-${CMAKE_VERSION}-Linux-x86_64.tar.gz && \
-    tar -zxf /usr/local/cmake-3.27.3-Linux-x86_64.tar.gz --strip=1 -C /usr
+    tar -zxf /usr/local/cmake-3.30.1-Linux-x86_64.tar.gz --strip=1 -C /usr
 # ccache
 RUN mkdir -p /tmp/ccache && \
diff --git a/tools/ci_build/github/linux/docker/scripts/install_os_deps.sh b/tools/ci_build/github/linux/docker/scripts/install_os_deps.sh
index 3e872d17504a1..7f3160371aa24 100755
--- a/tools/ci_build/github/linux/docker/scripts/install_os_deps.sh
+++ b/tools/ci_build/github/linux/docker/scripts/install_os_deps.sh
@@ -71,18 +71,18 @@ if [[ $SYS_LONG_BIT = "64" && "$GLIBC_VERSION" -gt "9" ]]; then
   tar --strip 1 -xf /tmp/azcopy/azcopy.tar.gz -C /tmp/azcopy
   cp /tmp/azcopy/azcopy /usr/bin
   echo "Installing cmake"
-  GetFile https://github.com/Kitware/CMake/releases/download/v3.27.3/cmake-3.27.3-Linux-x86_64.tar.gz /tmp/src/cmake-3.27.3-Linux-x86_64.tar.gz
-  tar -zxf /tmp/src/cmake-3.27.3-Linux-x86_64.tar.gz --strip=1 -C /usr
+  GetFile https://github.com/Kitware/CMake/releases/download/v3.30.1/cmake-3.30.1-Linux-x86_64.tar.gz /tmp/src/cmake-3.30.1-Linux-x86_64.tar.gz
+  tar -zxf /tmp/src/cmake-3.30.1-Linux-x86_64.tar.gz --strip=1 -C /usr
   echo "Installing Node.js"
   # The EOL for nodejs v18.17.1 LTS is April 2025
   GetFile https://nodejs.org/dist/v18.17.1/node-v18.17.1-linux-x64.tar.xz /tmp/src/node-v18.17.1-linux-x64.tar.xz
   tar -xf /tmp/src/node-v18.17.1-linux-x64.tar.xz --strip=1 -C /usr
 else
   echo "Installing cmake"
-  GetFile https://github.com/Kitware/CMake/releases/download/v3.27.3/cmake-3.27.3.tar.gz /tmp/src/cmake-3.27.3.tar.gz
-  tar -xf /tmp/src/cmake-3.27.3.tar.gz -C /tmp/src
+  GetFile https://github.com/Kitware/CMake/releases/download/v3.30.1/cmake-3.30.1.tar.gz /tmp/src/cmake-3.30.1.tar.gz
+  tar -xf /tmp/src/cmake-3.30.1.tar.gz -C /tmp/src
   pushd .
-  cd /tmp/src/cmake-3.27.3
+  cd /tmp/src/cmake-3.30.1
   ./bootstrap --prefix=/usr --parallel=$(getconf _NPROCESSORS_ONLN) --system-bzip2 --system-curl --system-zlib --system-expat
   make -j$(getconf _NPROCESSORS_ONLN)
   make install
diff --git a/tools/ci_build/github/pai/rocm-ci-pipeline-env.Dockerfile b/tools/ci_build/github/pai/rocm-ci-pipeline-env.Dockerfile
index bf21a65314985..9272f6e627a13 100644
--- a/tools/ci_build/github/pai/rocm-ci-pipeline-env.Dockerfile
+++ b/tools/ci_build/github/pai/rocm-ci-pipeline-env.Dockerfile
@@ -41,7 +41,7 @@ ENV LANG C.UTF-8
 WORKDIR /stage
 # CMake
-ENV CMAKE_VERSION=3.27.3
+ENV CMAKE_VERSION=3.30.1
 RUN cd /usr/local && \
     wget -q -O - https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSION}/cmake-${CMAKE_VERSION}-Linux-x86_64.tar.gz | tar zxf -
 ENV PATH=/usr/local/cmake-${CMAKE_VERSION}-linux-x86_64/bin:${PATH}