diff --git a/cmake/CMakeLists.txt b/cmake/CMakeLists.txt index daacd221caa93..5555fa692eae8 100644 --- a/cmake/CMakeLists.txt +++ b/cmake/CMakeLists.txt @@ -551,7 +551,7 @@ if(NOT WIN32 AND NOT CMAKE_SYSTEM_NAME STREQUAL "Android") endif() find_package(Patch) -if (WIN32 AND NOT Patch_FOUND) +if (CMAKE_HOST_WIN32 AND NOT Patch_FOUND) # work around CI machines missing patch from the git install by falling back to the binary in this repo. # replicate what happens in https://github.com/Kitware/CMake/blob/master/Modules/FindPatch.cmake but without # the hardcoded suffixes in the path to the patch binary. @@ -1040,7 +1040,7 @@ function(onnxruntime_set_compile_flags target_name) # Enable warning target_compile_options(${target_name} PRIVATE "$<$:SHELL:--compiler-options -Wall>" "$<$>:-Wall>") target_compile_options(${target_name} PRIVATE "$<$>:-Wextra>") - if (CMAKE_CXX_COMPILER_ID STREQUAL "Clang" OR CMAKE_CXX_COMPILER_ID STREQUAL "AppleClang") + if (CMAKE_CXX_COMPILER_ID STREQUAL "Clang" OR CMAKE_CXX_COMPILER_ID STREQUAL "AppleClang" OR CMAKE_CXX_COMPILER_ID STREQUAL "IBMClang") #external/protobuf/src/google/protobuf/arena.h:445:18: error: unused parameter 'p' target_compile_options(${target_name} PRIVATE "-Wno-unused-parameter") endif() @@ -1140,6 +1140,13 @@ endfunction() function(onnxruntime_add_shared_library target_name) add_library(${target_name} SHARED ${ARGN}) onnxruntime_configure_target(${target_name}) + if(WIN32) + target_compile_definitions(${target_name} PRIVATE VER_MAJOR=${VERSION_MAJOR_PART}) + target_compile_definitions(${target_name} PRIVATE VER_MINOR=${VERSION_MINOR_PART}) + target_compile_definitions(${target_name} PRIVATE VER_BUILD=${VERSION_BUILD_PART}) + target_compile_definitions(${target_name} PRIVATE VER_PRIVATE=${VERSION_PRIVATE_PART}) + target_compile_definitions(${target_name} PRIVATE VER_STRING=\"${VERSION_STRING}\") + endif() endfunction() function(onnxruntime_add_static_library target_name) @@ -1154,6 +1161,13 @@ function(onnxruntime_add_shared_library_module target_name) else() #On Windows, this target shouldn't generate an import lib, but I don't know how to disable it. 
add_library(${target_name} MODULE ${ARGN}) + if(WIN32) + target_compile_definitions(${target_name} PRIVATE VER_MAJOR=${VERSION_MAJOR_PART}) + target_compile_definitions(${target_name} PRIVATE VER_MINOR=${VERSION_MINOR_PART}) + target_compile_definitions(${target_name} PRIVATE VER_BUILD=${VERSION_BUILD_PART}) + target_compile_definitions(${target_name} PRIVATE VER_PRIVATE=${VERSION_PRIVATE_PART}) + target_compile_definitions(${target_name} PRIVATE VER_STRING=\"${VERSION_STRING}\") + endif() endif() onnxruntime_configure_target(${target_name}) @@ -1488,9 +1502,6 @@ if (onnxruntime_USE_CUDA) endif() if (onnxruntime_USE_MIGRAPHX) - if (WIN32) - message(FATAL_ERROR "MIGraphX does not support build in Windows!") - endif() set(AMD_MIGRAPHX_HOME ${onnxruntime_MIGRAPHX_HOME}) endif() @@ -1560,7 +1571,7 @@ if (UNIX OR onnxruntime_USE_NCCL) if (onnxruntime_USE_NCCL) if (onnxruntime_USE_CUDA) set(NCCL_LIBNAME "nccl") - elseif (onnxruntime_USE_ROCM) + elseif (onnxruntime_USE_ROCM OR onnxruntime_USE_MIGRAPHX) set(NCCL_LIBNAME "rccl") endif() find_path(NCCL_INCLUDE_DIR @@ -1639,6 +1650,14 @@ set(VERSION_MINOR_PART 0 CACHE STRING "Second part of numeric file/product ver set(VERSION_BUILD_PART 0 CACHE STRING "Third part of numeric file/product version.") set(VERSION_PRIVATE_PART 0 CACHE STRING "Fourth part of numeric file/product version.") set(VERSION_STRING "Internal Build" CACHE STRING "String representation of file/product version.") +if(VERSION_MAJOR_PART STREQUAL "0" AND VERSION_MINOR_PART STREQUAL "0" AND VERSION_BUILD_PART STREQUAL "0" AND VERSION_PRIVATE_PART STREQUAL "0") + string(REPLACE "." ";" ORT_VERSION_STRING_LIST ${ORT_VERSION}) + list(GET ORT_VERSION_STRING_LIST 0 VERSION_MAJOR_PART) + list(GET ORT_VERSION_STRING_LIST 1 VERSION_MINOR_PART) + list(GET ORT_VERSION_STRING_LIST 2 VERSION_BUILD_PART) + set(VERSION_STRING ORT_VERSION) +endif() + if (WIN32) list(APPEND onnxruntime_EXTERNAL_LIBRARIES ${SYS_PATH_LIB}) diff --git a/cmake/external/onnxruntime_external_deps.cmake b/cmake/external/onnxruntime_external_deps.cmake index a29f89ea8289a..5eb9cf2fdce0f 100644 --- a/cmake/external/onnxruntime_external_deps.cmake +++ b/cmake/external/onnxruntime_external_deps.cmake @@ -47,6 +47,9 @@ if (onnxruntime_BUILD_UNIT_TESTS) if (CMAKE_SYSTEM_NAME STREQUAL "Emscripten") set(gtest_disable_pthreads ON) endif() + if (${CMAKE_SYSTEM_NAME} MATCHES "AIX") + set(gtest_disable_pthreads ON CACHE BOOL "gtest_disable_pthreads" FORCE) + endif() set(INSTALL_GTEST OFF CACHE BOOL "" FORCE) if (IOS OR ANDROID) # on mobile platforms the absl flags class dumps the flag names (assumably for binary size), which breaks passing diff --git a/cmake/onnxruntime.cmake b/cmake/onnxruntime.cmake index ec98047750a91..21ae0947f3788 100644 --- a/cmake/onnxruntime.cmake +++ b/cmake/onnxruntime.cmake @@ -57,6 +57,7 @@ foreach(f ${ONNXRUNTIME_PROVIDER_NAMES}) list(APPEND SYMBOL_FILES "${ONNXRUNTIME_ROOT}/core/providers/${f}/symbols.txt") endforeach() +if(NOT ${CMAKE_SYSTEM_NAME} MATCHES "AIX") add_custom_command(OUTPUT ${SYMBOL_FILE} ${CMAKE_CURRENT_BINARY_DIR}/generated_source.c COMMAND ${Python_EXECUTABLE} "${REPO_ROOT}/tools/ci_build/gen_def.py" --version_file "${ONNXRUNTIME_ROOT}/../VERSION_NUMBER" --src_root "${ONNXRUNTIME_ROOT}" @@ -66,6 +67,7 @@ add_custom_command(OUTPUT ${SYMBOL_FILE} ${CMAKE_CURRENT_BINARY_DIR}/generated_s WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}) add_custom_target(onnxruntime_generate_def ALL DEPENDS ${SYMBOL_FILE} ${CMAKE_CURRENT_BINARY_DIR}/generated_source.c) +endif() if(WIN32) 
onnxruntime_add_shared_library(onnxruntime ${SYMBOL_FILE} @@ -95,30 +97,33 @@ elseif(onnxruntime_BUILD_APPLE_FRAMEWORK) FRAMEWORK TRUE FRAMEWORK_VERSION A MACOSX_FRAMEWORK_INFO_PLIST ${INFO_PLIST_PATH} - SOVERSION ${ORT_VERSION} # Note: The PUBLIC_HEADER and VERSION properties for the 'onnxruntime' target will be set later in this file. ) else() - onnxruntime_add_shared_library(onnxruntime ${CMAKE_CURRENT_BINARY_DIR}/generated_source.c) + if(${CMAKE_SYSTEM_NAME} MATCHES "AIX") + onnxruntime_add_shared_library(onnxruntime ${ONNXRUNTIME_ROOT}/core/session/onnxruntime_c_api.cc) + else() + onnxruntime_add_shared_library(onnxruntime ${CMAKE_CURRENT_BINARY_DIR}/generated_source.c ) + endif() if (onnxruntime_USE_CUDA) set_property(TARGET onnxruntime APPEND_STRING PROPERTY LINK_FLAGS " -Xlinker -rpath=\\$ORIGIN") endif() endif() -add_dependencies(onnxruntime onnxruntime_generate_def ${onnxruntime_EXTERNAL_DEPENDENCIES}) +if(${CMAKE_SYSTEM_NAME} MATCHES "AIX") + add_dependencies(onnxruntime ${onnxruntime_EXTERNAL_DEPENDENCIES}) +else() + add_dependencies(onnxruntime onnxruntime_generate_def ${onnxruntime_EXTERNAL_DEPENDENCIES}) +endif() target_include_directories(onnxruntime PRIVATE ${ONNXRUNTIME_ROOT} PUBLIC "$") -target_compile_definitions(onnxruntime PRIVATE VER_MAJOR=${VERSION_MAJOR_PART}) -target_compile_definitions(onnxruntime PRIVATE VER_MINOR=${VERSION_MINOR_PART}) -target_compile_definitions(onnxruntime PRIVATE VER_BUILD=${VERSION_BUILD_PART}) -target_compile_definitions(onnxruntime PRIVATE VER_PRIVATE=${VERSION_PRIVATE_PART}) -target_compile_definitions(onnxruntime PRIVATE VER_STRING=\"${VERSION_STRING}\") + target_compile_definitions(onnxruntime PRIVATE FILE_NAME=\"onnxruntime.dll\") if(UNIX) if (APPLE) set(ONNXRUNTIME_SO_LINK_FLAG " -Xlinker -dead_strip") - else() + elseif(NOT ${CMAKE_SYSTEM_NAME} MATCHES "AIX") set(ONNXRUNTIME_SO_LINK_FLAG " -Xlinker --version-script=${SYMBOL_FILE} -Xlinker --no-undefined -Xlinker --gc-sections -z noexecstack") endif() else() @@ -130,7 +135,6 @@ if (NOT WIN32) set(ONNXRUNTIME_SO_LINK_FLAG " -Wl,-exported_symbols_list,${SYMBOL_FILE}") if (${CMAKE_SYSTEM_NAME} STREQUAL "iOS") set_target_properties(onnxruntime PROPERTIES - SOVERSION ${ORT_VERSION} MACOSX_RPATH TRUE INSTALL_RPATH_USE_LINK_PATH FALSE BUILD_WITH_INSTALL_NAME_DIR TRUE @@ -138,7 +142,7 @@ if (NOT WIN32) else() set_target_properties(onnxruntime PROPERTIES INSTALL_RPATH "@loader_path") endif() - elseif (NOT CMAKE_SYSTEM_NAME STREQUAL "Emscripten") + elseif (NOT CMAKE_SYSTEM_NAME STREQUAL "Emscripten" AND NOT ${CMAKE_SYSTEM_NAME} MATCHES "AIX") set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} -Wl,-rpath='$ORIGIN'") endif() endif() @@ -206,6 +210,10 @@ set(onnxruntime_INTERNAL_LIBRARIES onnxruntime_flatbuffers ) +if (${CMAKE_SYSTEM_NAME} MATCHES "AIX") + list(APPEND onnxruntime_INTERNAL_LIBRARIES iconv) +endif() + if (onnxruntime_USE_EXTENSIONS) list(APPEND onnxruntime_INTERNAL_LIBRARIES onnxruntime_extensions @@ -222,13 +230,30 @@ target_link_libraries(onnxruntime PRIVATE ) set_property(TARGET onnxruntime APPEND_STRING PROPERTY LINK_FLAGS ${ONNXRUNTIME_SO_LINK_FLAG} ${onnxruntime_DELAYLOAD_FLAGS}) -set_target_properties(onnxruntime PROPERTIES - PUBLIC_HEADER "${ONNXRUNTIME_PUBLIC_HEADERS}" - LINK_DEPENDS ${SYMBOL_FILE} - VERSION ${ORT_VERSION} - FOLDER "ONNXRuntime" -) - +#See: https://cmake.org/cmake/help/latest/prop_tgt/SOVERSION.html +if(NOT APPLE AND NOT WIN32) + if(${CMAKE_SYSTEM_NAME} MATCHES "AIX") + set_target_properties(onnxruntime PROPERTIES + PUBLIC_HEADER 
"${ONNXRUNTIME_PUBLIC_HEADERS}" + VERSION ${ORT_VERSION} + SOVERSION 1 + FOLDER "ONNXRuntime") + else() + set_target_properties(onnxruntime PROPERTIES + PUBLIC_HEADER "${ONNXRUNTIME_PUBLIC_HEADERS}" + LINK_DEPENDS ${SYMBOL_FILE} + VERSION ${ORT_VERSION} + SOVERSION 1 + FOLDER "ONNXRuntime") + endif() +else() + # Omit the SOVERSION setting in Windows/macOS/iOS/.. build + set_target_properties(onnxruntime PROPERTIES + PUBLIC_HEADER "${ONNXRUNTIME_PUBLIC_HEADERS}" + LINK_DEPENDS ${SYMBOL_FILE} + VERSION ${ORT_VERSION} + FOLDER "ONNXRuntime") +endif() install(TARGETS onnxruntime EXPORT ${PROJECT_NAME}Targets PUBLIC_HEADER DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/onnxruntime diff --git a/cmake/onnxruntime_framework.cmake b/cmake/onnxruntime_framework.cmake index c9bf2ac5c3dc6..43d16abd8fbae 100644 --- a/cmake/onnxruntime_framework.cmake +++ b/cmake/onnxruntime_framework.cmake @@ -108,7 +108,7 @@ add_dependencies(onnxruntime_framework ${onnxruntime_EXTERNAL_DEPENDENCIES}) # For the shared onnxruntime library, this is set in onnxruntime.cmake through CMAKE_SHARED_LINKER_FLAGS # But our test files don't use the shared library so this must be set for them. # For Win32 it generates an absolute path for shared providers based on the location of the executable/onnxruntime.dll -if (UNIX AND NOT APPLE AND NOT onnxruntime_MINIMAL_BUILD AND NOT CMAKE_SYSTEM_NAME STREQUAL "Emscripten") +if (UNIX AND NOT APPLE AND NOT onnxruntime_MINIMAL_BUILD AND NOT CMAKE_SYSTEM_NAME STREQUAL "Emscripten" AND NOT ${CMAKE_SYSTEM_NAME} MATCHES "AIX") set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -Wl,-rpath='$ORIGIN'") endif() diff --git a/cmake/onnxruntime_mlas.cmake b/cmake/onnxruntime_mlas.cmake index df6553e383620..66f4aea606ef5 100644 --- a/cmake/onnxruntime_mlas.cmake +++ b/cmake/onnxruntime_mlas.cmake @@ -427,12 +427,24 @@ else() ) if(COMPILES_P10) check_cxx_source_compiles(" + #ifdef _AIX + #define POWER_10 0x40000 + #define POWER_10_ANDUP (POWER_10) + #include + #define __power_10_andup() (_system_configuration.implementation & POWER_10_ANDUP) + int main() { + bool HasP10 = (__power_10_andup() && __power_mma_version() == MMA_V31); + return 0; + } + #else #include int main() { unsigned long hwcap2 = getauxval(AT_HWCAP2); bool HasP10 = ((hwcap2 & PPC_FEATURE2_MMA) && (hwcap2 & PPC_FEATURE2_ARCH_3_1)); return 0; - }" + } + } + #endif" HAS_P10_RUNTIME ) if (HAS_P10_RUNTIME) diff --git a/cmake/onnxruntime_providers_cpu.cmake b/cmake/onnxruntime_providers_cpu.cmake index b211c02f712bd..d2afe19f36691 100644 --- a/cmake/onnxruntime_providers_cpu.cmake +++ b/cmake/onnxruntime_providers_cpu.cmake @@ -236,11 +236,6 @@ if (NOT onnxruntime_MINIMAL_BUILD AND NOT onnxruntime_EXTENDED_MINIMAL_BUILD set_target_properties(onnxruntime_providers_shared PROPERTIES FOLDER "ONNXRuntime") set_target_properties(onnxruntime_providers_shared PROPERTIES LINKER_LANGUAGE CXX) - target_compile_definitions(onnxruntime_providers_shared PRIVATE VER_MAJOR=${VERSION_MAJOR_PART}) - target_compile_definitions(onnxruntime_providers_shared PRIVATE VER_MINOR=${VERSION_MINOR_PART}) - target_compile_definitions(onnxruntime_providers_shared PRIVATE VER_BUILD=${VERSION_BUILD_PART}) - target_compile_definitions(onnxruntime_providers_shared PRIVATE VER_PRIVATE=${VERSION_PRIVATE_PART}) - target_compile_definitions(onnxruntime_providers_shared PRIVATE VER_STRING=\"${VERSION_STRING}\") target_compile_definitions(onnxruntime_providers_shared PRIVATE FILE_NAME=\"onnxruntime_providers_shared.dll\") @@ -252,7 +247,9 @@ if (NOT onnxruntime_MINIMAL_BUILD AND NOT 
onnxruntime_EXTENDED_MINIMAL_BUILD if(APPLE) set_property(TARGET onnxruntime_providers_shared APPEND_STRING PROPERTY LINK_FLAGS "-Xlinker -exported_symbols_list ${ONNXRUNTIME_ROOT}/core/providers/shared/exported_symbols.lst") elseif(UNIX) - set_property(TARGET onnxruntime_providers_shared APPEND_STRING PROPERTY LINK_FLAGS "-Xlinker --version-script=${ONNXRUNTIME_ROOT}/core/providers/shared/version_script.lds -Xlinker --gc-sections") + if(NOT ${CMAKE_SYSTEM_NAME} MATCHES "AIX") + set_property(TARGET onnxruntime_providers_shared APPEND_STRING PROPERTY LINK_FLAGS "-Xlinker --version-script=${ONNXRUNTIME_ROOT}/core/providers/shared/version_script.lds -Xlinker --gc-sections") + endif() elseif(WIN32) set_property(TARGET onnxruntime_providers_shared APPEND_STRING PROPERTY LINK_FLAGS "-DEF:${ONNXRUNTIME_ROOT}/core/providers/shared/symbols.def") set(ONNXRUNTIME_PROVIDERS_SHARED onnxruntime_providers_shared) diff --git a/cmake/onnxruntime_providers_migraphx.cmake b/cmake/onnxruntime_providers_migraphx.cmake index 01c4f8b2c8719..d7d83b0ce8d64 100644 --- a/cmake/onnxruntime_providers_migraphx.cmake +++ b/cmake/onnxruntime_providers_migraphx.cmake @@ -19,23 +19,25 @@ endif() # Add search paths for default rocm installation - list(APPEND CMAKE_PREFIX_PATH /opt/rocm/hcc /opt/rocm/hip /opt/rocm) + list(APPEND CMAKE_PREFIX_PATH /opt/rocm/hcc /opt/rocm/hip /opt/rocm $ENV{HIP_PATH}) - find_package(hip) - find_package(migraphx PATHS ${AMD_MIGRAPHX_HOME}) + # Suppress the warning about the small capitals of the package name - Enable when support to CMake 3.27.0 is used + # cmake_policy(SET CMP0144 NEW) - find_package(miopen) - find_package(rocblas) + if(WIN32 AND NOT HIP_PLATFORM) + set(HIP_PLATFORM "amd") + endif() + + find_package(hip REQUIRED) + find_package(migraphx REQUIRED PATHS ${AMD_MIGRAPHX_HOME}) - set(migraphx_libs migraphx::c hip::host MIOpen roc::rocblas) + set(migraphx_libs migraphx::c hip::host) file(GLOB_RECURSE onnxruntime_providers_migraphx_cc_srcs CONFIGURE_DEPENDS "${ONNXRUNTIME_ROOT}/core/providers/migraphx/*.h" "${ONNXRUNTIME_ROOT}/core/providers/migraphx/*.cc" "${ONNXRUNTIME_ROOT}/core/providers/shared_library/*.h" "${ONNXRUNTIME_ROOT}/core/providers/shared_library/*.cc" - "${ONNXRUNTIME_ROOT}/core/providers/rocm/rocm_stream_handle.h" - "${ONNXRUNTIME_ROOT}/core/providers/rocm/rocm_stream_handle.cc" ) source_group(TREE ${ONNXRUNTIME_ROOT}/core FILES ${onnxruntime_providers_migraphx_cc_srcs}) onnxruntime_add_shared_library_module(onnxruntime_providers_migraphx ${onnxruntime_providers_migraphx_cc_srcs}) @@ -46,18 +48,16 @@ set_target_properties(onnxruntime_providers_migraphx PROPERTIES LINKER_LANGUAGE CXX) set_target_properties(onnxruntime_providers_migraphx PROPERTIES FOLDER "ONNXRuntime") target_compile_definitions(onnxruntime_providers_migraphx PRIVATE ONNXIFI_BUILD_LIBRARY=1) - target_compile_options(onnxruntime_providers_migraphx PRIVATE -Wno-error=sign-compare) - set_property(TARGET onnxruntime_providers_migraphx APPEND_STRING PROPERTY COMPILE_FLAGS "-Wno-deprecated-declarations") - set_property(TARGET onnxruntime_providers_migraphx APPEND_STRING PROPERTY LINK_FLAGS "-Xlinker --version-script=${ONNXRUNTIME_ROOT}/core/providers/migraphx/version_script.lds -Xlinker --gc-sections") - target_link_libraries(onnxruntime_providers_migraphx PRIVATE nsync::nsync_cpp) - - include(CheckLibraryExists) - check_library_exists(migraphx::c "migraphx_program_run_async" "/opt/rocm/migraphx/lib" HAS_STREAM_SYNC) - if(HAS_STREAM_SYNC) - target_compile_definitions(onnxruntime_providers_migraphx PRIVATE 
-DMIGRAPHX_STREAM_SYNC) - message(STATUS "MIGRAPHX GPU STREAM SYNC is ENABLED") + if(MSVC) + set_property(TARGET onnxruntime_providers_migraphx APPEND_STRING PROPERTY LINK_FLAGS /DEF:${ONNXRUNTIME_ROOT}/core/providers/migraphx/symbols.def) + target_link_libraries(onnxruntime_providers_migraphx PRIVATE ws2_32) else() - message(STATUS "MIGRAPHX GPU STREAM SYNC is DISABLED") + target_compile_options(onnxruntime_providers_migraphx PRIVATE -Wno-error=sign-compare) + set_property(TARGET onnxruntime_providers_migraphx APPEND_STRING PROPERTY COMPILE_FLAGS "-Wno-deprecated-declarations") + endif() + if(UNIX) + set_property(TARGET onnxruntime_providers_migraphx APPEND_STRING PROPERTY LINK_FLAGS "-Xlinker --version-script=${ONNXRUNTIME_ROOT}/core/providers/migraphx/version_script.lds -Xlinker --gc-sections") + target_link_libraries(onnxruntime_providers_migraphx PRIVATE nsync::nsync_cpp stdc++fs) endif() if (onnxruntime_ENABLE_TRAINING_OPS) @@ -68,8 +68,16 @@ endif() endif() - install(TARGETS onnxruntime_providers_migraphx - ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR} - LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} - RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR} - ) + if(CMAKE_SYSTEM_NAME STREQUAL "Windows") + install(TARGETS onnxruntime_providers_migraphx + ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR} + LIBRARY DESTINATION ${CMAKE_INSTALL_BINDIR} + RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR} + ) + else() + install(TARGETS onnxruntime_providers_migraphx + ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR} + LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} + RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR} + ) + endif() diff --git a/cmake/onnxruntime_providers_openvino.cmake b/cmake/onnxruntime_providers_openvino.cmake index 5876b2b5c448b..d738e29101cfe 100644 --- a/cmake/onnxruntime_providers_openvino.cmake +++ b/cmake/onnxruntime_providers_openvino.cmake @@ -45,11 +45,6 @@ target_include_directories(onnxruntime_providers_openvino SYSTEM PUBLIC ${ONNXRUNTIME_ROOT} ${CMAKE_CURRENT_BINARY_DIR} ${eigen_INCLUDE_DIRS} ${OpenVINO_INCLUDE_DIR} ${OPENVINO_INCLUDE_DIR_LIST} ${PYTHON_INCLUDE_DIRS} $ENV{OPENCL_INCS} $ENV{OPENCL_INCS}/../../cl_headers/) target_link_libraries(onnxruntime_providers_openvino ${ONNXRUNTIME_PROVIDERS_SHARED} Boost::mp11 ${OPENVINO_LIB_LIST} ${ABSEIL_LIBS}) - target_compile_definitions(onnxruntime_providers_openvino PRIVATE VER_MAJOR=${VERSION_MAJOR_PART}) - target_compile_definitions(onnxruntime_providers_openvino PRIVATE VER_MINOR=${VERSION_MINOR_PART}) - target_compile_definitions(onnxruntime_providers_openvino PRIVATE VER_BUILD=${VERSION_BUILD_PART}) - target_compile_definitions(onnxruntime_providers_openvino PRIVATE VER_PRIVATE=${VERSION_PRIVATE_PART}) - target_compile_definitions(onnxruntime_providers_openvino PRIVATE VER_STRING=\"${VERSION_STRING}\") target_compile_definitions(onnxruntime_providers_openvino PRIVATE FILE_NAME=\"onnxruntime_providers_openvino.dll\") if(MSVC) diff --git a/cmake/onnxruntime_unittests.cmake b/cmake/onnxruntime_unittests.cmake index 711a9f77f9094..0159c35d1941b 100644 --- a/cmake/onnxruntime_unittests.cmake +++ b/cmake/onnxruntime_unittests.cmake @@ -1225,6 +1225,9 @@ if (NOT onnxruntime_ENABLE_TRAINING_TORCH_INTEROP) if (CMAKE_SYSTEM_NAME STREQUAL "Android") list(APPEND onnxruntime_perf_test_libs ${android_shared_libs}) endif() + if (${CMAKE_SYSTEM_NAME} MATCHES "AIX") + list(APPEND onnxruntime_perf_test_libs onnxruntime_graph onnxruntime_session onnxruntime_providers onnxruntime_framework onnxruntime_util onnxruntime_mlas onnxruntime_optimizer onnxruntime_flatbuffers 
iconv re2 gtest absl_failure_signal_handler absl_examine_stack absl_flags_parse absl_flags_usage absl_flags_usage_internal) + endif() target_link_libraries(onnxruntime_perf_test PRIVATE ${onnxruntime_perf_test_libs} Threads::Threads) if(WIN32) target_link_libraries(onnxruntime_perf_test PRIVATE debug dbghelp advapi32) @@ -1275,6 +1278,10 @@ if (NOT onnxruntime_ENABLE_TRAINING_TORCH_INTEROP) list(APPEND onnxruntime_shared_lib_test_LIBS ${android_shared_libs}) endif() + if (${CMAKE_SYSTEM_NAME} MATCHES "AIX") + list(APPEND onnxruntime_shared_lib_test_LIBS onnxruntime_graph onnxruntime_session onnxruntime_providers onnxruntime_framework onnxruntime_util onnxruntime_mlas onnxruntime_optimizer onnxruntime_flatbuffers iconv re2) + endif() + AddTest(DYN TARGET onnxruntime_shared_lib_test SOURCES ${onnxruntime_shared_lib_test_SRC} ${onnxruntime_unittest_main_src} @@ -1510,7 +1517,7 @@ if (NOT CMAKE_SYSTEM_NAME STREQUAL "Emscripten") if(UNIX) if (APPLE) set(ONNXRUNTIME_CUSTOM_OP_LIB_LINK_FLAG "-Xlinker -dead_strip") - else() + elseif(NOT ${CMAKE_SYSTEM_NAME} MATCHES "AIX") set(ONNXRUNTIME_CUSTOM_OP_LIB_LINK_FLAG "-Xlinker --version-script=${TEST_SRC_DIR}/testdata/custom_op_library/custom_op_library.lds -Xlinker --no-undefined -Xlinker --gc-sections -z noexecstack") endif() else() @@ -1574,6 +1581,9 @@ if (NOT CMAKE_SYSTEM_NAME STREQUAL "Emscripten") if (onnxruntime_USE_TENSORRT) list(APPEND onnxruntime_customopregistration_test_LIBS ${TENSORRT_LIBRARY_INFER}) endif() + if (${CMAKE_SYSTEM_NAME} MATCHES "AIX") + list(APPEND onnxruntime_customopregistration_test_LIBS onnxruntime_graph onnxruntime_session onnxruntime_providers onnxruntime_framework onnxruntime_util onnxruntime_mlas onnxruntime_optimizer onnxruntime_flatbuffers iconv re2 libprotobuf-lite onnx_proto nsync_cpp) + endif() AddTest(DYN TARGET onnxruntime_customopregistration_test SOURCES ${onnxruntime_customopregistration_test_SRC} ${onnxruntime_unittest_main_src} @@ -1608,7 +1618,7 @@ if (NOT CMAKE_SYSTEM_NAME STREQUAL "Emscripten" AND (NOT onnxruntime_MINIMAL_BUI if(UNIX) if (APPLE) set(ONNXRUNTIME_CUSTOM_OP_INVALID_LIB_LINK_FLAG "-Xlinker -dead_strip") - else() + elseif (NOT ${CMAKE_SYSTEM_NAME} MATCHES "AIX") string(CONCAT ONNXRUNTIME_CUSTOM_OP_INVALID_LIB_LINK_FLAG "-Xlinker --version-script=${TEST_SRC_DIR}/testdata/custom_op_invalid_library/custom_op_library.lds " "-Xlinker --no-undefined -Xlinker --gc-sections -z noexecstack") @@ -1639,7 +1649,7 @@ if (NOT CMAKE_SYSTEM_NAME STREQUAL "Emscripten" AND (NOT onnxruntime_MINIMAL_BUI if(UNIX) if (APPLE) set(ONNXRUNTIME_CUSTOM_OP_GET_CONST_INPUT_TEST_LIB_LINK_FLAG "-Xlinker -dead_strip") - else() + elseif(NOT ${CMAKE_SYSTEM_NAME} MATCHES "AIX") string(CONCAT ONNXRUNTIME_CUSTOM_OP_GET_CONST_INPUT_TEST_LIB_LINK_FLAG "-Xlinker --version-script=${TEST_SRC_DIR}/testdata/custom_op_get_const_input_test_library/custom_op_lib.lds " "-Xlinker --no-undefined -Xlinker --gc-sections -z noexecstack") @@ -1671,7 +1681,7 @@ if (NOT CMAKE_SYSTEM_NAME STREQUAL "Emscripten" AND (NOT onnxruntime_MINIMAL_BUI if(UNIX) if (APPLE) set(ONNXRUNTIME_CUSTOM_OP_lOCAL_FUNCTION_TEST_LIB_LINK_FLAG "-Xlinker -dead_strip") - else() + elseif(NOT ${CMAKE_SYSTEM_NAME} MATCHES "AIX") string(CONCAT ONNXRUNTIME_CUSTOM_OP_lOCAL_FUNCTION_TEST_LIB_LINK_FLAG "-Xlinker --version-script=${TEST_SRC_DIR}/testdata/custom_op_local_function/custom_op_local_function.lds " "-Xlinker --no-undefined -Xlinker --gc-sections -z noexecstack") @@ -1690,6 +1700,9 @@ if (onnxruntime_BUILD_SHARED_LIB AND NOT CMAKE_SYSTEM_NAME STREQUAL "Emscripten" 
${ONNXRUNTIME_LOGGING_APIS_TEST_SRC_DIR}/test_logging_apis.cc) set(onnxruntime_logging_apis_test_LIBS onnxruntime_common onnxruntime_test_utils) + if (${CMAKE_SYSTEM_NAME} MATCHES "AIX") + list(APPEND onnxruntime_logging_apis_test_LIBS onnxruntime_session onnxruntime_util onnxruntime_framework onnxruntime_common onnxruntime_graph onnxruntime_providers onnxruntime_mlas onnxruntime_optimizer onnxruntime_flatbuffers iconv re2 libprotobuf-lite onnx_proto nsync_cpp) + endif() if(NOT WIN32) list(APPEND onnxruntime_logging_apis_test_LIBS nsync::nsync_cpp ${CMAKE_DL_LIBS}) @@ -1753,7 +1766,9 @@ if (NOT onnxruntime_MINIMAL_BUILD AND NOT onnxruntime_EXTENDED_MINIMAL_BUILD if(APPLE) set_property(TARGET test_execution_provider APPEND_STRING PROPERTY LINK_FLAGS "-Xlinker -exported_symbols_list ${REPO_ROOT}/onnxruntime/test/testdata/custom_execution_provider_library/exported_symbols.lst") elseif(UNIX) - set_property(TARGET test_execution_provider APPEND_STRING PROPERTY LINK_FLAGS "-Xlinker --version-script=${REPO_ROOT}/onnxruntime/test/testdata/custom_execution_provider_library/version_script.lds -Xlinker --gc-sections -Xlinker -rpath=\\$ORIGIN") + if (NOT ${CMAKE_SYSTEM_NAME} MATCHES "AIX") + set_property(TARGET test_execution_provider APPEND_STRING PROPERTY LINK_FLAGS "-Xlinker --version-script=${REPO_ROOT}/onnxruntime/test/testdata/custom_execution_provider_library/version_script.lds -Xlinker --gc-sections -Xlinker -rpath=\\$ORIGIN") + endif() elseif(WIN32) set_property(TARGET test_execution_provider APPEND_STRING PROPERTY LINK_FLAGS "-DEF:${REPO_ROOT}/onnxruntime/test/testdata/custom_execution_provider_library/symbols.def") else() diff --git a/cmake/patches/flatbuffers/flatbuffers.patch b/cmake/patches/flatbuffers/flatbuffers.patch index fbe8db37ecb0e..9fb58e301bba8 100644 --- a/cmake/patches/flatbuffers/flatbuffers.patch +++ b/cmake/patches/flatbuffers/flatbuffers.patch @@ -10,3 +10,21 @@ index 3987eac9..5e5462f1 100644 + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${FLATBUFFERS_CXX_FLAGS} -Wno-error=stringop-overflow") endif() message(STATUS "CMAKE_CXX_FLAGS: ${CMAKE_CXX_FLAGS}") +diff --git a/include/flatbuffers/flatbuffers.h b/include/flatbuffers/flatbuffers.h +index bc828a31..3d3effe8 100644 +--- a/include/flatbuffers/flatbuffers.h ++++ b/include/flatbuffers/flatbuffers.h +@@ -213,7 +213,12 @@ inline const char * const *ElementaryTypeNames() { + // We're explicitly defining the signedness since the signedness of integer + // bitfields is otherwise implementation-defined and causes warnings on older + // GCC compilers. 
+-struct TypeCode { ++ ++struct ++#if defined(_AIX) && defined(__clang__) ++__attribute__((packed)) ++#endif ++TypeCode { + // ElementaryType + unsigned short base_type : 4; + // Either vector (in table) or array (in struct) diff --git a/cmake/winml.cmake b/cmake/winml.cmake index d74250b962628..ff6b71217ad87 100644 --- a/cmake/winml.cmake +++ b/cmake/winml.cmake @@ -718,11 +718,6 @@ target_compile_definitions(winml_dll PRIVATE ONNX_ML) target_compile_definitions(winml_dll PRIVATE LOTUS_LOG_THRESHOLD=2) target_compile_definitions(winml_dll PRIVATE LOTUS_ENABLE_STDERR_LOGGING) target_compile_definitions(winml_dll PRIVATE PLATFORM_WINDOWS) -target_compile_definitions(winml_dll PRIVATE VER_MAJOR=${VERSION_MAJOR_PART}) -target_compile_definitions(winml_dll PRIVATE VER_MINOR=${VERSION_MINOR_PART}) -target_compile_definitions(winml_dll PRIVATE VER_BUILD=${VERSION_BUILD_PART}) -target_compile_definitions(winml_dll PRIVATE VER_PRIVATE=${VERSION_PRIVATE_PART}) -target_compile_definitions(winml_dll PRIVATE VER_STRING=\"${VERSION_STRING}\") target_compile_definitions(winml_dll PRIVATE BINARY_NAME=\"${BINARY_NAME}\") if (onnxruntime_WINML_NAMESPACE_OVERRIDE STREQUAL "Windows") diff --git a/include/onnxruntime/core/graph/basic_types.h b/include/onnxruntime/core/graph/basic_types.h index 36984d0405bbd..cdd5e4c1e571b 100644 --- a/include/onnxruntime/core/graph/basic_types.h +++ b/include/onnxruntime/core/graph/basic_types.h @@ -19,6 +19,8 @@ class TensorProto; class SparseTensorProto; class TypeProto; class AttributeProto; +class FunctionProto; +class OperatorSetIdProto; // define types that would come from the ONNX library if we were building against it. #if defined(ORT_MINIMAL_BUILD) using OperatorSetVersion = int; diff --git a/java/build-android.gradle b/java/build-android.gradle index afbad9f03d08d..fd22fa27e8db9 100644 --- a/java/build-android.gradle +++ b/java/build-android.gradle @@ -105,7 +105,7 @@ task sourcesJar(type: Jar) { task javadoc(type: Javadoc) { source = android.sourceSets.main.java.srcDirs - classpath += project.files(android.getBootClasspath().join(File.pathSeparator)) + classpath += project.files(android.getBootClasspath()) } task javadocJar(type: Jar, dependsOn: javadoc) { diff --git a/js/web/lib/wasm/jsep/webgpu/ops/3rd-party/matmul_packed_webgpu.ts b/js/web/lib/wasm/jsep/webgpu/ops/3rd-party/matmul_packed_webgpu.ts index 29c7941e6bd30..9b37247167bab 100644 --- a/js/web/lib/wasm/jsep/webgpu/ops/3rd-party/matmul_packed_webgpu.ts +++ b/js/web/lib/wasm/jsep/webgpu/ops/3rd-party/matmul_packed_webgpu.ts @@ -328,13 +328,6 @@ fn main(@builtin(local_invocation_id) localId : vec3, var kStart = ${splitK ? `i32(globalId.z) * ${splitedDimInner}` : '0'}; var acc : array, rowPerThread>; - - // Without this initialization strange values show up in acc. - for (var innerRow = 0; innerRow < rowPerThread; innerRow = innerRow + 1) { - for (var innerCol = 0; innerCol < colPerThread; innerCol = innerCol + 1) { - acc[innerRow][innerCol] = 0.0; - } - } ${matmulSnippet} } `; diff --git a/onnxruntime/contrib_ops/cpu/murmur_hash3.cc b/onnxruntime/contrib_ops/cpu/murmur_hash3.cc index ec504d215920f..000c590f32616 100644 --- a/onnxruntime/contrib_ops/cpu/murmur_hash3.cc +++ b/onnxruntime/contrib_ops/cpu/murmur_hash3.cc @@ -8,6 +8,8 @@ /* Modifications Copyright (c) Microsoft. 
*/ #include "contrib_ops/cpu/murmur_hash3.h" +#include +#include // Platform-specific functions and macros @@ -60,11 +62,31 @@ inline uint64_t rotl64(uint64_t x, int8_t r) { // handle aligned reads, do the conversion here FORCE_INLINE uint32_t getblock(const uint32_t* p, int i) { - return p[i]; + if constexpr (onnxruntime::endian::native == onnxruntime::endian::little) { + return p[i]; + } else { + const uint8_t* c = (const uint8_t*)&p[i]; + return (uint32_t)c[0] | + (uint32_t)c[1] << 8 | + (uint32_t)c[2] << 16 | + (uint32_t)c[3] << 24; + } } FORCE_INLINE uint64_t getblock(const uint64_t* p, int i) { - return p[i]; + if constexpr (onnxruntime::endian::native == onnxruntime::endian::little) { + return p[i]; + } else { + const uint8_t* c = (const uint8_t*)&p[i]; + return (uint64_t)c[0] | + (uint64_t)c[1] << 8 | + (uint64_t)c[2] << 16 | + (uint64_t)c[3] << 24 | + (uint64_t)c[4] << 32 | + (uint64_t)c[5] << 40 | + (uint64_t)c[6] << 48 | + (uint64_t)c[7] << 56; + } } //----------------------------------------------------------------------------- @@ -204,13 +226,35 @@ Status MurmurHash3::Compute(OpKernelContext* ctx) const { int input_num_bytes = static_cast(input_element_bytes); ORT_ENFORCE(input_num_bytes % 4 == 0); const auto input_end = input + input_count * input_num_bytes; - while (input != input_end) { - MurmurHash3_x86_32(input, - input_num_bytes, - seed_, - output); - input += input_num_bytes; - ++output; + + if constexpr (onnxruntime::endian::native == onnxruntime::endian::little) { + while (input != input_end) { + MurmurHash3_x86_32(input, + input_num_bytes, + seed_, + output); + input += input_num_bytes; + ++output; + } + } else { + // Big endian platform require byte swapping. + auto raw_data = std::make_unique(input_num_bytes); + char* raw_data_ptr = raw_data.get(); + while (input != input_end) { + memcpy(raw_data_ptr, input, input_num_bytes); + char* start_byte = raw_data_ptr; + char* end_byte = start_byte + input_num_bytes - 1; + for (size_t count = 0; count < static_cast(input_num_bytes / 2); ++count) { + std::swap(*start_byte++, *end_byte--); + } + + MurmurHash3_x86_32(raw_data_ptr, + input_num_bytes, + seed_, + output); + input += input_num_bytes; + ++output; + } } } return Status::OK(); diff --git a/onnxruntime/contrib_ops/cpu/quantization/matmul_nbits_impl.cc b/onnxruntime/contrib_ops/cpu/quantization/matmul_nbits_impl.cc index 7e343d85f4048..b28f3758f89b5 100644 --- a/onnxruntime/contrib_ops/cpu/quantization/matmul_nbits_impl.cc +++ b/onnxruntime/contrib_ops/cpu/quantization/matmul_nbits_impl.cc @@ -40,6 +40,13 @@ void Dequantize4BitsKernelReOrder( } T* output_i = output + out_y * out_cols + out_x; uint32_t quant_value = *(reinterpret_cast(quant_data + element_offset / 2)); + if constexpr (onnxruntime::endian::native == onnxruntime::endian::big) { + const uint8_t* c = (const uint8_t*)(&quant_value); + quant_value = (uint32_t)c[0] | + (uint32_t)c[1] << 8 | + (uint32_t)c[2] << 16 | + (uint32_t)c[3] << 24; + } const int remain_x = std::min(8, out_cols - out_x); const int32_t* reorder_idx_with_off = reorder_idx + kb_idx * block_size + ((threadIdx_x * 8) & (block_size - 1)); for (int i = 0; i < remain_x; i++) { diff --git a/onnxruntime/core/framework/tensorprotoutils.cc b/onnxruntime/core/framework/tensorprotoutils.cc index e8086877a9159..4ecd61962d797 100644 --- a/onnxruntime/core/framework/tensorprotoutils.cc +++ b/onnxruntime/core/framework/tensorprotoutils.cc @@ -6,6 +6,7 @@ #include #include #include +#include #include #if defined(__wasm__) #include @@ -260,7 +261,89 @@ 
Status TensorProtoToOrtValueImpl(const Env& env, const std::filesystem::path& mo namespace utils { +void SetRawDataInTensorProto(ONNX_NAMESPACE::TensorProto& tensor_proto, std::string&& param) { + tensor_proto.set_raw_data(std::move(param)); +} + +void ConvertRawDataInTensorProto(TensorProto* tensor) { + size_t element_size = 1; + char* bytes = NULL; + size_t num_elements = 0; + switch (tensor->data_type()) { + case TensorProto_DataType_FLOAT: + bytes = reinterpret_cast(tensor->mutable_float_data()->mutable_data()); + num_elements = tensor->float_data_size(); + element_size = sizeof(float); + break; + + case TensorProto_DataType_INT32: + bytes = reinterpret_cast(tensor->mutable_int32_data()->mutable_data()); + num_elements = tensor->int32_data_size(); + element_size = sizeof(int32_t); + break; + + case TensorProto_DataType_UINT32: + bytes = reinterpret_cast(tensor->mutable_int32_data()->mutable_data()); + num_elements = tensor->int32_data_size(); + element_size = sizeof(uint32_t); + break; + + case TensorProto_DataType_UINT8: + case TensorProto_DataType_INT8: + bytes = reinterpret_cast(tensor->mutable_int32_data()->mutable_data()); + num_elements = tensor->int32_data_size(); + element_size = sizeof(uint8_t); + break; + + case TensorProto_DataType_UINT16: + case TensorProto_DataType_INT16: + case TensorProto_DataType_FLOAT16: + case TensorProto_DataType_BFLOAT16: + bytes = reinterpret_cast(tensor->mutable_int32_data()->mutable_data()); + num_elements = tensor->int32_data_size(); + element_size = sizeof(uint16_t); + break; + + case TensorProto_DataType_UINT64: + bytes = reinterpret_cast(tensor->mutable_uint64_data()->mutable_data()); + num_elements = tensor->uint64_data_size(); + element_size = sizeof(uint64_t); + break; + + case TensorProto_DataType_DOUBLE: + bytes = reinterpret_cast(tensor->mutable_double_data()->mutable_data()); + num_elements = tensor->double_data_size(); + element_size = sizeof(double); + break; + + case TensorProto_DataType_INT64: + bytes = reinterpret_cast(tensor->mutable_int64_data()->mutable_data()); + num_elements = tensor->int64_data_size(); + element_size = sizeof(int64_t); + break; + + case TensorProto_DataType_COMPLEX64: + bytes = reinterpret_cast(tensor->mutable_float_data()->mutable_data()); + num_elements = tensor->float_data_size(); + element_size = sizeof(float); + break; + } + if (tensor->has_raw_data()) { + num_elements = (tensor->raw_data().size()) / element_size; + bytes = const_cast(tensor->mutable_raw_data()->c_str()); + } + for (size_t i = 0; i < num_elements; ++i) { + char* start_byte = bytes + i * element_size; + char* end_byte = start_byte + element_size - 1; + for (size_t count = 0; count < element_size / 2; ++count) { + std::swap(*start_byte++, *end_byte--); + } + } + return; +} + #if !defined(ORT_MINIMAL_BUILD) + static Status UnpackTensorWithExternalDataImpl(const ONNX_NAMESPACE::TensorProto& tensor, const std::filesystem::path& tensor_proto_dir, size_t expected_num_elements, size_t element_size, @@ -1159,11 +1242,6 @@ ONNXTensorElementDataType GetTensorElementType(const ONNX_NAMESPACE::TensorProto } ONNX_NAMESPACE::TensorProto TensorToTensorProto(const Tensor& tensor, const std::string& tensor_proto_name) { - // Given we are using the raw_data field in the protobuf, this will work only for little-endian format. - if constexpr (endian::native != endian::little) { - ORT_THROW("Big endian not supported"); - } - // Set name, dimensions, type, and data of the TensorProto. 
ONNX_NAMESPACE::TensorProto tensor_proto; @@ -1182,7 +1260,7 @@ ONNX_NAMESPACE::TensorProto TensorToTensorProto(const Tensor& tensor, const std: *mutable_string_data->Add() = *f; } } else { - tensor_proto.set_raw_data(tensor.DataRaw(), tensor.SizeInBytes()); + utils::SetRawDataInTensorProto(tensor_proto, tensor.DataRaw(), tensor.SizeInBytes()); } return tensor_proto; @@ -1464,8 +1542,7 @@ common::Status SparseTensorProtoToDenseTensorProto(const ONNX_NAMESPACE::SparseT ORT_RETURN_IF_ERROR(status); } - dense.set_raw_data(std::move(dense_data_storage)); - + utils::SetRawDataInTensorProto(dense, std::move(dense_data_storage)); } else { // No request for std::string status = ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "Unsupported sparse tensor data type of ", @@ -1510,7 +1587,17 @@ static void SetIndices(gsl::span gathered_indices, std::string& raw_ind } else { auto* dst = ind_dest + dest_index; T v = static_cast(src_index); - memcpy(dst, &v, sizeof(T)); + if constexpr (endian::native != endian::little) { + auto src = gsl::make_span(static_cast( + reinterpret_cast(&v)), + sizeof(T)); + auto dest = gsl::make_span(static_cast( + reinterpret_cast(dst)), + sizeof(T)); + onnxruntime::utils::SwapByteOrderCopy(sizeof(T), src, dest); + } else { + memcpy(dst, &v, sizeof(T)); + } } ++dest_index; } @@ -1561,7 +1648,7 @@ static void SparsifyGeneric(const void* dense_raw_data, size_t n_dense_elements, } } else { indices.set_data_type(ONNX_NAMESPACE::TensorProto_DataType_INT8); - indices.set_raw_data(std::string()); + utils::SetRawDataInTensorProto(indices, std::string()); } nnz = gathered_indices.size(); } diff --git a/onnxruntime/core/framework/tensorprotoutils.h b/onnxruntime/core/framework/tensorprotoutils.h index a66caf1ace33b..aabfc0487f3e0 100644 --- a/onnxruntime/core/framework/tensorprotoutils.h +++ b/onnxruntime/core/framework/tensorprotoutils.h @@ -5,6 +5,7 @@ #include #include +#include #include #ifndef SHARED_PROVIDER @@ -19,6 +20,46 @@ #include "core/graph/onnx_protobuf.h" #include "core/platform/env.h" +namespace onnxruntime { +namespace utils { +/** + * This function is used to convert the endianess of Tensor data. + * Mostly, will be used in big endian system to support the model file + * generated on little endian system. + * @param initializer given initializer tensor + * @returns None + */ +void ConvertRawDataInTensorProto(ONNX_NAMESPACE::TensorProto* initializer); + +/** + * Wrapper function for set_raw_data. + * First calls the set_raw_data and then calls ConvertRawDataInTensorProto + * under big endian system. + * @param tensor_proto given initializer tensor + * @param raw_data source raw_data pointer + * @param raw_data_len length of raw_data + * @returns None + */ +template +void SetRawDataInTensorProto(ONNX_NAMESPACE::TensorProto& tensor_proto, T1* raw_data, T2 raw_data_len) { + using namespace ONNX_NAMESPACE; + tensor_proto.set_raw_data(raw_data, raw_data_len); + if constexpr (endian::native != endian::little) { + utils::ConvertRawDataInTensorProto((ONNX_NAMESPACE::TensorProto*)&tensor_proto); + } +} + +/** + * Overload Wrapper function for set_raw_data handling string object. + * Forward the string object to set_raw_data. 
+ * @param tensor_proto given initializer tensor + * @param param string object reference + * @returns None + */ +void SetRawDataInTensorProto(ONNX_NAMESPACE::TensorProto& tensor_proto, std::string&& param); +} // namespace utils +} // namespace onnxruntime + namespace ONNX_NAMESPACE { class TensorProto; class TensorShapeProto; diff --git a/onnxruntime/core/graph/graph.cc b/onnxruntime/core/graph/graph.cc index f73a50db7aaa4..442a0db933d65 100644 --- a/onnxruntime/core/graph/graph.cc +++ b/onnxruntime/core/graph/graph.cc @@ -1199,6 +1199,15 @@ Graph::Graph(const Model& owning_model, const gsl::not_null tensor{graph_proto_->add_initializer()}; auto status = utils::ConstantNodeProtoToTensorProto(node, model_path, *tensor); + if constexpr (endian::native != endian::little) { + const AttributeProto& attrib = node.attribute(0); + if (attrib.type() == AttributeProto_AttributeType_SPARSE_TENSOR) { + const TensorProto& sparse_values = node.attribute(0).sparse_tensor().values(); + if ((!(sparse_values.has_raw_data())) && tensor->has_raw_data()) { + onnxruntime::utils::ConvertRawDataInTensorProto(tensor); + } + } + } ORT_ENFORCE(status.IsOK(), status.ToString()); // Ensure initializers are also graph inputs. if (ir_version_ < 4) { @@ -3716,6 +3725,12 @@ SaveInputsOutputsToOrtFormat(flatbuffers::FlatBufferBuilder& builder, const std: common::Status Graph::SaveToOrtFormat(flatbuffers::FlatBufferBuilder& builder, flatbuffers::Offset& fbs_graph) const { + if constexpr (endian::native != endian::little) { + auto& tens = GetAllInitializedTensors(); + for (auto& [name, tensor_p] : tens) { + utils::ConvertRawDataInTensorProto(const_cast(tensor_p)); + } + } auto inputs = SaveInputsOutputsToOrtFormat(builder, graph_inputs_including_initializers_); auto outputs = SaveInputsOutputsToOrtFormat(builder, graph_outputs_); diff --git a/onnxruntime/core/mlas/inc/mlas_q4.h b/onnxruntime/core/mlas/inc/mlas_q4.h index 898fb23cf3e4f..aec14070ffd55 100644 --- a/onnxruntime/core/mlas/inc/mlas_q4.h +++ b/onnxruntime/core/mlas/inc/mlas_q4.h @@ -360,12 +360,12 @@ MlasDequantizeBlockwise( ); /** - * @brief Blockwise 2 bits or 4 bits quantization. After quantization, the weights and zero points - * are packed row-wise. In terms of the qbits type, dst and src have the same shape, and - * scales and zero_points have the same shape. - * columns must be multiple of 8 / qbits. + * @brief Blockwise 4 bits quantization. After quantization, the weights and zero points + * are packed row-wise. If zero_points is null, quantized type is int4 with default + * zero point 0, to align with DQ schema. Otherwise, quantized type is uint4. + * In int4/uint4, dst have the same shape as src, and zero_points have the same shape as scales. * @tparam Tin - * @tparam qbits number of bits used for quantization, 2 or 4 + * @tparam qbits number of bits used for quantization, only 4 is supported * @param src points to the floating point matrix, to be quantized, row major shape [rows, columns] * @param scales points to the scales matrix, row major * @param zero_points points to the zero_points matrix, row major @@ -376,9 +376,10 @@ MlasDequantizeBlockwise( * @param columns * @param quant_block_size number of elements in a quantize block * @param thread_pool + * @return the quantized type is signed. */ template -void +bool MlasQDQQuantizeBlockwise( const Tin* src, Tin* scales, @@ -395,8 +396,17 @@ MlasQDQQuantizeBlockwise( * @brief Transpose blockwise quantized tensors. The src tensors are row major. src weights and zero * points are packed row-wise. 
The dst tensors are column major. dst weights and zero points * are packed column-wise. + * dst_weights and dst_zero_points are in uint4. + * If src_weights is int4 and has src_zero_points, src_weights and src_zero_points are + * converted to uint4 by adding 8. + * If src_weights is int4 and no src_zero_points, src_weights is converted to uint4 by adding 8. + * src_zero_points is 0 and dst_zero_points is 8. + * If src_weights is uint4 and has src_zero_points, just transpose. + * If src_weights is uint4 and no src_zero_points, caller must allocate dst_zero_points with + * 0 values. Otherwise exception is thrown. * @tparam Tin - * @tparam qbits number of bits used for quantization, 2 or 4 + * @tparam qbits number of bits used for quantization, only 4 is supported + * @tparam signed_quant true when quantized type is signed, false when quantized type is unsigned * @param src_weights points to the quantized matrix, row major, shape [rows, columns] in qbits type. * In uint8_t type, shape is [rows, columns * qbits / 8]. * @param src_scales points to the scales matrix, row major @@ -410,7 +420,7 @@ MlasQDQQuantizeBlockwise( * @param quant_block_size number of elements in a quantize block * @param thread_pool */ -template +template void MlasQDQTransposeBlockwiseQuantized( const uint8_t* src_weights, diff --git a/onnxruntime/core/mlas/lib/platform.cpp b/onnxruntime/core/mlas/lib/platform.cpp index 72eb35c894094..859b7c2f560a4 100644 --- a/onnxruntime/core/mlas/lib/platform.cpp +++ b/onnxruntime/core/mlas/lib/platform.cpp @@ -20,8 +20,15 @@ Module Name: #include #include -#if defined(MLAS_TARGET_POWER) && defined(__linux__) +#if defined(MLAS_TARGET_POWER) +#if defined(__linux__) #include +#elif defined(_AIX) +#define POWER_10 0x40000 +#define POWER_10_ANDUP (POWER_10) +#include +#define __power_10_andup() (_system_configuration.implementation & POWER_10_ANDUP) +#endif #endif #if defined(MLAS_TARGET_ARM64) @@ -554,6 +561,9 @@ Return Value: unsigned long hwcap2 = getauxval(AT_HWCAP2); bool HasP9Instructions = hwcap2 & PPC_FEATURE2_ARCH_3_00; +#elif defined(_AIX) + bool HasP9Instructions = __power_9_andup(); +#endif // __linux__ if (HasP9Instructions) { this->QuantizeLinearS8Kernel = MlasQuantizeLinearS8KernelVSX; this->QuantizeLinearU8Kernel = MlasQuantizeLinearU8KernelVSX; @@ -562,7 +572,11 @@ Return Value: #if defined(POWER10) #if (defined(__GNUC__) && ((__GNUC__ > 10) || (__GNUC__== 10 && __GNUC_MINOR__ >= 2))) || \ (defined(__clang__) && (__clang_major__ >= 12)) +#if defined(__linux__) bool HasP10Instructions = ((hwcap2 & PPC_FEATURE2_MMA) && (hwcap2 & PPC_FEATURE2_ARCH_3_1)); +#elif defined(_AIX) + bool HasP10Instructions = (__power_10_andup() && __power_mma_version() == MMA_V31); +#endif // __linux__ if (HasP10Instructions) { this->GemmFloatKernel = MlasSgemmKernelPOWER10; this->GemmDoubleKernel = MlasDgemmKernelPOWER10; @@ -571,7 +585,6 @@ Return Value: #endif #endif -#endif // __linux__ #endif // MLAS_TARGET_POWER #if defined(MLAS_TARGET_LARCH64) @@ -676,7 +689,6 @@ MlasPlatformU8S8Overflow( } #endif - thread_local size_t ThreadedBufSize = 0; #ifdef _MSC_VER thread_local std::unique_ptr ThreadedBufHolder(nullptr, &_aligned_free); diff --git a/onnxruntime/core/mlas/lib/power/qgemm_kernel_power10.cpp b/onnxruntime/core/mlas/lib/power/qgemm_kernel_power10.cpp index a67be1dbfa710..0f3bc1d579711 100644 --- a/onnxruntime/core/mlas/lib/power/qgemm_kernel_power10.cpp +++ b/onnxruntime/core/mlas/lib/power/qgemm_kernel_power10.cpp @@ -874,10 +874,18 @@ MlasQgemmStoreVectorMMA { size_t RowCount; __vector 
signed int vsum0, vsum1, vsum2, vsum3; +#if defined(_AIX) && defined(__clang__) + __vector signed int columnsum = *reinterpret_cast(&ColumnSumBuffer[pos]); +#else __vector signed int columnsum = *reinterpret_cast(&ColumnSumBuffer[pos]); +#endif C += VectorCount; if (ZeroPointB != nullptr) { +#if defined(_AIX) && defined(__clang__) + __vector signed int zeropoint = *reinterpret_cast(&ZeroPointB[pos]); +#else __vector signed int zeropoint = *reinterpret_cast(&ZeroPointB[pos]); +#endif if (ZeroMode) { for (RowCount = 0; RowCount + 4 <= row; RowCount += 4, C += ldc*4) { vsum0 = vec_splats(RowSumBuffer[RowCount + 0]) * zeropoint + columnsum; diff --git a/onnxruntime/core/mlas/lib/q4_dq.cpp b/onnxruntime/core/mlas/lib/q4_dq.cpp index 62fe58ca333de..015d69de68766 100644 --- a/onnxruntime/core/mlas/lib/q4_dq.cpp +++ b/onnxruntime/core/mlas/lib/q4_dq.cpp @@ -314,14 +314,18 @@ struct Shape2D { }; -template +template struct BitsTraits { static_assert(qbits <= 8, "Only BitsTraits are for small number of bits!"); static constexpr int kBits = qbits; - static constexpr int kMax = (1 << qbits) - 1; - static constexpr int kMid = 1 << (qbits - 1); + static constexpr int kMax = signed_quant ? (1 << (qbits -1)) - 1 : (1 << qbits) - 1; + static constexpr int kMid = signed_quant ? 0 : (1 << (qbits - 1)); + static constexpr int kMin = signed_quant ? -(1 << (qbits - 1)) : 0; static constexpr float kMaxFp = static_cast(kMax); + static constexpr float kMinFp = static_cast(kMin); + static constexpr float fullRange = kMaxFp - kMinFp; + static constexpr float halfRange = static_cast(kMid - kMin); // number of qbit elements to pack into whole bytes static constexpr int kPackSize = (qbits == 8) ? 1 : (qbits == 4) ? 2 : (qbits == 2) ? 4 : 0; @@ -331,53 +335,54 @@ struct BitsTraits { /** * @brief Rectify min/max from a set of weights, and convert to scale and zero point - * for quantization - * @tparam ScaleT type of scale, usually floating point of various bits - * @tparam qbits number of int bits used for zero point value + * for quantization. + * @tparam ScaleT type of scale, usually floating point of various bits + * @tparam qbits number of int bits used for zero point value + * @tparam signed_quant output quantized type is signed * @param[in] min * @param[in] max * @param[out] scale * @param[out] zp */ -template +template MLAS_FORCEINLINE void range2scalezp(float min, float max, ScaleT& scale, uint8_t& zp) { - constexpr int zp_max = BitsTraits::kMax; - constexpr float zp_max_fp = BitsTraits::kMaxFp; - min = std::min(min, 0.0f); max = std::max(max, 0.0f); - float scale_f = (max - min) / zp_max; + float scale_f = (max - min) / BitsTraits::fullRange; float zero_point_fp = min; if (scale_f != 0.0f) { - zero_point_fp = 0.f - min / scale_f; + zero_point_fp = BitsTraits::kMinFp - min / scale_f; } - if (zero_point_fp < 0.0f) { - zp = 0; - } else if (zero_point_fp > zp_max_fp) { - zp = zp_max; + if (zero_point_fp < BitsTraits::kMinFp) { + zp = static_cast(BitsTraits::kMin); + } else if (zero_point_fp > BitsTraits::kMaxFp) { + zp = static_cast(BitsTraits::kMax); } else { zp = (uint8_t)roundf(zero_point_fp); } scale = ScaleT(scale_f); } -template +/** + * @brief Rectify min/max from a set of symmetric weights, and convert + * to scale for quantization. + */ +template MLAS_FORCEINLINE void range2scale(float min, float max, ScaleT& scale) { - constexpr int mid_v = BitsTraits::kMid; - constexpr float mid_fp = static_cast(-mid_v); - max = fabsf(max) > fabsf(min) ? 
max : min; - - scale = ScaleT(max / mid_fp); + // !!Note: in the quantized space, abs of min -8 > abs of max 7. + // Therefore map the larger half FP space to [-8, 0]. + // Minus sign achieves this purpose. + scale = ScaleT(-max / BitsTraits::halfRange); }; @@ -400,7 +405,7 @@ struct BlockwiseQuantizer { static_assert(qbits == 4, "Only 4b block quantization is supported!"); using QuantBlk = std::conditional_t, Shape2D<1, block_size>>; - using ThreadBlk = Shape2D::kPackSize, QuantBlk::kColumn>; + using ThreadBlk = Shape2D::kPackSize, QuantBlk::kColumn>; static MLAS_FORCEINLINE @@ -474,8 +479,8 @@ struct BlockwiseQuantizer { MlasTryBatchParallel( thread_pool, total_thrd_blks, [&](ptrdiff_t block_idx) { - uint8_t zp_bytes[BitsTraits::kPackSize]; - std::fill_n(zp_bytes, BitsTraits::kPackSize, (uint8_t)8); + uint8_t zp_bytes[BitsTraits::kPackSize]; + std::fill_n(zp_bytes, BitsTraits::kPackSize, (uint8_t)8); const int32_t r_blk_idx = static_cast(block_idx / thrd_col_blks); const int32_t c_blk_idx = static_cast(block_idx % thrd_col_blks); @@ -490,7 +495,7 @@ struct BlockwiseQuantizer { const int meta_col = c / QuantBlk::kColumn; // compute scale and zero point - for (int kpack = 0; kpack < BitsTraits::kPackSize; kpack++) { + for (int kpack = 0; kpack < BitsTraits::kPackSize; kpack++) { // scan a single block to extract range [min, max] float min = std::numeric_limits::max(); @@ -509,9 +514,9 @@ struct BlockwiseQuantizer { if (row_start < row_end) { const int32_t meta_idx = meta_col * row_blks + meta_row + kpack; if (zero_points == nullptr) { - range2scale(min, max, scales[meta_idx]); + range2scale(min, max, scales[meta_idx]); } else { - range2scalezp(min, max, scales[meta_idx], zp_bytes[kpack]); + range2scalezp(min, max, scales[meta_idx], zp_bytes[kpack]); } } } @@ -533,7 +538,7 @@ struct BlockwiseQuantizer { const float v0 = static_cast(src[i * leadingDimension + j]); const uint8_t vi0 = (uint8_t)std::clamp(roundf(v0 * reciprocal_scale + zp), - 0.0f, BitsTraits::kMaxFp); + 0.0f, BitsTraits::kMaxFp); uint8_t vi1 = (uint8_t)zp; if (i + 1 < r_end) { @@ -545,7 +550,7 @@ struct BlockwiseQuantizer { } const float v1 = static_cast(src[(i + 1) * leadingDimension + j]); vi1 = (uint8_t)std::clamp(roundf(v1 * reciprocal_scale1 + zp1), 0.0f, - BitsTraits::kMaxFp); + BitsTraits::kMaxFp); } // !! 4b specific code @@ -644,14 +649,19 @@ struct BlockwiseQuantizer { * in memory are packed together, which means the packing is along the row. Quantized data * are stored in row major, so the output tensor reserves same shape, in terms of qbits type, * as the input tensor. - * @tparam Tin source data type, e.g. fp32/fp16 - * @tparam qbits number of bits in each quantized element + * If has zero points, quantized type is unsigned. Otherwise, quantized type is signed and the + * zero point is 0. + * The transposed outputs are used by MatMulNBits, so quant type becomes uint4 with default + * zp at 8. + * @tparam Tin source data type, e.g. 
fp32/fp16 + * @tparam qbits number of bits in each quantized element + * @tparam signed_quant quantized type is signed */ -template +template struct BlockwiseQDQQuantizer; -template -struct BlockwiseQDQQuantizer { +template +struct BlockwiseQDQQuantizer { static MLAS_FORCEINLINE uint8_t GetElem(uint8_t val, int32_t idx) { return (val >> (idx << 2)) & 0xF; @@ -663,9 +673,14 @@ struct BlockwiseQDQQuantizer { return ((val & 0xF) << shift) | (dst & (~(0xF << shift))); } + template static MLAS_FORCEINLINE uint8_t Pack(uint8_t v0, uint8_t v1) { - return (v0 & 0xF) | ((v1 & 0xF) << 4); + if constexpr (add8) { + return ((v0 & 0xF) ^ 8) | (((v1 & 0xF) ^ 8) << 4); + } else { + return (v0 & 0xF) | ((v1 & 0xF) << 4); + } } // If src is row major, then dst is column major. Transpose: @@ -680,10 +695,16 @@ struct BlockwiseQDQQuantizer { // --> // | dst0: low 4 bit | dst0: high 4 bit | // | dst1: low 4 bit | dst1: high 4 bit | + template static MLAS_FORCEINLINE void Transpose(uint8_t src0, uint8_t src1, uint8_t& dst0, uint8_t& dst1) { - dst0 = (src0 & 0xF) | ((src1 & 0xF) << 4); - dst1 = ((src0 & 0xF0) >> 4) | (src1 & 0xF0); + if constexpr (add8) { + dst0 = ((src0 & 0xF) ^ 8) | (((src1 & 0xF) ^ 8) << 4); + dst1 = (((src0 & 0xF0) ^ 0x80) >> 4) | ((src1 & 0xF0) ^ 0x80); + } else { + dst0 = (src0 & 0xF) | ((src1 & 0xF) << 4); + dst1 = ((src0 & 0xF0) >> 4) | (src1 & 0xF0); + } } static MLAS_FORCEINLINE uint8_t QuantizeV(Tin src, float reciprocal_scale, uint8_t zero_point) @@ -693,54 +714,12 @@ struct BlockwiseQDQQuantizer { static_cast( std::roundf(static_cast(src) * reciprocal_scale) ) + static_cast(zero_point), - 0, - BitsTraits<4>::kMax + BitsTraits<4, signed_quant>::kMin, + BitsTraits<4, signed_quant>::kMax ) ); } - /** - * @brief Quantize a matrix shape [rows, columns] row-wise. Scales and zero points are calculated. - * Quantized data are packed row-wise based on qbits. Quantized data are stored in row - * major, so the output tensor reserves the shape, in terms output type. - * Thread block is [1, quant_block_size * 2]. - * @param src the source matrix, row major: [rows * columns] - * @param scales the scales of quantized blocks, row major layout with shape: - * [rows * ceil(columns / quant_block_size)] - * @param zero_points the zero points of quantized blocks, packed. Same shape as scales - * in terms of output type. In terms of uint8_t, the shape is: - * [ceil(rows * ceil(columns / quant_block_size) * qbits / 8)] - * @param dst the quantized weights, row major: [rows * columns] in terms of - * output type. In terms of uint8_t, the shape is: [ceil(rows * columns * qbits / 8] - * @param rows number of rows in the source matrix - * @param columns number of columns in the source matrix, must satisfy - * ceil(columns / quant_block_size) % 2 == 0, so in each thread block, - * zero points are packed into one byte. - * @param quant_block_size number of elements quantized together. 
- * @param thread_pool thread pool for parallel processing - */ - static void QuantizeRowWise( - const Tin* src, - Tin* scales, - uint8_t* zero_points, - uint8_t* dst, - int32_t rows, - int32_t columns, - int32_t quant_block_size, - MLAS_THREADPOOL* thread_pool - ) - { - MLAS_UNREFERENCED_PARAMETER(src); - MLAS_UNREFERENCED_PARAMETER(scales); - MLAS_UNREFERENCED_PARAMETER(zero_points); - MLAS_UNREFERENCED_PARAMETER(dst); - MLAS_UNREFERENCED_PARAMETER(rows); - MLAS_UNREFERENCED_PARAMETER(columns); - MLAS_UNREFERENCED_PARAMETER(quant_block_size); - MLAS_UNREFERENCED_PARAMETER(thread_pool); - ORT_THROW("BlockwiseQDQQuantizer::BlockwiseQDQQuantizer is not implemented"); - } - /** * @brief Quantize a matrix shape [rows, columns] column-wise. Scales and zero points are calculated. * Quantized data are packed row-wise based on qbits. Quantized data are stored in row major @@ -769,6 +748,7 @@ struct BlockwiseQDQQuantizer { MLAS_THREADPOOL* thread_pool ) { + ORT_ENFORCE(zero_points || signed_quant, "Unsigned quant with no zero points is not supported."); // Must avoid multiple thread write to a single byte, which means the starting index // of a thread block must be even. To achieve that, we need to customize the thread // block size based on the parity of columns. @@ -815,6 +795,10 @@ struct BlockwiseQDQQuantizer { MLAS_THREADPOOL* thread_pool ) { + ORT_ENFORCE( + src_zero_points || signed_quant || dst_zero_points, + "Unsigned quant types without zero points must allocate zero points with value 0." + ); // Must avoid multiple thread write to a single byte, which means the starting index // of a thread block must be even. To achieve that, we need to customize the thread // block size based on the parity of columns. @@ -896,15 +880,15 @@ struct BlockwiseQDQQuantizer { // calculate scale and zero point, and store for (int32_t i = 0; i < col_size; i += 2) { - v0_tt = v1_tt = BitsTraits<4>::kMid; + v0_tt = v1_tt = BitsTraits<4, signed_quant>::kMid; if (zero_points) { - range2scalezp(vmin_t[i], vmax_t[i], scale0_tt, v0_tt); - range2scalezp(vmin_t[i + 1], vmax_t[i + 1], scale1_tt, v1_tt); - zero_points[(scale_idx + i) >> 1] = Pack(v0_tt, v1_tt); + range2scalezp(vmin_t[i], vmax_t[i], scale0_tt, v0_tt); + range2scalezp(vmin_t[i + 1], vmax_t[i + 1], scale1_tt, v1_tt); + zero_points[(scale_idx + i) >> 1] = Pack(v0_tt, v1_tt); } else { - range2scale(vmin_t[i], vmax_t[i], scale0_tt); - range2scale(vmin_t[i + 1], vmax_t[i + 1], scale1_tt); + range2scale(vmin_t[i], vmax_t[i], scale0_tt); + range2scale(vmin_t[i + 1], vmax_t[i + 1], scale1_tt); } scales[scale_idx + i] = scale0_tt; @@ -925,7 +909,7 @@ struct BlockwiseQDQQuantizer { for (int32_t i = 0; i < col_size; i += 2) { v0_tt = QuantizeV(src[input_idx_t + i], reciprocal_scale_t[i], zp_t[i]); v1_tt = QuantizeV(src[input_idx_t + i + 1], reciprocal_scale_t[i + 1], zp_t[i + 1]); - dst[(input_idx_t + i) >> 1] = Pack(v0_tt, v1_tt); + dst[(input_idx_t + i) >> 1] = Pack(v0_tt, v1_tt); } } } @@ -993,14 +977,14 @@ struct BlockwiseQDQQuantizer { int32_t col_idx = 0; // leading unailgned zero points if (scale_buffer_idx & 1) { - v0_tt = BitsTraits<4>::kMid; + v0_tt = BitsTraits<4, signed_quant>::kMid; if (zero_points) { - range2scalezp(vmin_t[0], vmax_t[0], scale0_tt, v0_tt); + range2scalezp(vmin_t[0], vmax_t[0], scale0_tt, v0_tt); zero_points[scale_buffer_idx >> 1] = SetElem( v0_tt, 1, zero_points[scale_buffer_idx >> 1] ); } else { - range2scale(vmin_t[0], vmax_t[0], scale0_tt); + range2scale(vmin_t[0], vmax_t[0], scale0_tt); } scales[scale_buffer_idx] = scale0_tt; @@ 
-1014,14 +998,16 @@ struct BlockwiseQDQQuantizer { } // aligned zero points for (; scale_buffer_idx < scale_buffer_idx_end - 1; col_idx += 2, scale_buffer_idx += 2) { - v0_tt = v1_tt = BitsTraits<4>::kMid; + v0_tt = v1_tt = BitsTraits<4, signed_quant>::kMid; if (zero_points) { - range2scalezp(vmin_t[col_idx], vmax_t[col_idx], scale0_tt, v0_tt); - range2scalezp(vmin_t[col_idx + 1], vmax_t[col_idx + 1], scale1_tt, v1_tt); - zero_points[scale_buffer_idx >> 1] = Pack(v0_tt, v1_tt); + range2scalezp(vmin_t[col_idx], vmax_t[col_idx], scale0_tt, v0_tt); + range2scalezp( + vmin_t[col_idx + 1], vmax_t[col_idx + 1], scale1_tt, v1_tt + ); + zero_points[scale_buffer_idx >> 1] = Pack(v0_tt, v1_tt); } else { - range2scale(vmin_t[col_idx], vmax_t[col_idx], scale0_tt); - range2scale(vmin_t[col_idx + 1], vmax_t[col_idx + 1], scale1_tt); + range2scale(vmin_t[col_idx], vmax_t[col_idx], scale0_tt); + range2scale(vmin_t[col_idx + 1], vmax_t[col_idx + 1], scale1_tt); } scales[scale_buffer_idx] = scale0_tt; @@ -1037,14 +1023,14 @@ struct BlockwiseQDQQuantizer { } // tailing unaligned elements if (scale_buffer_idx < scale_buffer_idx_end) { - v0_tt = BitsTraits<4>::kMid; + v0_tt = BitsTraits<4, signed_quant>::kMid; if (zero_points) { - range2scalezp(vmin_t[col_idx], vmax_t[col_idx], scale0_tt, v0_tt); + range2scalezp(vmin_t[col_idx], vmax_t[col_idx], scale0_tt, v0_tt); zero_points[scale_buffer_idx >> 1] = SetElem( v0_tt, 0, zero_points[scale_buffer_idx >> 1] ); } else { - range2scale(vmin_t[col_idx], vmax_t[col_idx], scale0_tt); + range2scale(vmin_t[col_idx], vmax_t[col_idx], scale0_tt); } scales[scale_buffer_idx] = scale0_tt; @@ -1078,7 +1064,7 @@ struct BlockwiseQDQQuantizer { src[input_idx_t_start + 1], reciprocal_scale_t[col_idx + 1], zp_t[col_idx + 1] ); - dst[input_idx_t_start >> 1] = Pack(v0_tt, v1_tt); + dst[input_idx_t_start >> 1] = Pack(v0_tt, v1_tt); } // tailing unaligned output if (input_idx_t_start < input_idx_t_end) { @@ -1144,7 +1130,7 @@ struct BlockwiseQDQQuantizer { src0_t = src_weights[src_idx]; src1_t = src_weights[src_idx + packed_col_size]; src_idx += packed_col_size + packed_col_size; - Transpose(src0_t, src1_t, dst0_t, dst1_t); + Transpose(src0_t, src1_t, dst0_t, dst1_t); dst_weights[dst_idx] = dst0_t; dst_weights[dst_idx + dstT_num_row] = dst1_t; } @@ -1152,7 +1138,7 @@ struct BlockwiseQDQQuantizer { if (src_idx < src_end_idx) { src0_t = src_weights[src_idx]; src1_t = 0; - Transpose(src0_t, src1_t, dst0_t, dst1_t); + Transpose(src0_t, src1_t, dst0_t, dst1_t); dst_weights[dst_idx] = dst0_t; dst_weights[dst_idx + dstT_num_row] = dst1_t; } @@ -1190,7 +1176,7 @@ struct BlockwiseQDQQuantizer { for (; src_idx < src_end_idx - packed_col_size; ++dst_idx) { src0_t = src_zero_points[src_idx]; src1_t = src_zero_points[src_idx + packed_col_size]; - Transpose(src0_t, src1_t, dst0_t, dst1_t); + Transpose(src0_t, src1_t, dst0_t, dst1_t); dst_zero_points[dst_idx] = dst0_t; dst_zero_points[dst_idx + dst_zp_row_num] = dst1_t; src_idx += packed_col_size + packed_col_size; @@ -1199,7 +1185,7 @@ struct BlockwiseQDQQuantizer { if (src_idx < src_end_idx) { src0_t = src_zero_points[src_idx]; src1_t = 0; - Transpose(src0_t, src1_t, dst0_t, dst1_t); + Transpose(src0_t, src1_t, dst0_t, dst1_t); dst_zero_points[dst_idx] = dst0_t; dst_zero_points[dst_idx + dst_zp_row_num] = dst1_t; } @@ -1247,13 +1233,13 @@ struct BlockwiseQDQQuantizer { for (; src_idx < src_end_idx - columns; ++dst_idx) { src0_t = GetElem(src_weights[src_idx >> 1], src_idx & 1); src1_t = GetElem(src_weights[(src_idx + columns) >> 1], (src_idx + 
columns) & 1); - dst_weights[dst_idx] = (src0_t & 0xf) | ((src1_t & 0xf) << 4); + dst_weights[dst_idx] = Pack(src0_t, src1_t); src_idx += columns + columns; } if (src_idx < src_end_idx) { src0_t = GetElem(src_weights[src_idx >> 1], src_idx & 1); - dst_weights[dst_idx] = src0_t & 0xf; + dst_weights[dst_idx] = Pack(src0_t, 0); } } ); @@ -1288,13 +1274,13 @@ struct BlockwiseQDQQuantizer { for (; src_idx < src_end_idx - columns; ++dst_idx) { src0_t = GetElem(src_zero_points[src_idx >> 1], src_idx & 1); src1_t = GetElem(src_zero_points[(src_idx + columns) >> 1], (src_idx + columns) & 1); - dst_zero_points[dst_idx] = (src0_t & 0xf) | ((src1_t & 0xf) << 4); + dst_zero_points[dst_idx] = Pack(src0_t, src1_t); src_idx += columns + columns; } if (src_idx < src_end_idx) { src0_t = GetElem(src_zero_points[src_idx >> 1], src_idx & 1); - dst_zero_points[dst_idx] = src0_t & 0xf; + dst_zero_points[dst_idx] = Pack(src0_t, 0); } } ); @@ -1745,7 +1731,7 @@ MlasDequantizeBlockwise( ); template -void +bool MlasQDQQuantizeBlockwise( const Tin* src, Tin* scales, @@ -1759,17 +1745,23 @@ MlasQDQQuantizeBlockwise( ) { if (columnwise) { - BlockwiseQDQQuantizer::QuantizeColumnWise( - src, scales, zero_points, dst, rows, columns, quant_block_size, thread_pool - ); + if (zero_points) { + BlockwiseQDQQuantizer::QuantizeColumnWise( + src, scales, zero_points, dst, rows, columns, quant_block_size, thread_pool + ); + return false; + } else { + BlockwiseQDQQuantizer::QuantizeColumnWise( + src, scales, zero_points, dst, rows, columns, quant_block_size, thread_pool + ); + return true; + } } else { - BlockwiseQDQQuantizer::QuantizeRowWise( - src, scales, zero_points, dst, rows, columns, quant_block_size, thread_pool - ); + ORT_THROW("Row-wise MlasQDQQuantizeBlockwise is not implemented"); } } -template void +template bool MlasQDQQuantizeBlockwise( const float* src, float* scales, @@ -1782,7 +1774,7 @@ MlasQDQQuantizeBlockwise( MLAS_THREADPOOL* thread_pool ); -template void +template bool MlasQDQQuantizeBlockwise( const MLAS_FP16* src, MLAS_FP16* scales, @@ -1795,7 +1787,7 @@ MlasQDQQuantizeBlockwise( MLAS_THREADPOOL* thread_pool ); -template +template void MlasQDQTransposeBlockwiseQuantized( const uint8_t* src_weights, @@ -1812,7 +1804,7 @@ MlasQDQTransposeBlockwiseQuantized( ) { if (columnwise) { - BlockwiseQDQQuantizer::TransposeColumnWiseQuantized( + BlockwiseQDQQuantizer::TransposeColumnWiseQuantized( src_weights, src_scales, src_zero_points, dst_weights, dst_scales, dst_zero_points, rows, columns, quant_block_size, thread_pool ); @@ -1822,7 +1814,22 @@ MlasQDQTransposeBlockwiseQuantized( } template void -MlasQDQTransposeBlockwiseQuantized( +MlasQDQTransposeBlockwiseQuantized( + const uint8_t* src_weights, + const float* src_scales, + const uint8_t* src_zero_points, + uint8_t* dst_weights, + float* dst_scales, + uint8_t* dst_zero_points, + bool columnwise, + int rows, + int columns, + int quant_block_size, + MLAS_THREADPOOL* thread_pool +); + +template void +MlasQDQTransposeBlockwiseQuantized( const uint8_t* src_weights, const float* src_scales, const uint8_t* src_zero_points, @@ -1837,7 +1844,22 @@ MlasQDQTransposeBlockwiseQuantized( ); template void -MlasQDQTransposeBlockwiseQuantized( +MlasQDQTransposeBlockwiseQuantized( + const uint8_t* src_weights, + const MLAS_FP16* src_scales, + const uint8_t* src_zero_points, + uint8_t* dst_weights, + MLAS_FP16* dst_scales, + uint8_t* dst_zero_points, + bool columnwise, + int rows, + int columns, + int quant_block_size, + MLAS_THREADPOOL* thread_pool +); + +template void 
+MlasQDQTransposeBlockwiseQuantized( const uint8_t* src_weights, const MLAS_FP16* src_scales, const uint8_t* src_zero_points, diff --git a/onnxruntime/core/mlas/lib/qgemm.h b/onnxruntime/core/mlas/lib/qgemm.h index 75c17a6b5a177..127aea9029b65 100644 --- a/onnxruntime/core/mlas/lib/qgemm.h +++ b/onnxruntime/core/mlas/lib/qgemm.h @@ -894,7 +894,7 @@ MlasGemmQuantGetDispatch( if (!AIsSigned) { GemmQuantDispatch = &MlasGemmU8X8DispatchWasmSimd; } -#elif defined(MLAS_TARGET_POWER) && defined(__linux__) && defined(POWER10) && \ +#elif defined(MLAS_TARGET_POWER) && (defined(__linux__) || defined(_AIX)) && defined(POWER10) && \ ((defined(__GNUC__) && ((__GNUC__ > 10) || (__GNUC__== 10 && __GNUC_MINOR__ >= 2))) || \ (defined(__clang__) && (__clang_major__ >= 12))) if (GetMlasPlatform().GemmU8X8Dispatch == &MlasGemm8X8DispatchPOWER10) { diff --git a/onnxruntime/core/mlas/lib/qlmul.cpp b/onnxruntime/core/mlas/lib/qlmul.cpp index 38818e1190d21..4a6d57db0d211 100644 --- a/onnxruntime/core/mlas/lib/qlmul.cpp +++ b/onnxruntime/core/mlas/lib/qlmul.cpp @@ -325,12 +325,20 @@ MlasQLinearMulKernel( } while (N >= 4) { - __vector int32_t IntegerAVector {InputA[0], InputA[1], InputA[2], InputA[3]}; +#if defined(_AIX) && defined(__clang__) + __vector int IntegerAVector {InputA[0], InputA[1], InputA[2], InputA[3]}; +#else + __vector int32_t IntegerAVector {InputA[0], InputA[1], InputA[2], InputA[3]}; +#endif auto IntegerVector = vec_sub(IntegerAVector, ZeroPointAVector); auto ValueAVector = vec_mul(ScaleAVector, vec_ctf(IntegerVector, 0)); if (!IsScalarB) { - __vector int32_t IntegerBVector {InputB[0], InputB[1], InputB[2], InputB[3]}; +#if defined(_AIX) && defined(__clang__) + __vector int IntegerBVector {InputB[0], InputB[1], InputB[2], InputB[3]}; +#else + __vector int32_t IntegerBVector {InputB[0], InputB[1], InputB[2], InputB[3]}; +#endif IntegerVector = vec_sub(IntegerBVector, ZeroPointBVector); ValueBVector = vec_mul(ScaleBVector, vec_ctf(IntegerVector, 0)); } diff --git a/onnxruntime/core/mlas/lib/sqnbitgemm_kernel_neon_int8.cpp b/onnxruntime/core/mlas/lib/sqnbitgemm_kernel_neon_int8.cpp index db3b9ee656592..ec5cdbc75220a 100644 --- a/onnxruntime/core/mlas/lib/sqnbitgemm_kernel_neon_int8.cpp +++ b/onnxruntime/core/mlas/lib/sqnbitgemm_kernel_neon_int8.cpp @@ -155,7 +155,7 @@ namespace template MLAS_FORCEINLINE void -SQ4BitGemm_CompInt8_Compute2x2_BlkLen16( +SQ4BitGemm_CompInt8_Compute4x2_BlkLen16( const std::byte* QuantARowPtr, const std::byte* QuantBDataColPtr, const float* QuantBScaleColPtr, @@ -177,11 +177,13 @@ SQ4BitGemm_CompInt8_Compute2x2_BlkLen16( const float* QuantBScalePtr = QuantBScaleColPtr; const std::byte* QuantBZeroPointPtr = QuantBZeroPointColPtr; - float32x4_t acc00{}, acc01{}, acc10{}, acc11{}; + float32x4_t acc00{}, acc01{}, acc10{}, acc11{}, acc20{}, acc21{}, acc30{}, acc31{}; for (size_t k_blk_idx = 0; k_blk_idx < BlockCountK; ++k_blk_idx) { const std::byte* QuantABlkRow0 = QuantAPtr; const std::byte* QuantABlkRow1 = QuantAPtr + StrideQuantA; + const std::byte* QuantABlkRow2 = QuantAPtr + StrideQuantA * 2; + const std::byte* QuantABlkRow3 = QuantAPtr + StrideQuantA * 3; const float QuantBScaleCol0 = *QuantBScalePtr; const float QuantBScaleCol1 = *(QuantBScalePtr + StrideQuantBScale); @@ -191,6 +193,10 @@ SQ4BitGemm_CompInt8_Compute2x2_BlkLen16( const float scale01 = Q8BlkScale(QuantABlkRow0) * QuantBScaleCol1; const float scale10 = Q8BlkScale(QuantABlkRow1) * QuantBScaleCol0; const float scale11 = Q8BlkScale(QuantABlkRow1) * QuantBScaleCol1; + const float scale20 = 
Q8BlkScale(QuantABlkRow2) * QuantBScaleCol0; + const float scale21 = Q8BlkScale(QuantABlkRow2) * QuantBScaleCol1; + const float scale30 = Q8BlkScale(QuantABlkRow3) * QuantBScaleCol0; + const float scale31 = Q8BlkScale(QuantABlkRow3) * QuantBScaleCol1; // load B zero point int8_t bzp_col0; @@ -212,13 +218,11 @@ SQ4BitGemm_CompInt8_Compute2x2_BlkLen16( const int8_t* QuantADataPtrRow0 = Q8BlkData(QuantABlkRow0); const int8_t* QuantADataPtrRow1 = Q8BlkData(QuantABlkRow1); + const int8_t* QuantADataPtrRow2 = Q8BlkData(QuantABlkRow2); + const int8_t* QuantADataPtrRow3 = Q8BlkData(QuantABlkRow3); // TODO handling only 16 elements per accumulator at a time here, probably can do better { - // load A - const int8x16_t av_row0 = vld1q_s8(QuantADataPtrRow0 + 0); - const int8x16_t av_row1 = vld1q_s8(QuantADataPtrRow1 + 0); - // load B const uint8x8_t bv_packed_col0 = vld1_u8(reinterpret_cast(QuantBDataPtr)); const uint8x8_t bv_packed_col1 = vld1_u8(reinterpret_cast(QuantBDataPtr) + StrideQuantBData); @@ -242,24 +246,55 @@ SQ4BitGemm_CompInt8_Compute2x2_BlkLen16( bv_col0 = vsubq_s8(bv_col0, vdupq_n_s8(bzp_col0)); bv_col1 = vsubq_s8(bv_col1, vdupq_n_s8(bzp_col1)); - // quantized dot product - int32x4_t dot00{}, dot01{}, dot10{}, dot11{}; - dot00 = vdotq_s32(dot00, av_row0, bv_col0); - dot01 = vdotq_s32(dot01, av_row0, bv_col1); - dot10 = vdotq_s32(dot10, av_row1, bv_col0); - dot11 = vdotq_s32(dot11, av_row1, bv_col1); - - // convert to float - const float32x4_t dot_f32_00 = vcvtq_f32_s32(dot00); - const float32x4_t dot_f32_01 = vcvtq_f32_s32(dot01); - const float32x4_t dot_f32_10 = vcvtq_f32_s32(dot10); - const float32x4_t dot_f32_11 = vcvtq_f32_s32(dot11); + // rows 0 and 1 of A + { + // load A + const int8x16_t av_row0 = vld1q_s8(QuantADataPtrRow0 + 0); + const int8x16_t av_row1 = vld1q_s8(QuantADataPtrRow1 + 0); + + // quantized dot product + const int32x4_t dot00 = vdotq_s32(int32x4_t{}, av_row0, bv_col0); + const int32x4_t dot01 = vdotq_s32(int32x4_t{}, av_row0, bv_col1); + const int32x4_t dot10 = vdotq_s32(int32x4_t{}, av_row1, bv_col0); + const int32x4_t dot11 = vdotq_s32(int32x4_t{}, av_row1, bv_col1); + + // convert to float + const float32x4_t dot_f32_00 = vcvtq_f32_s32(dot00); + const float32x4_t dot_f32_01 = vcvtq_f32_s32(dot01); + const float32x4_t dot_f32_10 = vcvtq_f32_s32(dot10); + const float32x4_t dot_f32_11 = vcvtq_f32_s32(dot11); + + // multiply by scale and update accumulator + acc00 = vfmaq_f32(acc00, dot_f32_00, vdupq_n_f32(scale00)); + acc01 = vfmaq_f32(acc01, dot_f32_01, vdupq_n_f32(scale01)); + acc10 = vfmaq_f32(acc10, dot_f32_10, vdupq_n_f32(scale10)); + acc11 = vfmaq_f32(acc11, dot_f32_11, vdupq_n_f32(scale11)); + } - // multiply by scale and update accumulator - acc00 = vfmaq_f32(acc00, dot_f32_00, vdupq_n_f32(scale00)); - acc01 = vfmaq_f32(acc01, dot_f32_01, vdupq_n_f32(scale01)); - acc10 = vfmaq_f32(acc10, dot_f32_10, vdupq_n_f32(scale10)); - acc11 = vfmaq_f32(acc11, dot_f32_11, vdupq_n_f32(scale11)); + // rows 2 and 3 of A + { + // load A + const int8x16_t av_row2 = vld1q_s8(QuantADataPtrRow2 + 0); + const int8x16_t av_row3 = vld1q_s8(QuantADataPtrRow3 + 0); + + // quantized dot product + const int32x4_t dot20 = vdotq_s32(int32x4_t{}, av_row2, bv_col0); + const int32x4_t dot21 = vdotq_s32(int32x4_t{}, av_row2, bv_col1); + const int32x4_t dot30 = vdotq_s32(int32x4_t{}, av_row3, bv_col0); + const int32x4_t dot31 = vdotq_s32(int32x4_t{}, av_row3, bv_col1); + + // convert to float + const float32x4_t dot_f32_20 = vcvtq_f32_s32(dot20); + const float32x4_t dot_f32_21 = 
vcvtq_f32_s32(dot21); + const float32x4_t dot_f32_30 = vcvtq_f32_s32(dot30); + const float32x4_t dot_f32_31 = vcvtq_f32_s32(dot31); + + // multiply by scale and update accumulator + acc20 = vfmaq_f32(acc20, dot_f32_20, vdupq_n_f32(scale20)); + acc21 = vfmaq_f32(acc21, dot_f32_21, vdupq_n_f32(scale21)); + acc30 = vfmaq_f32(acc30, dot_f32_30, vdupq_n_f32(scale30)); + acc31 = vfmaq_f32(acc31, dot_f32_31, vdupq_n_f32(scale31)); + } } // increment block pointers @@ -273,22 +308,30 @@ SQ4BitGemm_CompInt8_Compute2x2_BlkLen16( } } - SumPtr[0] = vaddvq_f32(acc00); - SumPtr[1] = vaddvq_f32(acc01); - SumPtr[ldc + 0] = vaddvq_f32(acc10); - SumPtr[ldc + 1] = vaddvq_f32(acc11); + SumPtr[ldc * 0 + 0] = vaddvq_f32(acc00); + SumPtr[ldc * 0 + 1] = vaddvq_f32(acc01); + SumPtr[ldc * 1 + 0] = vaddvq_f32(acc10); + SumPtr[ldc * 1 + 1] = vaddvq_f32(acc11); + SumPtr[ldc * 2 + 0] = vaddvq_f32(acc20); + SumPtr[ldc * 2 + 1] = vaddvq_f32(acc21); + SumPtr[ldc * 3 + 0] = vaddvq_f32(acc30); + SumPtr[ldc * 3 + 1] = vaddvq_f32(acc31); if (BiasPtr != nullptr) { - SumPtr[0] += BiasPtr[0]; - SumPtr[1] += BiasPtr[1]; - SumPtr[ldc + 0] += BiasPtr[0]; - SumPtr[ldc + 1] += BiasPtr[1]; + SumPtr[ldc * 0 + 0] += BiasPtr[0]; + SumPtr[ldc * 0 + 1] += BiasPtr[1]; + SumPtr[ldc * 1 + 0] += BiasPtr[0]; + SumPtr[ldc * 1 + 1] += BiasPtr[1]; + SumPtr[ldc * 2 + 0] += BiasPtr[0]; + SumPtr[ldc * 2 + 1] += BiasPtr[1]; + SumPtr[ldc * 3 + 0] += BiasPtr[0]; + SumPtr[ldc * 3 + 1] += BiasPtr[1]; } } template MLAS_FORCEINLINE void -SQ4BitGemm_CompInt8_Compute2x2_BlkLenGreaterThan16( +SQ4BitGemm_CompInt8_Compute4x2_BlkLenGreaterThan16( size_t BlkLen, const std::byte* QuantARowPtr, const std::byte* QuantBDataColPtr, @@ -312,11 +355,13 @@ SQ4BitGemm_CompInt8_Compute2x2_BlkLenGreaterThan16( const float* QuantBScalePtr = QuantBScaleColPtr; const std::byte* QuantBZeroPointPtr = QuantBZeroPointColPtr; - float32x4_t acc00{}, acc01{}, acc10{}, acc11{}; + float32x4_t acc00{}, acc01{}, acc10{}, acc11{}, acc20{}, acc21{}, acc30{}, acc31{}; for (size_t k_blk_idx = 0; k_blk_idx < BlockCountK; ++k_blk_idx) { const std::byte* QuantABlkRow0 = QuantAPtr; const std::byte* QuantABlkRow1 = QuantAPtr + StrideQuantA; + const std::byte* QuantABlkRow2 = QuantAPtr + StrideQuantA * 2; + const std::byte* QuantABlkRow3 = QuantAPtr + StrideQuantA * 3; const float QuantBScaleCol0 = *QuantBScalePtr; const float QuantBScaleCol1 = *(QuantBScalePtr + StrideQuantBScale); @@ -326,6 +371,10 @@ SQ4BitGemm_CompInt8_Compute2x2_BlkLenGreaterThan16( const float scale01 = Q8BlkScale(QuantABlkRow0) * QuantBScaleCol1; const float scale10 = Q8BlkScale(QuantABlkRow1) * QuantBScaleCol0; const float scale11 = Q8BlkScale(QuantABlkRow1) * QuantBScaleCol1; + const float scale20 = Q8BlkScale(QuantABlkRow2) * QuantBScaleCol0; + const float scale21 = Q8BlkScale(QuantABlkRow2) * QuantBScaleCol1; + const float scale30 = Q8BlkScale(QuantABlkRow3) * QuantBScaleCol0; + const float scale31 = Q8BlkScale(QuantABlkRow3) * QuantBScaleCol1; // load B zero point int8_t bzp_col0; @@ -347,14 +396,10 @@ SQ4BitGemm_CompInt8_Compute2x2_BlkLenGreaterThan16( const int8_t* QuantADataPtrRow0 = Q8BlkData(QuantABlkRow0); const int8_t* QuantADataPtrRow1 = Q8BlkData(QuantABlkRow1); + const int8_t* QuantADataPtrRow2 = Q8BlkData(QuantABlkRow2); + const int8_t* QuantADataPtrRow3 = Q8BlkData(QuantABlkRow3); for (size_t sub_blk_idx = 0; sub_blk_idx < SubBlksPerBlk; ++sub_blk_idx) { - // load A - const int8x16_t av_row0_0 = vld1q_s8(QuantADataPtrRow0 + 0); - const int8x16_t av_row0_1 = vld1q_s8(QuantADataPtrRow0 + 16); - const int8x16_t 
av_row1_0 = vld1q_s8(QuantADataPtrRow1 + 0); - const int8x16_t av_row1_1 = vld1q_s8(QuantADataPtrRow1 + 16); - // load B const uint8x16_t bv_packed_col0 = vld1q_u8(reinterpret_cast(QuantBDataPtr)); const uint8x16_t bv_packed_col1 = vld1q_u8(reinterpret_cast(QuantBDataPtr) + StrideQuantBData); @@ -372,28 +417,65 @@ SQ4BitGemm_CompInt8_Compute2x2_BlkLenGreaterThan16( bv_col1_0 = vsubq_s8(bv_col1_0, vdupq_n_s8(bzp_col1)); bv_col1_1 = vsubq_s8(bv_col1_1, vdupq_n_s8(bzp_col1)); - // quantized dot product - int32x4_t dot00{}, dot01{}, dot10{}, dot11{}; - dot00 = vdotq_s32(vdotq_s32(dot00, av_row0_0, bv_col0_0), av_row0_1, bv_col0_1); - dot01 = vdotq_s32(vdotq_s32(dot01, av_row0_0, bv_col1_0), av_row0_1, bv_col1_1); - dot10 = vdotq_s32(vdotq_s32(dot10, av_row1_0, bv_col0_0), av_row1_1, bv_col0_1); - dot11 = vdotq_s32(vdotq_s32(dot11, av_row1_0, bv_col1_0), av_row1_1, bv_col1_1); - - // convert to float - const float32x4_t dot_f32_00 = vcvtq_f32_s32(dot00); - const float32x4_t dot_f32_01 = vcvtq_f32_s32(dot01); - const float32x4_t dot_f32_10 = vcvtq_f32_s32(dot10); - const float32x4_t dot_f32_11 = vcvtq_f32_s32(dot11); + // rows 0 and 1 of A + { + // load A + const int8x16_t av_row0_0 = vld1q_s8(QuantADataPtrRow0 + 0); + const int8x16_t av_row0_1 = vld1q_s8(QuantADataPtrRow0 + 16); + const int8x16_t av_row1_0 = vld1q_s8(QuantADataPtrRow1 + 0); + const int8x16_t av_row1_1 = vld1q_s8(QuantADataPtrRow1 + 16); + + // quantized dot product + const int32x4_t dot00 = vdotq_s32(vdotq_s32(int32x4_t{}, av_row0_0, bv_col0_0), av_row0_1, bv_col0_1); + const int32x4_t dot01 = vdotq_s32(vdotq_s32(int32x4_t{}, av_row0_0, bv_col1_0), av_row0_1, bv_col1_1); + const int32x4_t dot10 = vdotq_s32(vdotq_s32(int32x4_t{}, av_row1_0, bv_col0_0), av_row1_1, bv_col0_1); + const int32x4_t dot11 = vdotq_s32(vdotq_s32(int32x4_t{}, av_row1_0, bv_col1_0), av_row1_1, bv_col1_1); + + // convert to float + const float32x4_t dot_f32_00 = vcvtq_f32_s32(dot00); + const float32x4_t dot_f32_01 = vcvtq_f32_s32(dot01); + const float32x4_t dot_f32_10 = vcvtq_f32_s32(dot10); + const float32x4_t dot_f32_11 = vcvtq_f32_s32(dot11); + + // multiply by scale and update accumulator + acc00 = vfmaq_f32(acc00, dot_f32_00, vdupq_n_f32(scale00)); + acc01 = vfmaq_f32(acc01, dot_f32_01, vdupq_n_f32(scale01)); + acc10 = vfmaq_f32(acc10, dot_f32_10, vdupq_n_f32(scale10)); + acc11 = vfmaq_f32(acc11, dot_f32_11, vdupq_n_f32(scale11)); + } - // multiply by scale and update accumulator - acc00 = vfmaq_f32(acc00, dot_f32_00, vdupq_n_f32(scale00)); - acc01 = vfmaq_f32(acc01, dot_f32_01, vdupq_n_f32(scale01)); - acc10 = vfmaq_f32(acc10, dot_f32_10, vdupq_n_f32(scale10)); - acc11 = vfmaq_f32(acc11, dot_f32_11, vdupq_n_f32(scale11)); + // rows 2 and 3 of A + { + // load A + const int8x16_t av_row2_0 = vld1q_s8(QuantADataPtrRow2 + 0); + const int8x16_t av_row2_1 = vld1q_s8(QuantADataPtrRow2 + 16); + const int8x16_t av_row3_0 = vld1q_s8(QuantADataPtrRow3 + 0); + const int8x16_t av_row3_1 = vld1q_s8(QuantADataPtrRow3 + 16); + + // quantized dot product + const int32x4_t dot20 = vdotq_s32(vdotq_s32(int32x4_t{}, av_row2_0, bv_col0_0), av_row2_1, bv_col0_1); + const int32x4_t dot21 = vdotq_s32(vdotq_s32(int32x4_t{}, av_row2_0, bv_col1_0), av_row2_1, bv_col1_1); + const int32x4_t dot30 = vdotq_s32(vdotq_s32(int32x4_t{}, av_row3_0, bv_col0_0), av_row3_1, bv_col0_1); + const int32x4_t dot31 = vdotq_s32(vdotq_s32(int32x4_t{}, av_row3_0, bv_col1_0), av_row3_1, bv_col1_1); + + // convert to float + const float32x4_t dot_f32_20 = vcvtq_f32_s32(dot20); + const float32x4_t 
dot_f32_21 = vcvtq_f32_s32(dot21); + const float32x4_t dot_f32_30 = vcvtq_f32_s32(dot30); + const float32x4_t dot_f32_31 = vcvtq_f32_s32(dot31); + + // multiply by scale and update accumulator + acc20 = vfmaq_f32(acc20, dot_f32_20, vdupq_n_f32(scale20)); + acc21 = vfmaq_f32(acc21, dot_f32_21, vdupq_n_f32(scale21)); + acc30 = vfmaq_f32(acc30, dot_f32_30, vdupq_n_f32(scale30)); + acc31 = vfmaq_f32(acc31, dot_f32_31, vdupq_n_f32(scale31)); + } // increment block data pointers to next sub-block QuantADataPtrRow0 += 32; QuantADataPtrRow1 += 32; + QuantADataPtrRow2 += 32; + QuantADataPtrRow3 += 32; QuantBDataPtr += 16; } @@ -407,16 +489,24 @@ SQ4BitGemm_CompInt8_Compute2x2_BlkLenGreaterThan16( } } - SumPtr[0] = vaddvq_f32(acc00); - SumPtr[1] = vaddvq_f32(acc01); - SumPtr[ldc + 0] = vaddvq_f32(acc10); - SumPtr[ldc + 1] = vaddvq_f32(acc11); + SumPtr[ldc * 0 + 0] = vaddvq_f32(acc00); + SumPtr[ldc * 0 + 1] = vaddvq_f32(acc01); + SumPtr[ldc * 1 + 0] = vaddvq_f32(acc10); + SumPtr[ldc * 1 + 1] = vaddvq_f32(acc11); + SumPtr[ldc * 2 + 0] = vaddvq_f32(acc20); + SumPtr[ldc * 2 + 1] = vaddvq_f32(acc21); + SumPtr[ldc * 3 + 0] = vaddvq_f32(acc30); + SumPtr[ldc * 3 + 1] = vaddvq_f32(acc31); if (BiasPtr != nullptr) { - SumPtr[0] += BiasPtr[0]; - SumPtr[1] += BiasPtr[1]; - SumPtr[ldc + 0] += BiasPtr[0]; - SumPtr[ldc + 1] += BiasPtr[1]; + SumPtr[ldc * 0 + 0] += BiasPtr[0]; + SumPtr[ldc * 0 + 1] += BiasPtr[1]; + SumPtr[ldc * 1 + 0] += BiasPtr[0]; + SumPtr[ldc * 1 + 1] += BiasPtr[1]; + SumPtr[ldc * 2 + 0] += BiasPtr[0]; + SumPtr[ldc * 2 + 1] += BiasPtr[1]; + SumPtr[ldc * 3 + 0] += BiasPtr[0]; + SumPtr[ldc * 3 + 1] += BiasPtr[1]; } } @@ -478,8 +568,8 @@ SQ4BitGemm_CompInt8_Compute1x1_BlkLen16( bv1 = vsubq_s8(bv1, bzp1); // quantized dot product - const int32x4_t dot0 = vdotq_s32(vdupq_n_s32(0), av0, bv0); - const int32x4_t dot1 = vdotq_s32(vdupq_n_s32(0), av1, bv1); + const int32x4_t dot0 = vdotq_s32(int32x4_t{}, av0, bv0); + const int32x4_t dot1 = vdotq_s32(int32x4_t{}, av1, bv1); // convert to float const float32x4_t dot_f32_0 = vcvtq_f32_s32(dot0); @@ -527,7 +617,7 @@ SQ4BitGemm_CompInt8_Compute1x1_BlkLen16( bv0 = vsubq_s8(bv0, bzp0); // quantized dot product - const int32x4_t dot0 = vdotq_s32(vdupq_n_s32(0), av0, bv0); + const int32x4_t dot0 = vdotq_s32(int32x4_t{}, av0, bv0); // convert to float const float32x4_t dot_f32_0 = vcvtq_f32_s32(dot0); @@ -604,9 +694,8 @@ SQ4BitGemm_CompInt8_Compute1x1_BlkLen32( bv_hi1 = vsubq_s8(bv_hi1, bzp1); // quantized dot product - int32x4_t dot0{}, dot1{}; - dot0 = vdotq_s32(vdotq_s32(dot0, av_lo0, bv_lo0), av_hi0, bv_hi0); - dot1 = vdotq_s32(vdotq_s32(dot1, av_lo1, bv_lo1), av_hi1, bv_hi1); + const int32x4_t dot0 = vdotq_s32(vdotq_s32(int32x4_t{}, av_lo0, bv_lo0), av_hi0, bv_hi0); + const int32x4_t dot1 = vdotq_s32(vdotq_s32(int32x4_t{}, av_lo1, bv_lo1), av_hi1, bv_hi1); // convert to float const float32x4_t dot_f32_0 = vcvtq_f32_s32(dot0); @@ -652,8 +741,7 @@ SQ4BitGemm_CompInt8_Compute1x1_BlkLen32( bv_hi0 = vsubq_s8(bv_hi0, bzp0); // quantized dot product - int32x4_t dot0{}; - dot0 = vdotq_s32(vdotq_s32(dot0, av_lo0, bv_lo0), av_hi0, bv_hi0); + const int32x4_t dot0 = vdotq_s32(vdotq_s32(int32x4_t{}, av_lo0, bv_lo0), av_hi0, bv_hi0); // convert to float const float32x4_t dot_f32_0 = vcvtq_f32_s32(dot0); @@ -736,9 +824,8 @@ SQ4BitGemm_CompInt8_Compute1x1_BlkLenGreaterThan32( bv3 = vsubq_s8(bv3, bzp); // quantized dot product - int32x4_t dot0{}, dot1{}; - dot0 = vdotq_s32(vdotq_s32(dot0, av0, bv0), av1, bv1); - dot1 = vdotq_s32(vdotq_s32(dot1, av2, bv2), av3, bv3); + const 
int32x4_t dot0 = vdotq_s32(vdotq_s32(int32x4_t{}, av0, bv0), av1, bv1); + const int32x4_t dot1 = vdotq_s32(vdotq_s32(int32x4_t{}, av2, bv2), av3, bv3); // convert to float const float32x4_t dot_f32_0 = vcvtq_f32_s32(dot0); @@ -834,7 +921,7 @@ SQ4BitGemmKernel_CompInt8_BlkLen16( float* SumRowPtr = C; size_t m_remaining = CountM; - while (m_remaining > 1) { + while (m_remaining > 3) { const std::byte* QuantBDataColPtr = QuantBData; const float* QuantBScaleColPtr = QuantBScale; const std::byte* QuantBZeroPointColPtr = QuantBZeroPoint; @@ -845,8 +932,8 @@ SQ4BitGemmKernel_CompInt8_BlkLen16( size_t n_remaining = CountN; while (n_remaining > 1) { - // Compute 2x2 tiles of output - SQ4BitGemm_CompInt8_Compute2x2_BlkLen16( + // Compute 4x2 tiles of output + SQ4BitGemm_CompInt8_Compute4x2_BlkLen16( QuantARowPtr, QuantBDataColPtr, QuantBScaleColPtr, @@ -871,38 +958,30 @@ SQ4BitGemmKernel_CompInt8_BlkLen16( } if (n_remaining > 0) { - // Compute last 2x1 tile of output - SQ4BitGemm_CompInt8_Compute1x1_BlkLen16( - QuantARowPtr, - QuantBDataColPtr, - QuantBScaleColPtr, - QuantBZeroPointColPtr, - BiasPtr, - SumPtr, - BlockCountK - ); - - SQ4BitGemm_CompInt8_Compute1x1_BlkLen16( - QuantARowPtr + StrideQuantA, - QuantBDataColPtr, - QuantBScaleColPtr, - QuantBZeroPointColPtr, - BiasPtr, - SumPtr + ldc, - BlockCountK - ); + // Compute last 4x1 tile of output + for (size_t i = 0; i < 4; ++i) { + SQ4BitGemm_CompInt8_Compute1x1_BlkLen16( + QuantARowPtr + StrideQuantA * i, + QuantBDataColPtr, + QuantBScaleColPtr, + QuantBZeroPointColPtr, + BiasPtr, + SumPtr + ldc * i, + BlockCountK + ); + } } - // Move to next 2 rows - AdvanceRowPtrs<2>( + // Move to next 4 rows + AdvanceRowPtrs<4>( StrideQuantA, ldc, QuantARowPtr, SumRowPtr ); - m_remaining -= 2; + m_remaining -= 4; } - if (m_remaining > 0) { + while (m_remaining > 0) { const std::byte* QuantBDataColPtr = QuantBData; const float* QuantBScaleColPtr = QuantBScale; const std::byte* QuantBZeroPointColPtr = QuantBZeroPoint; @@ -932,6 +1011,14 @@ SQ4BitGemmKernel_CompInt8_BlkLen16( n_remaining -= 1; } + + // Move to next row + AdvanceRowPtrs<1>( + StrideQuantA, ldc, + QuantARowPtr, SumRowPtr + ); + + m_remaining -= 1; } } @@ -964,7 +1051,7 @@ SQ4BitGemmKernel_CompInt8_BlkLen32( float* SumRowPtr = C; size_t m_remaining = CountM; - while (m_remaining > 1) { + while (m_remaining > 3) { const std::byte* QuantBDataColPtr = QuantBData; const float* QuantBScaleColPtr = QuantBScale; const std::byte* QuantBZeroPointColPtr = QuantBZeroPoint; @@ -975,8 +1062,8 @@ SQ4BitGemmKernel_CompInt8_BlkLen32( size_t n_remaining = CountN; while (n_remaining > 1) { - // Compute 2x2 tiles of output - SQ4BitGemm_CompInt8_Compute2x2_BlkLenGreaterThan16( + // Compute 4x2 tiles of output + SQ4BitGemm_CompInt8_Compute4x2_BlkLenGreaterThan16( BlkLen, QuantARowPtr, QuantBDataColPtr, @@ -1002,38 +1089,30 @@ SQ4BitGemmKernel_CompInt8_BlkLen32( } if (n_remaining > 0) { - // Compute last 2x1 tile of output - SQ4BitGemm_CompInt8_Compute1x1_BlkLen32( - QuantARowPtr, - QuantBDataColPtr, - QuantBScaleColPtr, - QuantBZeroPointColPtr, - BiasPtr, - SumPtr, - BlockCountK - ); - - SQ4BitGemm_CompInt8_Compute1x1_BlkLen32( - QuantARowPtr + StrideQuantA, - QuantBDataColPtr, - QuantBScaleColPtr, - QuantBZeroPointColPtr, - BiasPtr, - SumPtr + ldc, - BlockCountK - ); + // Compute last 4x1 tile of output + for (size_t i = 0; i < 4; ++i) { + SQ4BitGemm_CompInt8_Compute1x1_BlkLen32( + QuantARowPtr + StrideQuantA * i, + QuantBDataColPtr, + QuantBScaleColPtr, + QuantBZeroPointColPtr, + BiasPtr, + SumPtr + ldc * i, + 
BlockCountK + ); + } } - // Move to next 2 rows - AdvanceRowPtrs<2>( + // Move to next 4 rows + AdvanceRowPtrs<4>( StrideQuantA, ldc, QuantARowPtr, SumRowPtr ); - m_remaining -= 2; + m_remaining -= 4; } - if (m_remaining > 0) { + while (m_remaining > 0) { const std::byte* QuantBDataColPtr = QuantBData; const float* QuantBScaleColPtr = QuantBScale; const std::byte* QuantBZeroPointColPtr = QuantBZeroPoint; @@ -1063,6 +1142,14 @@ SQ4BitGemmKernel_CompInt8_BlkLen32( n_remaining -= 1; } + + // Move to next row + AdvanceRowPtrs<1>( + StrideQuantA, ldc, + QuantARowPtr, SumRowPtr + ); + + m_remaining -= 1; } } @@ -1095,7 +1182,7 @@ SQ4BitGemmKernel_CompInt8_BlkLenGreaterThan32( float* SumRowPtr = C; size_t m_remaining = CountM; - while (m_remaining > 1) { + while (m_remaining > 3) { const std::byte* QuantBDataColPtr = QuantBData; const float* QuantBScaleColPtr = QuantBScale; const std::byte* QuantBZeroPointColPtr = QuantBZeroPoint; @@ -1106,8 +1193,8 @@ SQ4BitGemmKernel_CompInt8_BlkLenGreaterThan32( size_t n_remaining = CountN; while (n_remaining > 1) { - // Compute 2x2 tiles of output - SQ4BitGemm_CompInt8_Compute2x2_BlkLenGreaterThan16( + // Compute 4x2 tiles of output + SQ4BitGemm_CompInt8_Compute4x2_BlkLenGreaterThan16( BlkLen, QuantARowPtr, QuantBDataColPtr, @@ -1133,40 +1220,31 @@ SQ4BitGemmKernel_CompInt8_BlkLenGreaterThan32( } if (n_remaining > 0) { - // Compute last 2x1 tile of output - SQ4BitGemm_CompInt8_Compute1x1_BlkLenGreaterThan32( - BlkLen, - QuantARowPtr, - QuantBDataColPtr, - QuantBScaleColPtr, - QuantBZeroPointColPtr, - BiasPtr, - SumPtr, - BlockCountK - ); - - SQ4BitGemm_CompInt8_Compute1x1_BlkLenGreaterThan32( - BlkLen, - QuantARowPtr + StrideQuantA, - QuantBDataColPtr, - QuantBScaleColPtr, - QuantBZeroPointColPtr, - BiasPtr, - SumPtr + ldc, - BlockCountK - ); + // Compute last 4x1 tile of output + for (size_t i = 0; i < 4; ++i) { + SQ4BitGemm_CompInt8_Compute1x1_BlkLenGreaterThan32( + BlkLen, + QuantARowPtr + StrideQuantA * i, + QuantBDataColPtr, + QuantBScaleColPtr, + QuantBZeroPointColPtr, + BiasPtr, + SumPtr + ldc * i, + BlockCountK + ); + } } - // Move to next 2 rows - AdvanceRowPtrs<2>( + // Move to next 4 rows + AdvanceRowPtrs<4>( StrideQuantA, ldc, QuantARowPtr, SumRowPtr ); - m_remaining -= 2; + m_remaining -= 4; } - if (m_remaining > 0) { + while (m_remaining > 0) { const std::byte* QuantBDataColPtr = QuantBData; const float* QuantBScaleColPtr = QuantBScale; const std::byte* QuantBZeroPointColPtr = QuantBZeroPoint; @@ -1197,6 +1275,14 @@ SQ4BitGemmKernel_CompInt8_BlkLenGreaterThan32( n_remaining -= 1; } + + // Move to next row + AdvanceRowPtrs<1>( + StrideQuantA, ldc, + QuantARowPtr, SumRowPtr + ); + + m_remaining -= 1; } } diff --git a/onnxruntime/core/optimizer/attention_fusion.cc b/onnxruntime/core/optimizer/attention_fusion.cc index b88f2d6a4637e..08066f030a381 100644 --- a/onnxruntime/core/optimizer/attention_fusion.cc +++ b/onnxruntime/core/optimizer/attention_fusion.cc @@ -126,7 +126,7 @@ static NodeArg& MergeQkvWeights(Graph& graph, int64_t hidden_size, } else { MergeWeights(q_weight, k_weight, v_weight, result, hidden_size); } - initializer.set_raw_data(result.data(), gsl::narrow(element_count) * sizeof(float)); + utils::SetRawDataInTensorProto(initializer, result.data(), gsl::narrow(element_count) * sizeof(float)); } else { // data_type == ONNX_NAMESPACE::TensorProto_DataType_FLOAT16 const MLFloat16* q_weight = q_initializer.data(); const MLFloat16* k_weight = k_initializer.data(); @@ -138,7 +138,7 @@ static NodeArg& MergeQkvWeights(Graph& graph, int64_t 
hidden_size, } else { MergeWeights(q_weight, k_weight, v_weight, result, hidden_size); } - initializer.set_raw_data(result.data(), gsl::narrow(element_count) * sizeof(MLFloat16)); + utils::SetRawDataInTensorProto(initializer, result.data(), gsl::narrow(element_count) * sizeof(MLFloat16)); } return graph_utils::AddInitializer(graph, initializer); diff --git a/onnxruntime/core/optimizer/compute_optimizer/shared_utils.cc b/onnxruntime/core/optimizer/compute_optimizer/shared_utils.cc index 913f3b6811183..86a7a4d6afbf8 100644 --- a/onnxruntime/core/optimizer/compute_optimizer/shared_utils.cc +++ b/onnxruntime/core/optimizer/compute_optimizer/shared_utils.cc @@ -188,7 +188,7 @@ NodeArg* CreateInitializerFromVector(Graph& graph, "The total count of dims does not match the size of values. ", "total_count: ", total_count, " values.size(): ", values.size()); - const_tensor.set_raw_data(values.data(), values.size() * sizeof(int64_t)); + utils::SetRawDataInTensorProto(const_tensor, values.data(), values.size() * sizeof(int64_t)); return &graph_utils::AddInitializer(graph, const_tensor); } diff --git a/onnxruntime/core/optimizer/constant_folding.cc b/onnxruntime/core/optimizer/constant_folding.cc index 9df300d6f4f88..1466de51d0b99 100644 --- a/onnxruntime/core/optimizer/constant_folding.cc +++ b/onnxruntime/core/optimizer/constant_folding.cc @@ -82,8 +82,7 @@ static bool ConstantFoldShapeNode(Graph& graph, Node& node) { shape_constant.set_name(constant_arg_out->Name()); shape_constant.set_data_type(ONNX_NAMESPACE::TensorProto_DataType_INT64); shape_constant.add_dims(clamped_slice_length); - shape_constant.set_raw_data(dim_values.data() + start, - clamped_slice_length * sizeof(int64_t)); + utils::SetRawDataInTensorProto(shape_constant, dim_values.data() + start, clamped_slice_length * sizeof(int64_t)); ONNX_NAMESPACE::TensorShapeProto result_shape; result_shape.add_dim()->set_dim_value(clamped_slice_length); constant_arg_out->SetShape(result_shape); diff --git a/onnxruntime/core/optimizer/embed_layer_norm_fusion.cc b/onnxruntime/core/optimizer/embed_layer_norm_fusion.cc index 7b6f829b7a0a4..e8e395678436e 100644 --- a/onnxruntime/core/optimizer/embed_layer_norm_fusion.cc +++ b/onnxruntime/core/optimizer/embed_layer_norm_fusion.cc @@ -465,15 +465,13 @@ static NodeArg* ExtractEmbedding(Graph& graph, if (!CheckEmbeddingData(data, batch_size, element_count)) { return nullptr; } - - initializer.set_raw_data(data, gsl::narrow(element_count) * sizeof(float)); + utils::SetRawDataInTensorProto(initializer, data, gsl::narrow(element_count) * sizeof(float)); } else { // data_type == ONNX_NAMESPACE::TensorProto_DataType_FLOAT16 const MLFloat16* data = old_initializer.data(); if (!CheckEmbeddingData(data, batch_size, element_count)) { return nullptr; } - - initializer.set_raw_data(data, gsl::narrow(element_count) * sizeof(MLFloat16)); + utils::SetRawDataInTensorProto(initializer, data, gsl::narrow(element_count) * sizeof(MLFloat16)); } NodeArg& node_arg = graph_utils::AddInitializer(graph, initializer); diff --git a/onnxruntime/core/optimizer/nchwc_transformer.cc b/onnxruntime/core/optimizer/nchwc_transformer.cc index 2b29473f876c3..46f306b92bed5 100644 --- a/onnxruntime/core/optimizer/nchwc_transformer.cc +++ b/onnxruntime/core/optimizer/nchwc_transformer.cc @@ -428,7 +428,8 @@ void NchwcTransformerImpl::TransformConv(Node& node) { nchwc_conv_W_tensor_proto.set_data_type(ONNX_NAMESPACE::TensorProto_DataType_FLOAT); nchwc_conv_W_tensor_proto.set_name(graph_.GenerateNodeArgName("reorder")); - 
nchwc_conv_W_tensor_proto.set_raw_data(reordered_filter.data(), reordered_filter.size() * sizeof(float)); + utils::SetRawDataInTensorProto(nchwc_conv_W_tensor_proto, reordered_filter.data(), + reordered_filter.size() * sizeof(float)); nchwc_conv_W_tensor_proto.add_dims(nchwc_output_channels); nchwc_conv_W_tensor_proto.add_dims(filter_input_channels); @@ -458,7 +459,8 @@ void NchwcTransformerImpl::TransformConv(Node& node) { nchwc_conv_B_tensor_proto.set_data_type(ONNX_NAMESPACE::TensorProto_DataType_FLOAT); nchwc_conv_B_tensor_proto.set_name(graph_.GenerateNodeArgName("reorder")); - nchwc_conv_B_tensor_proto.set_raw_data(aligned_bias.data(), gsl::narrow(nchwc_output_channels) * sizeof(float)); + utils::SetRawDataInTensorProto(nchwc_conv_B_tensor_proto, aligned_bias.data(), + gsl::narrow(nchwc_output_channels) * sizeof(float)); nchwc_conv_B_tensor_proto.add_dims(nchwc_output_channels); @@ -883,7 +885,8 @@ void NchwcTransformerImpl::TransformBatchNormalization(Node& node) { ONNX_NAMESPACE::TensorProto nchwc_conv_W_tensor_proto; nchwc_conv_W_tensor_proto.set_data_type(ONNX_NAMESPACE::TensorProto_DataType_FLOAT); nchwc_conv_W_tensor_proto.set_name(graph_.GenerateNodeArgName("bn_scale")); - nchwc_conv_W_tensor_proto.set_raw_data(padded_buffer.data(), gsl::narrow(nchwc_channels) * sizeof(float)); + utils::SetRawDataInTensorProto(nchwc_conv_W_tensor_proto, padded_buffer.data(), + gsl::narrow(nchwc_channels) * sizeof(float)); nchwc_conv_W_tensor_proto.add_dims(nchwc_channels); nchwc_conv_W_tensor_proto.add_dims(1); nchwc_conv_W_tensor_proto.add_dims(1); @@ -896,7 +899,8 @@ void NchwcTransformerImpl::TransformBatchNormalization(Node& node) { ONNX_NAMESPACE::TensorProto nchwc_conv_B_tensor_proto; nchwc_conv_B_tensor_proto.set_data_type(ONNX_NAMESPACE::TensorProto_DataType_FLOAT); nchwc_conv_B_tensor_proto.set_name(graph_.GenerateNodeArgName("bn_B")); - nchwc_conv_B_tensor_proto.set_raw_data(padded_buffer.data(), gsl::narrow(nchwc_channels) * sizeof(float)); + utils::SetRawDataInTensorProto(nchwc_conv_B_tensor_proto, padded_buffer.data(), + gsl::narrow(nchwc_channels) * sizeof(float)); nchwc_conv_B_tensor_proto.add_dims(nchwc_channels); auto* nchwc_conv_B_arg = &graph_utils::AddInitializer(graph_, nchwc_conv_B_tensor_proto); diff --git a/onnxruntime/core/optimizer/qdq_transformer/avx2_weight_s8_to_u8.cc b/onnxruntime/core/optimizer/qdq_transformer/avx2_weight_s8_to_u8.cc index 6f0f38b1de56e..18e462c04dff3 100644 --- a/onnxruntime/core/optimizer/qdq_transformer/avx2_weight_s8_to_u8.cc +++ b/onnxruntime/core/optimizer/qdq_transformer/avx2_weight_s8_to_u8.cc @@ -129,7 +129,7 @@ static bool TryConvertDynamicQuantizeLSTM(Node& op_node, Graph& graph) { weights_proto_u8.set_data_type(ONNX_NAMESPACE::TensorProto_DataType_UINT8); weights_proto_u8.set_name(weight_tensor_proto->name() + "_s8_2_u8"); weights_proto_u8.mutable_dims()->CopyFrom(weight_tensor_proto->dims()); - weights_proto_u8.set_raw_data(w_temp.data(), static_cast(w_temp.size())); + utils::SetRawDataInTensorProto(weights_proto_u8, w_temp.data(), static_cast(w_temp.size())); input_defs[w_idx] = &graph_utils::AddInitializer(graph, weights_proto_u8); ONNX_NAMESPACE::TensorProto weight_zp_proto_u8; @@ -140,7 +140,7 @@ static bool TryConvertDynamicQuantizeLSTM(Node& op_node, Graph& graph) { r_proto_u8.set_data_type(ONNX_NAMESPACE::TensorProto_DataType_UINT8); r_proto_u8.set_name(r_tensor_proto->name() + "_s8_2_u8"); r_proto_u8.mutable_dims()->CopyFrom(r_tensor_proto->dims()); - r_proto_u8.set_raw_data(r_temp.data(), static_cast(r_temp.size())); + 
utils::SetRawDataInTensorProto(r_proto_u8, r_temp.data(), static_cast(r_temp.size())); input_defs[r_idx] = &graph_utils::AddInitializer(graph, r_proto_u8); ONNX_NAMESPACE::TensorProto r_zp_proto_u8; diff --git a/onnxruntime/core/optimizer/qdq_transformer/qdq_s8_to_u8.cc b/onnxruntime/core/optimizer/qdq_transformer/qdq_s8_to_u8.cc index 199fbffc9f723..f2033dcbc1b03 100644 --- a/onnxruntime/core/optimizer/qdq_transformer/qdq_s8_to_u8.cc +++ b/onnxruntime/core/optimizer/qdq_transformer/qdq_s8_to_u8.cc @@ -60,7 +60,7 @@ static bool QDQ_S8_to_U8(Graph& graph, Node& q_node, Node& dq_node) { ONNX_NAMESPACE::TensorProto zp_tensor_proto_u8; zp_tensor_proto_u8.set_data_type(ONNX_NAMESPACE::TensorProto_DataType_UINT8); zp_tensor_proto_u8.set_name(graph.GenerateNodeArgName("qdq_s8_to_u8_zp_conversion")); - zp_tensor_proto_u8.set_raw_data(&q_zp_value, sizeof(uint8_t)); + utils::SetRawDataInTensorProto(zp_tensor_proto_u8, &q_zp_value, sizeof(uint8_t)); NodeArg* zp_u8_arg = &graph_utils::AddInitializer(graph, zp_tensor_proto_u8); auto q_output_node_arg_name = graph.GenerateNodeArgName("qdq_s8_to_u8_quant"); diff --git a/onnxruntime/core/optimizer/qdq_transformer/s8_to_u8.h b/onnxruntime/core/optimizer/qdq_transformer/s8_to_u8.h index 6caa35ea61ed7..1c1341fe5a127 100644 --- a/onnxruntime/core/optimizer/qdq_transformer/s8_to_u8.h +++ b/onnxruntime/core/optimizer/qdq_transformer/s8_to_u8.h @@ -27,7 +27,7 @@ inline bool Int8TensorProto2Uint8( if (nullptr == src) { uint8_t zero_val = 128; dst.set_name(graph.GenerateNodeArgName("weight_zp_s8_2_u8")); - dst.set_raw_data(&zero_val, sizeof(uint8_t)); + utils::SetRawDataInTensorProto(dst, &zero_val, sizeof(uint8_t)); return true; } @@ -58,7 +58,7 @@ inline bool Int8TensorProto2Uint8( p++; } if (force || should_convert) { - dst.set_raw_data(temp.data(), size_t(temp.size())); + utils::SetRawDataInTensorProto(dst, temp.data(), size_t(temp.size())); return true; } return false; diff --git a/onnxruntime/core/optimizer/qdq_transformer/selectors_actions/qdq_actions.cc b/onnxruntime/core/optimizer/qdq_transformer/selectors_actions/qdq_actions.cc index 3d2a81ce7f8cd..3497ea4c85523 100644 --- a/onnxruntime/core/optimizer/qdq_transformer/selectors_actions/qdq_actions.cc +++ b/onnxruntime/core/optimizer/qdq_transformer/selectors_actions/qdq_actions.cc @@ -5,6 +5,7 @@ #include "core/optimizer/qdq_transformer/qdq_util.h" #include "core/graph/node_attr_utils.h" +#include "core/framework/tensorprotoutils.h" namespace onnxruntime { namespace QDQ { @@ -132,7 +133,7 @@ struct SetOptionalZeroPoint { ONNX_NAMESPACE::TensorProto tensor_proto; tensor_proto.set_name(name); tensor_proto.set_data_type(ONNX_NAMESPACE::TensorProto_DataType_INT8); - tensor_proto.set_raw_data(a.data(), sizeof(int8_t)); + onnxruntime::utils::SetRawDataInTensorProto(tensor_proto, a.data(), sizeof(int8_t)); return tensor_proto; }; @@ -145,8 +146,7 @@ struct SetOptionalZeroPoint { ONNX_NAMESPACE::TensorProto tensor_proto; tensor_proto.set_name(name); tensor_proto.set_data_type(ONNX_NAMESPACE::TensorProto_DataType_UINT8); - tensor_proto.set_raw_data(a.data(), sizeof(uint8_t)); - + onnxruntime::utils::SetRawDataInTensorProto(tensor_proto, a.data(), sizeof(uint8_t)); return tensor_proto; }; static ONNX_NAMESPACE::TensorProto GetOptionalZeroPointInt8() { diff --git a/onnxruntime/core/optimizer/reshape_fusion.cc b/onnxruntime/core/optimizer/reshape_fusion.cc index 7768a835d5042..7f94e18458be2 100644 --- a/onnxruntime/core/optimizer/reshape_fusion.cc +++ b/onnxruntime/core/optimizer/reshape_fusion.cc @@ -435,7 +435,7 @@ 
bool ReshapeFusion::Fuse_Subgraph(Node& reshape, Graph& graph, const logging::Lo shape_initializer_proto.set_name(shape_def->Name()); shape_initializer_proto.add_dims(static_cast(shape_value.size())); shape_initializer_proto.set_data_type(ONNX_NAMESPACE::TensorProto_DataType_INT64); - shape_initializer_proto.set_raw_data(shape_value.data(), shape_value.size() * sizeof(int64_t)); + utils::SetRawDataInTensorProto(shape_initializer_proto, shape_value.data(), shape_value.size() * sizeof(int64_t)); auto& new_node_arg = graph_utils::AddInitializer(graph, shape_initializer_proto); // Safely remove concat parent nodes which have only one output diff --git a/onnxruntime/core/optimizer/stft_decomposition.cc b/onnxruntime/core/optimizer/stft_decomposition.cc index a54904ff15e1e..5c09e5225ab9c 100644 --- a/onnxruntime/core/optimizer/stft_decomposition.cc +++ b/onnxruntime/core/optimizer/stft_decomposition.cc @@ -45,7 +45,7 @@ NodeArg* AddInitializer(Graph& graph, const char* name, const int64_t (&shape)[T element_count *= shape[i]; proto.add_dims(shape[i]); } - proto.set_raw_data(begin, element_count * sizeof(TDataType)); + utils::SetRawDataInTensorProto(proto, begin, element_count * sizeof(TDataType)); return &graph_utils::AddInitializer(graph, proto); } diff --git a/onnxruntime/core/optimizer/transpose_optimization/ort_optimizer_api_impl.cc b/onnxruntime/core/optimizer/transpose_optimization/ort_optimizer_api_impl.cc index 1f7e54cb807ea..f756d01413eae 100644 --- a/onnxruntime/core/optimizer/transpose_optimization/ort_optimizer_api_impl.cc +++ b/onnxruntime/core/optimizer/transpose_optimization/ort_optimizer_api_impl.cc @@ -766,10 +766,10 @@ std::string_view ApiGraph::AddInitializer(api::DataType dtype, const std::vector ONNX_NAMESPACE::TensorProto tensor_proto; tensor_proto.set_data_type(gsl::narrow_cast(dtype)); tensor_proto.set_name(name); - tensor_proto.set_raw_data(data.data(), data.size()); for (int64_t dim : shape) { tensor_proto.add_dims(dim); } + utils::SetRawDataInTensorProto(tensor_proto, data.data(), data.size()); const auto& node_arg = graph_utils::AddInitializer(graph_, tensor_proto); return node_arg.Name(); diff --git a/onnxruntime/core/platform/path_lib.h b/onnxruntime/core/platform/path_lib.h index a9d89f32e91d3..fca8990f14821 100644 --- a/onnxruntime/core/platform/path_lib.h +++ b/onnxruntime/core/platform/path_lib.h @@ -281,7 +281,7 @@ void LoopDir(const std::string& dir_name, T func) { ORT_TRY { struct dirent* dp; while ((dp = readdir(dir)) != nullptr) { - std::basic_string filename = ConcatPathComponent(dir_name, dp->d_name); + std::basic_string filename = ConcatPathComponent(dir_name, dp->d_name); if (stat(filename.c_str(), &stats) != 0) { continue; } diff --git a/onnxruntime/core/platform/posix/env.cc b/onnxruntime/core/platform/posix/env.cc index ec06320438977..04cf5ff6a3329 100644 --- a/onnxruntime/core/platform/posix/env.cc +++ b/onnxruntime/core/platform/posix/env.cc @@ -26,7 +26,9 @@ limitations under the License. 
#include #include #include +#if !defined(_AIX) #include +#endif #include #include diff --git a/onnxruntime/core/platform/windows/logging/etw_sink.cc b/onnxruntime/core/platform/windows/logging/etw_sink.cc index 2a74e22850658..b0f9eaf4f62d2 100644 --- a/onnxruntime/core/platform/windows/logging/etw_sink.cc +++ b/onnxruntime/core/platform/windows/logging/etw_sink.cc @@ -98,6 +98,10 @@ ULONGLONG EtwRegistrationManager::Keyword() const { return keyword_; } +HRESULT EtwRegistrationManager::Status() const { + return etw_status_; +} + void EtwRegistrationManager::RegisterInternalCallback(const EtwInternalCallback& callback) { std::lock_guard lock(callbacks_mutex_); callbacks_.push_back(&callback); @@ -140,9 +144,15 @@ EtwRegistrationManager::EtwRegistrationManager() { } void EtwRegistrationManager::LazyInitialize() { - static HRESULT etw_status = ::TraceLoggingRegisterEx(etw_provider_handle, ORT_TL_EtwEnableCallback, nullptr); - if (FAILED(etw_status)) { - ORT_THROW("ETW registration failed. Logging will be broken: " + std::to_string(etw_status)); + if (!initialized_) { + std::lock_guard lock(init_mutex_); + if (!initialized_) { // Double-check locking pattern + initialized_ = true; + etw_status_ = ::TraceLoggingRegisterEx(etw_provider_handle, ORT_TL_EtwEnableCallback, nullptr); + if (FAILED(etw_status_)) { + ORT_THROW("ETW registration failed. Logging will be broken: " + std::to_string(etw_status_)); + } + } } } @@ -161,6 +171,12 @@ void EtwSink::SendImpl(const Timestamp& timestamp, const std::string& logger_id, // register on first usage static EtwRegistrationManager& etw_manager = EtwRegistrationManager::Instance(); + // do something (not that meaningful) with etw_manager so it doesn't get optimized out + // as we want an instance around to do the unregister + if (FAILED(etw_manager.Status())) { + return; + } + // TODO: Validate if this filtering makes sense. 
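Aside on the LazyInitialize change above: the function-local static HRESULT is replaced by an explicit double-checked lock on initialized_, and SendImpl now consults the new Status() accessor so a failed TraceLoggingRegisterEx call simply disables the sink instead of being retried on every log call. A generic initialize-once sketch of that shape, with illustrative names rather than the actual ORT members (long stands in for HRESULT to keep it portable):

    #include <atomic>
    #include <mutex>

    // Illustrative one-time initializer mirroring the double-checked locking
    // pattern used above: the fast path skips the mutex once init is done.
    class LazyStatus {
     public:
      long Status() {
        if (!initialized_.load(std::memory_order_acquire)) {
          std::lock_guard<std::mutex> lock(init_mutex_);
          if (!initialized_.load(std::memory_order_relaxed)) {  // double-check under the lock
            status_ = Register();                               // e.g. a registration call
            initialized_.store(true, std::memory_order_release);
          }
        }
        return status_;
      }

     private:
      long Register() { return 0; }  // placeholder for the real registration step
      std::atomic<bool> initialized_{false};
      std::mutex init_mutex_;
      long status_{0};
    };

With an std::atomic flag the lock-free fast path is well defined; the patch itself takes the simpler route of checking a plain bool before and after acquiring init_mutex_.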
if (message.DataType() == DataType::USER) { return; diff --git a/onnxruntime/core/platform/windows/logging/etw_sink.h b/onnxruntime/core/platform/windows/logging/etw_sink.h index ff68aec0b7d64..3af45b813a625 100644 --- a/onnxruntime/core/platform/windows/logging/etw_sink.h +++ b/onnxruntime/core/platform/windows/logging/etw_sink.h @@ -66,6 +66,9 @@ class EtwRegistrationManager { // Get the current keyword uint64_t Keyword() const; + // Get the ETW registration status + HRESULT Status() const; + void RegisterInternalCallback(const EtwInternalCallback& callback); void UnregisterInternalCallback(const EtwInternalCallback& callback); @@ -97,6 +100,7 @@ class EtwRegistrationManager { bool is_enabled_; UCHAR level_; ULONGLONG keyword_; + HRESULT etw_status_; }; } // namespace logging diff --git a/onnxruntime/core/providers/coreml/builders/impl/activation_op_builder.cc b/onnxruntime/core/providers/coreml/builders/impl/activation_op_builder.cc index dee87ce3632a8..0e21715513707 100644 --- a/onnxruntime/core/providers/coreml/builders/impl/activation_op_builder.cc +++ b/onnxruntime/core/providers/coreml/builders/impl/activation_op_builder.cc @@ -26,6 +26,8 @@ class ActivationOpBuilder : public BaseOpBuilder { const logging::Logger& logger) const override; int GetMinSupportedOpSet(const Node& node) const override; + + bool SupportsMLProgram() const override { return true; } }; void ActivationOpBuilder::AddInitializersToSkip(ModelBuilder& model_builder, const Node& node) const { @@ -74,33 +76,61 @@ Status AddPReluWeight(ModelBuilder& model_builder, const Node& node, Status ActivationOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, const Node& node, const logging::Logger& logger) const { - std::unique_ptr layer = model_builder.CreateNNLayer(node); - const auto& op_type(node.OpType()); - if (op_type == "Sigmoid") { - layer->mutable_activation()->mutable_sigmoid(); - } else if (op_type == "Tanh") { - layer->mutable_activation()->mutable_tanh(); - } else if (op_type == "Relu") { - layer->mutable_activation()->mutable_relu(); - } else if (op_type == "PRelu") { - auto* prelu = layer->mutable_activation()->mutable_prelu(); - ORT_RETURN_IF_ERROR(AddPReluWeight(model_builder, node, logger, *prelu)); - } else if (op_type == "LeakyRelu") { - NodeAttrHelper helper(node); - const auto alpha = helper.Get("alpha", 0.01f); - - auto* leaky_relu = layer->mutable_activation()->mutable_leakyrelu(); - leaky_relu->set_alpha(alpha); - } else { - return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, - "ActivationOpBuilder::AddToModelBuilderImpl, unknown op: ", op_type); - } - *layer->mutable_input()->Add() = node.InputDefs()[0]->Name(); - *layer->mutable_output()->Add() = node.OutputDefs()[0]->Name(); +#if defined(COREML_ENABLE_MLPROGRAM) + if (model_builder.CreateMLProgram()) { + using namespace CoreML::Specification::MILSpec; + // https://apple.github.io/coremltools/source/coremltools.converters.mil.mil.ops.defs.html#module-coremltools.converters.mil.mil.ops.defs.iOS15.activation + std::string_view coreml_op_type; + if (op_type == "Sigmoid") { + coreml_op_type = "sigmoid"; + } else if (op_type == "Tanh") { + coreml_op_type = "tanh"; + } else if (op_type == "Relu") { + coreml_op_type = "relu"; + } else { + return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, + "ActivationOpBuilder::AddToModelBuilderImpl, unknown op: ", op_type); + } + + std::unique_ptr op = model_builder.CreateOperation(node, coreml_op_type); + AddOperationInput(*op, "x", node.InputDefs()[0]->Name()); + AddOperationOutput(*op, 
*node.OutputDefs()[0]); + + model_builder.AddOperation(std::move(op)); + + } else +#endif // (COREML_ENABLE_MLPROGRAM) + { + std::unique_ptr layer = model_builder.CreateNNLayer(node); + + if (op_type == "Sigmoid") { + layer->mutable_activation()->mutable_sigmoid(); + } else if (op_type == "Tanh") { + layer->mutable_activation()->mutable_tanh(); + } else if (op_type == "Relu") { + layer->mutable_activation()->mutable_relu(); + } else if (op_type == "PRelu") { + auto* prelu = layer->mutable_activation()->mutable_prelu(); + ORT_RETURN_IF_ERROR(AddPReluWeight(model_builder, node, logger, *prelu)); + } else if (op_type == "LeakyRelu") { + NodeAttrHelper helper(node); + const auto alpha = helper.Get("alpha", 0.01f); + + auto* leaky_relu = layer->mutable_activation()->mutable_leakyrelu(); + leaky_relu->set_alpha(alpha); + } else { + return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, + "ActivationOpBuilder::AddToModelBuilderImpl, unknown op: ", op_type); + } + + *layer->mutable_input()->Add() = node.InputDefs()[0]->Name(); + *layer->mutable_output()->Add() = node.OutputDefs()[0]->Name(); + + model_builder.AddLayer(std::move(layer)); + } - model_builder.AddLayer(std::move(layer)); return Status::OK(); } @@ -165,9 +195,20 @@ bool IsPReluOpSupported(const Node& node, const OpBuilderInputParams& input_para bool ActivationOpBuilder::IsOpSupportedImpl(const Node& node, const OpBuilderInputParams& input_params, const logging::Logger& logger) const { const auto& op_type = node.OpType(); - if (op_type == "PRelu") { - return IsPReluOpSupported(node, input_params, logger); + +#if defined(COREML_ENABLE_MLPROGRAM) + if (input_params.create_mlprogram) { + if (op_type == "PRelu" || op_type == "LeakyRelu") { + return false; + } + } else +#endif // (COREML_ENABLE_MLPROGRAM) + { + if (op_type == "PRelu") { + return IsPReluOpSupported(node, input_params, logger); + } } + return true; } diff --git a/onnxruntime/core/providers/coreml/builders/impl/transpose_op_builder.cc b/onnxruntime/core/providers/coreml/builders/impl/transpose_op_builder.cc index f6a61d55a3d63..831c4cf4d08ba 100644 --- a/onnxruntime/core/providers/coreml/builders/impl/transpose_op_builder.cc +++ b/onnxruntime/core/providers/coreml/builders/impl/transpose_op_builder.cc @@ -3,6 +3,7 @@ #include "core/providers/coreml/builders/helper.h" #include "core/providers/coreml/builders/impl/base_op_builder.h" +#include "core/providers/coreml/builders/impl/builder_utils.h" #include "core/providers/coreml/builders/model_builder.h" #include "core/providers/coreml/builders/op_builder_factory.h" #include "core/providers/coreml/shape_utils.h" @@ -14,13 +15,13 @@ namespace coreml { class TransposeOpBuilder : public BaseOpBuilder { Status AddToModelBuilderImpl(ModelBuilder& model_builder, const Node& node, const logging::Logger& logger) const override; + + bool SupportsMLProgram() const override { return true; } }; Status TransposeOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, const Node& node, const logging::Logger& logger) const { - std::unique_ptr layer = model_builder.CreateNNLayer(node); - NodeAttrHelper helper(node); std::vector perm = helper.Get("perm", std::vector()); std::vector input_shape; @@ -33,12 +34,27 @@ Status TransposeOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, ORT_RETURN_IF_NOT(perm.size() == input_dims, "Perm and input should have same dimension"); } - *layer->mutable_transpose()->mutable_axes() = {perm.cbegin(), perm.cend()}; +#if defined(COREML_ENABLE_MLPROGRAM) + if (model_builder.CreateMLProgram()) { + using 
namespace CoreML::Specification::MILSpec; + + std::unique_ptr op = model_builder.CreateOperation(node, "transpose"); + AddOperationInput(*op, "x", node.InputDefs()[0]->Name()); + AddOperationInput(*op, "perm", model_builder.AddConstant(op->type(), "perm", perm)); + AddOperationOutput(*op, *node.OutputDefs()[0]); + model_builder.AddOperation(std::move(op)); - *layer->mutable_input()->Add() = node.InputDefs()[0]->Name(); - *layer->mutable_output()->Add() = node.OutputDefs()[0]->Name(); + } else +#endif // defined(COREML_ENABLE_MLPROGRAM) + { + std::unique_ptr layer = model_builder.CreateNNLayer(node); + *layer->mutable_transpose()->mutable_axes() = {perm.cbegin(), perm.cend()}; - model_builder.AddLayer(std::move(layer)); + *layer->mutable_input()->Add() = node.InputDefs()[0]->Name(); + *layer->mutable_output()->Add() = node.OutputDefs()[0]->Name(); + + model_builder.AddLayer(std::move(layer)); + } return Status::OK(); } diff --git a/onnxruntime/contrib_ops/cuda/grid_sample.cc b/onnxruntime/core/providers/cuda/tensor/grid_sample.cc similarity index 100% rename from onnxruntime/contrib_ops/cuda/grid_sample.cc rename to onnxruntime/core/providers/cuda/tensor/grid_sample.cc diff --git a/onnxruntime/contrib_ops/cuda/grid_sample.h b/onnxruntime/core/providers/cuda/tensor/grid_sample.h similarity index 100% rename from onnxruntime/contrib_ops/cuda/grid_sample.h rename to onnxruntime/core/providers/cuda/tensor/grid_sample.h diff --git a/onnxruntime/contrib_ops/cuda/grid_sample_impl.cu b/onnxruntime/core/providers/cuda/tensor/grid_sample_impl.cu similarity index 100% rename from onnxruntime/contrib_ops/cuda/grid_sample_impl.cu rename to onnxruntime/core/providers/cuda/tensor/grid_sample_impl.cu diff --git a/onnxruntime/contrib_ops/cuda/grid_sample_impl.h b/onnxruntime/core/providers/cuda/tensor/grid_sample_impl.h similarity index 100% rename from onnxruntime/contrib_ops/cuda/grid_sample_impl.h rename to onnxruntime/core/providers/cuda/tensor/grid_sample_impl.h diff --git a/onnxruntime/core/providers/migraphx/gpu_data_transfer.cc b/onnxruntime/core/providers/migraphx/gpu_data_transfer.cc index 72193ef6268c1..94480c308b99f 100644 --- a/onnxruntime/core/providers/migraphx/gpu_data_transfer.cc +++ b/onnxruntime/core/providers/migraphx/gpu_data_transfer.cc @@ -60,17 +60,7 @@ common::Status GPUDataTransfer::CopyTensorAsync(const Tensor& src, Tensor& dst, HIP_CALL_THROW(hipMemcpy(dst_data, src_data, bytes, hipMemcpyHostToDevice)); } } else if (src_device.Type() == OrtDevice::GPU) { -#ifndef MIGRAPHX_STREAM_SYNC - if (dst_device.Type() == OrtDevice::CPU && dst_device.MemType() == OrtDevice::MemType::HIP_PINNED) { - // copying from GPU to pinned memory, this is non-blocking - HIP_CALL_THROW(hipMemcpyAsync(dst_data, src_data, bytes, hipMemcpyDeviceToHost, static_cast(stream.GetHandle()))); - } else { - // copying from GPU to CPU memory, this is blocking - HIP_CALL_THROW(hipMemcpy(dst_data, src_data, bytes, hipMemcpyDeviceToHost)); - } -#else HIP_CALL_THROW(hipMemcpyAsync(dst_data, src_data, bytes, hipMemcpyDeviceToHost, static_cast(stream.GetHandle()))); -#endif } else { // copying between cpu memory memcpy(dst_data, src_data, bytes); diff --git a/onnxruntime/core/providers/migraphx/hip_allocator.cc b/onnxruntime/core/providers/migraphx/migraphx_allocator.cc similarity index 83% rename from onnxruntime/core/providers/migraphx/hip_allocator.cc rename to onnxruntime/core/providers/migraphx/migraphx_allocator.cc index 53f10e318e65f..0693eea056416 100644 --- a/onnxruntime/core/providers/migraphx/hip_allocator.cc 
+++ b/onnxruntime/core/providers/migraphx/migraphx_allocator.cc @@ -3,7 +3,7 @@ #include "core/providers/shared_library/provider_api.h" #include "migraphx_call.h" -#include "hip_allocator.h" +#include "migraphx_allocator.h" #include "core/common/status.h" #include "core/framework/float16.h" #include "core/common/status.h" @@ -11,7 +11,7 @@ namespace onnxruntime { -void HIPAllocator::CheckDevice() const { +void MIGraphXAllocator::CheckDevice() const { #ifndef NDEBUG // check device to match at debug build // if it's expected to change, call hipSetDevice instead of the check @@ -23,7 +23,7 @@ void HIPAllocator::CheckDevice() const { #endif } -void* HIPAllocator::Alloc(size_t size) { +void* MIGraphXAllocator::Alloc(size_t size) { CheckDevice(); void* p = nullptr; if (size > 0) { @@ -32,12 +32,12 @@ void* HIPAllocator::Alloc(size_t size) { return p; } -void HIPAllocator::Free(void* p) { +void MIGraphXAllocator::Free(void* p) { CheckDevice(); (void)hipFree(p); // do not throw error since it's OK for hipFree to fail during shutdown } -void* HIPExternalAllocator::Alloc(size_t size) { +void* MIGraphXExternalAllocator::Alloc(size_t size) { void* p = nullptr; if (size > 0) { p = alloc_(size); @@ -49,7 +49,7 @@ void* HIPExternalAllocator::Alloc(size_t size) { return p; } -void HIPExternalAllocator::Free(void* p) { +void MIGraphXExternalAllocator::Free(void* p) { free_(p); std::lock_guard lock(lock_); auto it = reserved_.find(p); @@ -59,7 +59,7 @@ void HIPExternalAllocator::Free(void* p) { } } -void* HIPExternalAllocator::Reserve(size_t size) { +void* MIGraphXExternalAllocator::Reserve(size_t size) { void* p = Alloc(size); if (!p) return nullptr; std::lock_guard lock(lock_); diff --git a/onnxruntime/core/providers/migraphx/hip_allocator.h b/onnxruntime/core/providers/migraphx/migraphx_allocator.h similarity index 78% rename from onnxruntime/core/providers/migraphx/hip_allocator.h rename to onnxruntime/core/providers/migraphx/migraphx_allocator.h index 3244f9f04ea70..64da844e8c714 100644 --- a/onnxruntime/core/providers/migraphx/hip_allocator.h +++ b/onnxruntime/core/providers/migraphx/migraphx_allocator.h @@ -9,12 +9,12 @@ namespace onnxruntime { -class HIPAllocator : public IAllocator { +class MIGraphXAllocator : public IAllocator { public: - HIPAllocator(int device_id, const char* name) + MIGraphXAllocator(int device_id, const char* name) : IAllocator( OrtMemoryInfo(name, OrtAllocatorType::OrtDeviceAllocator, - OrtDevice(OrtDevice::GPU, OrtDevice::MemType::DEFAULT, device_id), + OrtDevice(OrtDevice::GPU, OrtDevice::MemType::DEFAULT, static_cast(device_id)), device_id, OrtMemTypeDefault)) {} virtual void* Alloc(size_t size) override; @@ -24,14 +24,14 @@ class HIPAllocator : public IAllocator { void CheckDevice() const; }; -class HIPExternalAllocator : public HIPAllocator { +class MIGraphXExternalAllocator : public MIGraphXAllocator { typedef void* (*ExternalAlloc)(size_t size); typedef void (*ExternalFree)(void* p); typedef void (*ExternalEmptyCache)(); public: - HIPExternalAllocator(OrtDevice::DeviceId device_id, const char* name, void* alloc, void* free, void* empty_cache) - : HIPAllocator(device_id, name) { + MIGraphXExternalAllocator(OrtDevice::DeviceId device_id, const char* name, void* alloc, void* free, void* empty_cache) + : MIGraphXAllocator(device_id, name) { alloc_ = reinterpret_cast(alloc); free_ = reinterpret_cast(free); empty_cache_ = reinterpret_cast(empty_cache); @@ -55,7 +55,7 @@ class HIPPinnedAllocator : public IAllocator { HIPPinnedAllocator(int device_id, const char* name) : 
IAllocator( OrtMemoryInfo(name, OrtAllocatorType::OrtDeviceAllocator, - OrtDevice(OrtDevice::CPU, OrtDevice::MemType::HIP_PINNED, device_id), + OrtDevice(OrtDevice::CPU, OrtDevice::MemType::HIP_PINNED, static_cast(device_id)), device_id, OrtMemTypeCPUOutput)) {} virtual void* Alloc(size_t size) override; diff --git a/onnxruntime/core/providers/migraphx/migraphx_call.cc b/onnxruntime/core/providers/migraphx/migraphx_call.cc index 5248ac2f39214..9807cd646e51c 100644 --- a/onnxruntime/core/providers/migraphx/migraphx_call.cc +++ b/onnxruntime/core/providers/migraphx/migraphx_call.cc @@ -1,10 +1,13 @@ // Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. +#ifdef _WIN32 +#include +#else #include -#include -#include -#include +#endif + +#include #include "core/common/common.h" #include "core/common/status.h" #include "core/providers/shared_library/provider_api.h" @@ -34,16 +37,20 @@ std::conditional_t RocmCall( ERRTYPE retCode, const char* exprString, const char* libName, ERRTYPE successCode, const char* msg, const char* file, const int line) { if (retCode != successCode) { try { - char hostname[HOST_NAME_MAX]; - if (gethostname(hostname, HOST_NAME_MAX) != 0) - strcpy(hostname, "?"); +#ifdef _WIN32 + // According to the POSIX spec, 255 is the safe minimum value. + static constexpr int HOST_NAME_MAX = 255; +#endif + std::string hostname(HOST_NAME_MAX, 0); + if (gethostname(hostname.data(), HOST_NAME_MAX) != 0) + hostname = "?"; int currentHipDevice; (void)hipGetDevice(¤tHipDevice); (void)hipGetLastError(); // clear last HIP error static char str[1024]; snprintf(str, 1024, "%s failure %d: %s ; GPU=%d ; hostname=%s ; file=%s ; line=%d ; expr=%s; %s", libName, (int)retCode, RocmErrString(retCode), currentHipDevice, - hostname, + hostname.c_str(), file, line, exprString, msg); if constexpr (THRW) { // throw an exception with the error info @@ -68,9 +75,5 @@ std::conditional_t RocmCall( template Status RocmCall(hipError_t retCode, const char* exprString, const char* libName, hipError_t successCode, const char* msg, const char* file, const int line); template void RocmCall(hipError_t retCode, const char* exprString, const char* libName, hipError_t successCode, const char* msg, const char* file, const int line); -template Status RocmCall(rocblas_status retCode, const char* exprString, const char* libName, rocblas_status successCode, const char* msg, const char* file, const int line); -template void RocmCall(rocblas_status retCode, const char* exprString, const char* libName, rocblas_status successCode, const char* msg, const char* file, const int line); -template Status RocmCall(miopenStatus_t retCode, const char* exprString, const char* libName, miopenStatus_t successCode, const char* msg, const char* file, const int line); -template void RocmCall(miopenStatus_t retCode, const char* exprString, const char* libName, miopenStatus_t successCode, const char* msg, const char* file, const int line); } // namespace onnxruntime diff --git a/onnxruntime/core/providers/migraphx/migraphx_call.h b/onnxruntime/core/providers/migraphx/migraphx_call.h index 15d385a636b76..f6a95cebf34b5 100644 --- a/onnxruntime/core/providers/migraphx/migraphx_call.h +++ b/onnxruntime/core/providers/migraphx/migraphx_call.h @@ -4,8 +4,6 @@ #pragma once #include "migraphx_inc.h" -#pragma once - namespace onnxruntime { // ----------------------------------------------------------------------- diff --git a/onnxruntime/core/providers/migraphx/migraphx_execution_provider.cc 
b/onnxruntime/core/providers/migraphx/migraphx_execution_provider.cc index 6ee85c3a4c047..097b16ecde536 100644 --- a/onnxruntime/core/providers/migraphx/migraphx_execution_provider.cc +++ b/onnxruntime/core/providers/migraphx/migraphx_execution_provider.cc @@ -13,12 +13,11 @@ #include "core/common/logging/severity.h" #include "migraphx_execution_provider.h" #include "migraphx_execution_provider_utils.h" -#include "hip_allocator.h" +#include "migraphx_allocator.h" #include "gpu_data_transfer.h" #include "migraphx_inc.h" -// TODO: find a better way to share this -#include "core/providers/rocm/rocm_stream_handle.h" +#include "migraphx_stream_handle.h" #if defined(_MSC_VER) #pragma warning(disable : 4244 4245) @@ -102,10 +101,10 @@ std::shared_ptr MIGraphXExecutionProvider::GetKernelRegistry() c } MIGraphXExecutionProvider::MIGraphXExecutionProvider(const MIGraphXExecutionProviderInfo& info) - : IExecutionProvider{onnxruntime::kMIGraphXExecutionProvider, OrtDevice(OrtDevice::GPU, OrtDevice::MemType::DEFAULT, info.device_id)}, device_id_(info.device_id) { + : IExecutionProvider{onnxruntime::kMIGraphXExecutionProvider, OrtDevice(OrtDevice::GPU, OrtDevice::MemType::DEFAULT, info.device_id)}, info_(info) { InitProviderOrtApi(); // Set GPU device to be used - HIP_CALL_THROW(hipSetDevice(device_id_)); + HIP_CALL_THROW(hipSetDevice(info_.device_id)); t_ = migraphx::target(info.target_device.c_str()); // whether fp16 is enable @@ -181,16 +180,10 @@ MIGraphXExecutionProvider::MIGraphXExecutionProvider(const MIGraphXExecutionProv dump_model_ops_ = (std::stoi(dump_model_ops_env) == 0 ? false : true); } - ROCBLAS_CALL_THROW(rocblas_create_handle(&external_rocblas_handle_)); - ROCBLAS_CALL_THROW(rocblas_set_stream(external_rocblas_handle_, stream_)); - - MIOPEN_CALL_THROW(miopenCreate(&external_miopen_handle_)); - MIOPEN_CALL_THROW(miopenSetStream(external_miopen_handle_, stream_)); - metadef_id_generator_ = ModelMetadefIdGenerator::Create(); LOGS_DEFAULT(VERBOSE) << "[MIGraphX EP] MIGraphX provider options: " - << "device_id: " << device_id_ + << "device_id: " << info_.device_id << ", migraphx_fp16_enable: " << fp16_enable_ << ", migraphx_int8_enable: " << int8_enable_ << ", migraphx_int8_enable: " << int8_enable_ @@ -205,17 +198,14 @@ MIGraphXExecutionProvider::MIGraphXExecutionProvider(const MIGraphXExecutionProv } MIGraphXExecutionProvider::~MIGraphXExecutionProvider() { - ORT_IGNORE_RETURN_VALUE(ROCBLAS_CALL(rocblas_destroy_handle(external_rocblas_handle_))); - ORT_IGNORE_RETURN_VALUE(MIOPEN_CALL(miopenDestroy(external_miopen_handle_))); } std::vector MIGraphXExecutionProvider::CreatePreferredAllocators() { AllocatorCreationInfo default_memory_info( - [](OrtDevice::DeviceId device_id) { return CreateROCMAllocator(device_id, onnxruntime::CUDA); }, device_id_); + [](OrtDevice::DeviceId device_id) { return CreateMIGraphXAllocator(device_id, onnxruntime::CUDA); }, info_.device_id); AllocatorCreationInfo pinned_allocator_info( [](OrtDevice::DeviceId device_id) { - ORT_UNUSED_PARAMETER(device_id); - return CreateROCMPinnedAllocator(onnxruntime::CUDA_PINNED); + return CreateMIGraphXPinnedAllocator(device_id, onnxruntime::CUDA_PINNED); }, 0); return std::vector{CreateAllocator(default_memory_info), CreateAllocator(pinned_allocator_info)}; @@ -254,40 +244,40 @@ static bool getMIGraphXType(ONNXTensorElementDataType type, migraphx_shape_datatype_t& mgx_type) { mgx_type = migraphx_shape_float_type; switch (type) { - case ONNX_NAMESPACE::TensorProto_DataType::TensorProto_DataType_FLOAT16: + case 
ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT16: mgx_type = migraphx_shape_half_type; break; - case ONNX_NAMESPACE::TensorProto_DataType::TensorProto_DataType_FLOAT: + case ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT: mgx_type = migraphx_shape_float_type; break; - case ONNX_NAMESPACE::TensorProto_DataType::TensorProto_DataType_DOUBLE: + case ONNX_TENSOR_ELEMENT_DATA_TYPE_DOUBLE: mgx_type = migraphx_shape_double_type; break; - case ONNX_NAMESPACE::TensorProto_DataType::TensorProto_DataType_INT8: + case ONNX_TENSOR_ELEMENT_DATA_TYPE_INT8: mgx_type = migraphx_shape_int8_type; break; - case ONNX_NAMESPACE::TensorProto_DataType::TensorProto_DataType_INT16: + case ONNX_TENSOR_ELEMENT_DATA_TYPE_INT16: mgx_type = migraphx_shape_int16_type; break; - case ONNX_NAMESPACE::TensorProto_DataType::TensorProto_DataType_INT32: + case ONNX_TENSOR_ELEMENT_DATA_TYPE_INT32: mgx_type = migraphx_shape_int32_type; break; - case ONNX_NAMESPACE::TensorProto_DataType::TensorProto_DataType_INT64: + case ONNX_TENSOR_ELEMENT_DATA_TYPE_INT64: mgx_type = migraphx_shape_int64_type; break; - case ONNX_NAMESPACE::TensorProto_DataType::TensorProto_DataType_UINT8: + case ONNX_TENSOR_ELEMENT_DATA_TYPE_UINT8: mgx_type = migraphx_shape_uint8_type; break; - case ONNX_NAMESPACE::TensorProto_DataType::TensorProto_DataType_UINT16: + case ONNX_TENSOR_ELEMENT_DATA_TYPE_UINT16: mgx_type = migraphx_shape_uint16_type; break; - case ONNX_NAMESPACE::TensorProto_DataType::TensorProto_DataType_UINT32: + case ONNX_TENSOR_ELEMENT_DATA_TYPE_UINT32: mgx_type = migraphx_shape_uint32_type; break; - case ONNX_NAMESPACE::TensorProto_DataType::TensorProto_DataType_UINT64: + case ONNX_TENSOR_ELEMENT_DATA_TYPE_UINT64: mgx_type = migraphx_shape_uint64_type; break; - case ONNX_NAMESPACE::TensorProto_DataType::TensorProto_DataType_BOOL: + case ONNX_TENSOR_ELEMENT_DATA_TYPE_BOOL: mgx_type = migraphx_shape_bool_type; break; default: @@ -303,7 +293,7 @@ std::vector toVector(const ONNX_NAMESPACE::int64s& nums) { std::vector result; int num = nums.size(); for (int i = 0; i < num; ++i) { - result.push_back(nums[i]); + result.push_back(static_cast(nums[i])); } return result; @@ -501,16 +491,9 @@ static bool IsUnsupportedOpMode(const onnxruntime::GraphViewer& graph_viewer, co if (arg_s != nullptr) { const auto& tensor_dims = arg_s->dim(); std::vector dims; - std::transform(tensor_dims.begin(), - tensor_dims.end(), - std::back_inserter(dims), - [&](auto&& d) -> std::size_t { - if (d.has_dim_value()) { - return d.dim_value(); - } else { - return 0; - } - }); + for (auto&& dim : tensor_dims) { + dims.emplace_back(dim.has_dim_value() ? dim.dim_value() : 0); + } if (dims == std::vector{0}) { return true; } @@ -546,8 +529,8 @@ static bool IsUnsupportedOpMode(const onnxruntime::GraphViewer& graph_viewer, co } void SubgraphPostProcessing(const onnxruntime::GraphViewer& graph_viewer, std::vector>& clusters, - const logging::Logger& logger) { - // Then check whether a subgraph should fallback to CPU + [[maybe_unused]] const logging::Logger& logger) { + // Then check whether a subgraph should fall back to CPU // 1. 
Check whether a subgraph contains a RNN operator std::unordered_set rnn_names = {"RNN", "GRU", "LSTM"}; std::unordered_set op_names = {"AveragePool", "Conv", "Gemm", "LRN", "MatMul", "MaxPool"}; @@ -591,17 +574,10 @@ void SubgraphPostProcessing(const onnxruntime::GraphViewer& graph_viewer, std::v if (arg_s == nullptr) return false; const auto& tensor_dims = arg_s->dim(); std::vector dims; - std::transform(tensor_dims.begin(), - tensor_dims.end(), - std::back_inserter(dims), - [&](auto&& d) -> std::size_t { - if (d.has_dim_value()) { - return d.dim_value(); - } else { - return 1; - } - }); - return (std::accumulate(dims.begin(), dims.end(), 1, std::multiplies{}) > 300); + for (auto&& dim : tensor_dims) { + dims.emplace_back(dim.has_dim_value() ? dim.dim_value() : 1); + } + return (std::accumulate(dims.begin(), dims.end(), 1ULL, std::multiplies{}) > 300); })) { return false; } @@ -623,7 +599,7 @@ void SubgraphPostProcessing(const onnxruntime::GraphViewer& graph_viewer, std::v static bool IsNodeSupported(const std::set& op_set, const onnxruntime::GraphViewer& graph_viewer, const NodeIndex node_idx, - const logging::Logger& logger) { + [[maybe_unused]] const logging::Logger& logger) { const auto& node = graph_viewer.GetNode(node_idx); const auto& optype = node->OpType(); const auto& domain = node->Domain(); @@ -1442,14 +1418,10 @@ Status MIGraphXExecutionProvider::Compile(const std::vector& // lock to avoid race condition std::lock_guard lock(*(mgx_state->mgx_mu_ptr)); -#ifdef MIGRAPHX_STREAM_SYNC void* rocm_stream; Ort::ThrowOnError(api->KernelContext_GetGPUComputeStream(context, &rocm_stream)); auto prog_outputs = prog.run_async(m, static_cast(rocm_stream)); -#else - auto prog_outputs = prog.eval(m); - HIP_CALL_THROW(hipDeviceSynchronize()); -#endif + // In case of input parameters are reused as output parameter call hipMemcpy auto output_num = prog_outputs.size(); if (prog_output_indices.size() < output_num) { @@ -1478,8 +1450,7 @@ Status MIGraphXExecutionProvider::Compile(const std::vector& void MIGraphXExecutionProvider::RegisterStreamHandlers(IStreamCommandHandleRegistry& stream_handle_registry, AllocatorMap& allocators) const { auto allocator = allocators[GetOrtDeviceByMemType(OrtMemTypeCPU)]; - RegisterRocmStreamHandles(stream_handle_registry, OrtDevice::GPU, allocator, true, stream_, - false /*TODO:external_stream_*/, external_miopen_handle_, external_rocblas_handle_); + RegisterMIGraphXStreamHandles(stream_handle_registry, OrtDevice::GPU, allocator, true, stream_, false /*TODO:external_stream_*/); } OrtDevice MIGraphXExecutionProvider::GetOrtDeviceByMemType(OrtMemType mem_type) const { @@ -1487,7 +1458,6 @@ OrtDevice MIGraphXExecutionProvider::GetOrtDeviceByMemType(OrtMemType mem_type) if (mem_type == OrtMemTypeCPUOutput) return OrtDevice(OrtDevice::CPU, OrtDevice::MemType::HIP_PINNED, 0 /*CPU device id always be 0*/); return default_device_; } -#ifdef MIGRAPHX_STREAM_SYNC Status MIGraphXExecutionProvider::Sync() const { HIP_CALL_THROW(hipStreamSynchronize(static_cast(nullptr))); @@ -1512,5 +1482,4 @@ Status MIGraphXExecutionProvider::OnRunEnd(bool /*sync_stream*/, const onnxrunti return Status::OK(); } -#endif } // namespace onnxruntime diff --git a/onnxruntime/core/providers/migraphx/migraphx_execution_provider.h b/onnxruntime/core/providers/migraphx/migraphx_execution_provider.h index 1977f71b8b1cf..f34ca320d0a5a 100644 --- a/onnxruntime/core/providers/migraphx/migraphx_execution_provider.h +++ b/onnxruntime/core/providers/migraphx/migraphx_execution_provider.h @@ -3,9 +3,6 @@ 
#pragma once -#include -#include - #include "core/framework/arena_extend_strategy.h" #include "core/framework/execution_provider.h" #include "core/platform/ort_mutex.h" @@ -14,8 +11,6 @@ #include #include -// TODO: find a better way to share this -// #include "core/providers/cuda/rocm_stream_handle.h" namespace onnxruntime { @@ -62,13 +57,11 @@ class MIGraphXExecutionProvider : public IExecutionProvider { explicit MIGraphXExecutionProvider(const MIGraphXExecutionProviderInfo& info); ~MIGraphXExecutionProvider(); -#ifdef MIGRAPHX_STREAM_SYNC Status Sync() const override; Status OnRunStart(const onnxruntime::RunOptions& run_options) override; Status OnRunEnd(bool sync_stream, const onnxruntime::RunOptions& run_options) override; -#endif std::vector> GetCapability(const onnxruntime::GraphViewer& graph_viewer, @@ -85,7 +78,13 @@ class MIGraphXExecutionProvider : public IExecutionProvider { OrtDevice GetOrtDeviceByMemType(OrtMemType mem_type) const override; std::vector CreatePreferredAllocators() override; + int GetDeviceId() const override { return info_.device_id; } + ProviderOptions GetProviderOptions() const override { + return MIGraphXExecutionProviderInfo::ToProviderOptions(info_); + } + private: + MIGraphXExecutionProviderInfo info_; bool fp16_enable_ = false; bool int8_enable_ = false; std::string int8_calibration_cache_name_; @@ -98,7 +97,6 @@ class MIGraphXExecutionProvider : public IExecutionProvider { bool load_compiled_model_ = false; std::string load_compiled_path_; bool dump_model_ops_ = false; - int device_id_; migraphx::target t_; OrtMutex mgx_mu_; hipStream_t stream_ = nullptr; @@ -109,8 +107,6 @@ class MIGraphXExecutionProvider : public IExecutionProvider { std::unordered_map map_no_input_shape_; AllocatorPtr allocator_; - miopenHandle_t external_miopen_handle_ = nullptr; - rocblas_handle external_rocblas_handle_ = nullptr; std::unique_ptr metadef_id_generator_; }; diff --git a/onnxruntime/core/providers/migraphx/migraphx_execution_provider_info.h b/onnxruntime/core/providers/migraphx/migraphx_execution_provider_info.h index 8411e3eef096b..68d5d9af98ea4 100644 --- a/onnxruntime/core/providers/migraphx/migraphx_execution_provider_info.h +++ b/onnxruntime/core/providers/migraphx/migraphx_execution_provider_info.h @@ -14,7 +14,7 @@ namespace onnxruntime { // Information needed to construct trt execution providers. 
struct MIGraphXExecutionProviderInfo { std::string target_device; - int device_id{0}; + OrtDevice::DeviceId device_id{0}; bool fp16_enable{false}; bool int8_enable{false}; std::string int8_calibration_table_name{""}; diff --git a/onnxruntime/core/providers/migraphx/migraphx_execution_provider_utils.h b/onnxruntime/core/providers/migraphx/migraphx_execution_provider_utils.h index 071070e92a209..9274b5696185c 100644 --- a/onnxruntime/core/providers/migraphx/migraphx_execution_provider_utils.h +++ b/onnxruntime/core/providers/migraphx/migraphx_execution_provider_utils.h @@ -28,7 +28,7 @@ bool IsGraphInput(const GraphViewer& graph, const std::string& name) { return (std::find(input_names.begin(), input_names.end(), name) != input_names.end()); } -bool IsGraphInitializer(const GraphViewer& graph, const std::string& name, bool check_outer_scope = true) { +bool IsGraphInitializer(const GraphViewer& graph, const std::string& name, [[maybe_unused]] bool check_outer_scope = true) { const ONNX_NAMESPACE::TensorProto* initializer = nullptr; return graph.GetInitializedTensor(name, initializer); } diff --git a/onnxruntime/core/providers/migraphx/migraphx_inc.h b/onnxruntime/core/providers/migraphx/migraphx_inc.h index 96b24051ace76..2b035b20f619f 100644 --- a/onnxruntime/core/providers/migraphx/migraphx_inc.h +++ b/onnxruntime/core/providers/migraphx/migraphx_inc.h @@ -4,5 +4,5 @@ #pragma once #include -#include +#include #include diff --git a/onnxruntime/core/providers/migraphx/migraphx_provider_factory.cc b/onnxruntime/core/providers/migraphx/migraphx_provider_factory.cc index dd24dbdc76d2f..6d199930116e8 100644 --- a/onnxruntime/core/providers/migraphx/migraphx_provider_factory.cc +++ b/onnxruntime/core/providers/migraphx/migraphx_provider_factory.cc @@ -6,7 +6,7 @@ #include "core/providers/migraphx/migraphx_provider_factory.h" #include "migraphx_execution_provider.h" #include "migraphx_provider_factory_creator.h" -#include "hip_allocator.h" +#include "migraphx_allocator.h" #include "gpu_data_transfer.h" #include "core/framework/provider_options.h" @@ -33,10 +33,23 @@ std::unique_ptr MIGraphXProviderFactory::CreateProvider() { return std::make_unique(info_); } +struct ProviderInfo_MIGraphX_Impl final : ProviderInfo_MIGraphX { + std::unique_ptr CreateMIGraphXAllocator(int16_t device_id, const char* name) override { + return std::make_unique(device_id, name); + } + + std::unique_ptr CreateMIGraphXPinnedAllocator(int16_t device_id, const char* name) override { + return std::make_unique(device_id, name); + } + +} g_info; + struct MIGraphX_Provider : Provider { + void* GetInfo() override { return &g_info; } + std::shared_ptr CreateExecutionProviderFactory(int device_id) override { MIGraphXExecutionProviderInfo info; - info.device_id = device_id; + info.device_id = static_cast(device_id); info.target_device = "gpu"; return std::make_shared(info); } @@ -44,7 +57,7 @@ struct MIGraphX_Provider : Provider { std::shared_ptr CreateExecutionProviderFactory(const void* provider_options) override { auto& options = *reinterpret_cast(provider_options); MIGraphXExecutionProviderInfo info; - info.device_id = options.device_id; + info.device_id = static_cast(options.device_id); info.target_device = "gpu"; info.fp16_enable = options.migraphx_fp16_enable; info.int8_enable = options.migraphx_int8_enable; diff --git a/onnxruntime/core/providers/migraphx/migraphx_provider_factory.h b/onnxruntime/core/providers/migraphx/migraphx_provider_factory.h index ac9834e64942a..b257a4318dc0e 100644 --- 
a/onnxruntime/core/providers/migraphx/migraphx_provider_factory.h +++ b/onnxruntime/core/providers/migraphx/migraphx_provider_factory.h @@ -10,4 +10,13 @@ struct IExecutionProviderFactory; struct MIGraphXExecutionProviderInfo; enum class ArenaExtendStrategy : int32_t; struct MIGraphXExecutionProviderExternalAllocatorInfo; + +struct ProviderInfo_MIGraphX { + virtual std::unique_ptr CreateMIGraphXAllocator(int16_t device_id, const char* name) = 0; + virtual std::unique_ptr CreateMIGraphXPinnedAllocator(int16_t device_id, const char* name) = 0; + + protected: + ~ProviderInfo_MIGraphX() = default; // Can only be destroyed through a subclass instance +}; + } // namespace onnxruntime diff --git a/onnxruntime/core/providers/migraphx/migraphx_stream_handle.cc b/onnxruntime/core/providers/migraphx/migraphx_stream_handle.cc new file mode 100644 index 0000000000000..9c5bb4ecf5c97 --- /dev/null +++ b/onnxruntime/core/providers/migraphx/migraphx_stream_handle.cc @@ -0,0 +1,171 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#include +#include "migraphx_stream_handle.h" + +namespace onnxruntime { + +struct MIGraphXNotification : public synchronize::Notification { + MIGraphXNotification(Stream& s) : Notification(s) { + HIP_CALL_THROW(hipEventCreateWithFlags(&event_, hipEventDisableTiming)); + } + + ~MIGraphXNotification() { + if (event_) + HIP_CALL_THROW(hipEventDestroy(event_)); + } + + void Activate() override { + // record event with hipEventBlockingSync so we can support sync on host without busy wait. + HIP_CALL_THROW(hipEventRecord(event_, static_cast(stream_.GetHandle()))); + } + + void wait_on_device(Stream& device_stream) { + ORT_ENFORCE(device_stream.GetDevice().Type() == OrtDevice::GPU, "Unexpected device:", device_stream.GetDevice().ToString()); + // launch a wait command to the migraphx stream + HIP_CALL_THROW(hipStreamWaitEvent(static_cast(device_stream.GetHandle()), event_, 0)); + }; + + void wait_on_host() { + // CUDA_CALL_THROW(cudaStreamSynchronize(stream_)); + HIP_CALL_THROW(hipEventSynchronize(event_)); + } + + hipEvent_t event_; +}; + +MIGraphXStream::MIGraphXStream(hipStream_t stream, + const OrtDevice& device, + AllocatorPtr cpu_allocator, + bool release_cpu_buffer_on_migraphx_stream) + : Stream(stream, device), + cpu_allocator_(cpu_allocator), + release_cpu_buffer_on_migraphx_stream_(release_cpu_buffer_on_migraphx_stream) { +} + +MIGraphXStream::~MIGraphXStream() { + ORT_IGNORE_RETURN_VALUE(CleanUpOnRunEnd()); + if (own_stream_) { + auto* handle = GetHandle(); + if (handle) + HIP_CALL_THROW(hipStreamDestroy(static_cast(handle))); + } +} + +std::unique_ptr MIGraphXStream::CreateNotification(size_t /*num_consumers*/) { + return std::make_unique(*this); +} + +void MIGraphXStream::Flush() { + if (own_stream_) + HIP_CALL_THROW(hipStreamSynchronize(static_cast(GetHandle()))); +} + +void MIGraphXStream::EnqueDeferredCPUBuffer(void* cpu_buffer) { + // stream is per thread, so don't need lock + deferred_cpu_buffers_.push_back(cpu_buffer); +} + +struct CpuBuffersInfo { + // This struct stores the information needed + // to release CPU buffers allocated for GPU kernels. + // It's used to enqueue their release after + // associated GPU kernels in a MIGraphX stream. + + // This is a CPU allocator in MIGraphX EP. + // It must be the one used to allocate the + // following pointers. + AllocatorPtr allocator; + // buffers[i] is the i-th pointer added by + // AddDeferredReleaseCPUPtr for a specific + // MIGraphX stream. 
For example, this fields + // should contain all values in + // deferred_release_buffer_pool_[my_stream] + // when release my_stream's buffers. + std::unique_ptr buffers; + // CPU buffer buffers[i]. + // Number of buffer points in "buffers". + size_t n_buffers; +}; + +static void ReleaseCpuBufferCallback(void* raw_info) { + std::unique_ptr info = std::make_unique(); + info.reset(reinterpret_cast(raw_info)); + for (size_t i = 0; i < info->n_buffers; ++i) { + info->allocator->Free(info->buffers[i]); + } +} + +Status MIGraphXStream::CleanUpOnRunEnd() { + if (deferred_cpu_buffers_.empty()) + return Status::OK(); + // Release the ownership of cpu_buffers_info so that the underlying + // object will keep alive until the end of ReleaseCpuBufferCallback. + if (release_cpu_buffer_on_migraphx_stream_ && cpu_allocator_->Info().alloc_type == OrtArenaAllocator) { + std::unique_ptr cpu_buffers_info = std::make_unique(); + cpu_buffers_info->allocator = cpu_allocator_; + cpu_buffers_info->buffers = std::make_unique(deferred_cpu_buffers_.size()); + for (size_t i = 0; i < deferred_cpu_buffers_.size(); ++i) { + cpu_buffers_info->buffers[i] = deferred_cpu_buffers_.at(i); + } + cpu_buffers_info->n_buffers = deferred_cpu_buffers_.size(); + HIP_RETURN_IF_ERROR(hipLaunchHostFunc(static_cast(GetHandle()), ReleaseCpuBufferCallback, cpu_buffers_info.release())); + } else { + HIP_RETURN_IF_ERROR(hipStreamSynchronize(static_cast(GetHandle()))); + for (auto* buffer : deferred_cpu_buffers_) { + cpu_allocator_->Free(buffer); + } + } + + deferred_cpu_buffers_.clear(); + return Status::OK(); +} + +void* MIGraphXStream::GetResource(int version, int id) const { + ORT_ENFORCE(version <= ORT_ROCM_RESOUCE_VERSION, "resource version unsupported!"); + void* resource{}; + switch (id) { + case RocmResource::hip_stream_t: + return reinterpret_cast(GetHandle()); + default: + break; + } + return resource; +} + +// CPU Stream command handles +void WaitMIGraphXNotificationOnDevice(Stream& stream, synchronize::Notification& notification) { + static_cast(¬ification)->wait_on_device(stream); +} + +void WaitMIGraphXNotificationOnHost(Stream& /*stream*/, synchronize::Notification& notification) { + static_cast(¬ification)->wait_on_host(); +} + +void RegisterMIGraphXStreamHandles(IStreamCommandHandleRegistry& stream_handle_registry, + const OrtDevice::DeviceType device_type, + AllocatorPtr cpu_allocator, + bool release_cpu_buffer_on_migraphx_stream, + hipStream_t external_stream, + bool use_existing_stream) { + // wait migraphx notification on migraphx ep + stream_handle_registry.RegisterWaitFn(device_type, device_type, WaitMIGraphXNotificationOnDevice); + // wait migraphx notification on cpu ep + stream_handle_registry.RegisterWaitFn(device_type, OrtDevice::CPU, WaitMIGraphXNotificationOnHost); + if (!use_existing_stream) + stream_handle_registry.RegisterCreateStreamFn(device_type, [cpu_allocator, release_cpu_buffer_on_migraphx_stream](const OrtDevice& device) { + HIP_CALL_THROW(hipSetDevice(device.Id())); + hipStream_t stream = nullptr; + HIP_CALL_THROW(hipStreamCreateWithFlags(&stream, hipStreamNonBlocking)); + return std::make_unique(stream, device, cpu_allocator, release_cpu_buffer_on_migraphx_stream); + }); + else + stream_handle_registry.RegisterCreateStreamFn(device_type, [cpu_allocator, + release_cpu_buffer_on_migraphx_stream, + external_stream](const OrtDevice& device) { + return std::make_unique(external_stream, device, cpu_allocator, release_cpu_buffer_on_migraphx_stream); + }); +} + +} // namespace onnxruntime diff --git 
a/onnxruntime/core/providers/migraphx/migraphx_stream_handle.h b/onnxruntime/core/providers/migraphx/migraphx_stream_handle.h new file mode 100644 index 0000000000000..03a7c1607e3ad --- /dev/null +++ b/onnxruntime/core/providers/migraphx/migraphx_stream_handle.h @@ -0,0 +1,48 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#pragma once +#include "core/framework/stream_handles.h" +#include "migraphx_inc.h" +#include "migraphx_call.h" + +#define HIP_RETURN_IF_ERROR(expr) ORT_RETURN_IF_ERROR(HIP_CALL(expr)) + +namespace onnxruntime { +void WaitMIGraphXNotificationOnDevice(Stream& stream, synchronize::Notification& notification); + +struct MIGraphXStream : Stream { + MIGraphXStream(hipStream_t stream, + const OrtDevice& device, + AllocatorPtr cpu_allocator, + bool release_cpu_buffer_on_migraphx_stream); + + ~MIGraphXStream(); + + std::unique_ptr CreateNotification(size_t /*num_consumers*/) override; + + void Flush() override; + + Status CleanUpOnRunEnd() override; + + void EnqueDeferredCPUBuffer(void* cpu_buffer); + + bool own_stream_{true}; + + virtual void* GetResource(int version, int id) const; + + virtual WaitNotificationFn GetWaitNotificationFn() const { return WaitMIGraphXNotificationOnDevice; } + + private: + std::vector deferred_cpu_buffers_; + AllocatorPtr cpu_allocator_; + bool release_cpu_buffer_on_migraphx_stream_{true}; +}; + +void RegisterMIGraphXStreamHandles(IStreamCommandHandleRegistry& stream_handle_registry, + const OrtDevice::DeviceType device_type, + AllocatorPtr cpu_allocator, + bool release_cpu_buffer_on_migraphx_stream, + hipStream_t external_stream, + bool use_existing_stream); +} // namespace onnxruntime diff --git a/onnxruntime/core/providers/shared_library/provider_api.h b/onnxruntime/core/providers/shared_library/provider_api.h index 590bddabdba54..2f54a04e15304 100644 --- a/onnxruntime/core/providers/shared_library/provider_api.h +++ b/onnxruntime/core/providers/shared_library/provider_api.h @@ -108,6 +108,7 @@ struct NodeProto; struct SparseTensorProto; struct StringStringEntryProto; struct StringStringEntryProtos; // RepeatedPtrField +struct OperatorSetIdProto; struct TensorProto; struct TensorProtos; // RepeatedPtrField struct TensorShapeProto_Dimension; @@ -120,6 +121,7 @@ struct TypeProto_Sequence; struct TypeProto; struct ValueInfoProto; struct ValueInfoProtos; // RepeatedPtrField +struct FunctionProto; struct InferenceContext; class GraphInferencer; using InferenceFunction = std::function; @@ -146,6 +148,7 @@ struct ConfigOptions; struct DataTransferManager; struct IndexedSubGraph; struct IndexedSubGraph_MetaDef; +enum class IndexedSubGraph_SourceOfSchema : uint8_t; struct KernelCreateInfo; struct KernelDef; struct KernelDefBuilder; @@ -279,6 +282,9 @@ std::unique_ptr CreateCPUAllocator(const OrtMemoryInfo& memory_info) std::unique_ptr CreateCUDAAllocator(int16_t device_id, const char* name); std::unique_ptr CreateCUDAPinnedAllocator(const char* name); +std::unique_ptr CreateMIGraphXAllocator(int16_t device_id, const char* name); +std::unique_ptr CreateMIGraphXPinnedAllocator(int16_t device_id, const char* name); + std::unique_ptr CreateROCMAllocator(int16_t device_id, const char* name); std::unique_ptr CreateROCMPinnedAllocator(const char* name); diff --git a/onnxruntime/core/providers/shared_library/provider_bridge_provider.cc b/onnxruntime/core/providers/shared_library/provider_bridge_provider.cc index 6e6a80f097c12..7fb9fd3c8cfd5 100644 --- 
a/onnxruntime/core/providers/shared_library/provider_bridge_provider.cc +++ b/onnxruntime/core/providers/shared_library/provider_bridge_provider.cc @@ -353,16 +353,12 @@ std::unique_ptr CreateGPUDataTransfer() { #endif #ifdef USE_MIGRAPHX -std::unique_ptr CreateROCMAllocator(int16_t device_id, const char* name) { - return g_host->CreateROCMAllocator(device_id, name); +std::unique_ptr CreateMIGraphXAllocator(int16_t device_id, const char* name) { + return g_host->CreateMIGraphXAllocator(device_id, name); } -std::unique_ptr CreateROCMPinnedAllocator(const char* name) { - return g_host->CreateROCMPinnedAllocator(name); -} - -std::unique_ptr CreateGPUDataTransfer() { - return g_host->CreateGPUDataTransfer(); +std::unique_ptr CreateMIGraphXPinnedAllocator(int16_t device_id, const char* name) { + return g_host->CreateMIGraphXPinnedAllocator(device_id, name); } #endif diff --git a/onnxruntime/core/providers/shared_library/provider_interfaces.h b/onnxruntime/core/providers/shared_library/provider_interfaces.h index bc6dac1a2f27f..382b3ac932520 100644 --- a/onnxruntime/core/providers/shared_library/provider_interfaces.h +++ b/onnxruntime/core/providers/shared_library/provider_interfaces.h @@ -179,6 +179,11 @@ struct ProviderHost { virtual void CudaCall_true(int retCode, const char* exprString, const char* libName, int successCode, const char* msg, const char* file, const int line) = 0; #endif +#ifdef USE_MIGRAPHX + virtual std::unique_ptr CreateMIGraphXAllocator(int16_t device_id, const char* name) = 0; + virtual std::unique_ptr CreateMIGraphXPinnedAllocator(int16_t device_id, const char* name) = 0; +#endif + #ifdef USE_ROCM virtual std::unique_ptr CreateROCMAllocator(int16_t device_id, const char* name) = 0; virtual std::unique_ptr CreateROCMPinnedAllocator(const char* name) = 0; @@ -299,6 +304,11 @@ struct ProviderHost { virtual int StringStringEntryProtos__size(ONNX_NAMESPACE::StringStringEntryProtos* p) = 0; virtual ONNX_NAMESPACE::StringStringEntryProto& StringStringEntryProtos__at(ONNX_NAMESPACE::StringStringEntryProtos* p, int index) = 0; + // OperatorSetIdProto + virtual std::string* OperatorSetIdProto__mutable_domain(ONNX_NAMESPACE::OperatorSetIdProto* p) = 0; + virtual void OperatorSetIdProto__set_version(ONNX_NAMESPACE::OperatorSetIdProto* p, int64_t version) = 0; + virtual int64_t OperatorSetIdProto__version(const ONNX_NAMESPACE::OperatorSetIdProto* p) = 0; + #if !defined(DISABLE_OPTIONAL_TYPE) // TypeProto_Optional virtual const ONNX_NAMESPACE::TypeProto& TypeProto_Optional__elem_type(const ONNX_NAMESPACE::TypeProto_Optional* p) = 0; @@ -415,6 +425,11 @@ struct ProviderHost { virtual void ModelProto__set_ir_version(ONNX_NAMESPACE::ModelProto* p, int64_t value) = 0; virtual ONNX_NAMESPACE::StringStringEntryProtos* ModelProto__mutable_metadata_props(ONNX_NAMESPACE::ModelProto* p) = 0; + virtual const ONNX_NAMESPACE::OperatorSetIdProto& ModelProto__opset_import(const ONNX_NAMESPACE::ModelProto* p, int index) = 0; + virtual ONNX_NAMESPACE::OperatorSetIdProto* ModelProto__mutable_opset_import(ONNX_NAMESPACE::ModelProto* p, int index) = 0; + virtual int ModelProto__opset_import_size(const ONNX_NAMESPACE::ModelProto* p) = 0; + virtual ONNX_NAMESPACE::OperatorSetIdProto* ModelProto__add_opset_import(ONNX_NAMESPACE::ModelProto* p) = 0; + // NodeProto virtual std::unique_ptr NodeProto__construct() = 0; virtual void NodeProto__operator_delete(ONNX_NAMESPACE::NodeProto* p) = 0; @@ -422,6 +437,7 @@ struct ProviderHost { virtual int NodeProto__attribute_size(ONNX_NAMESPACE::NodeProto* p) = 0; 
virtual const ONNX_NAMESPACE::AttributeProto& NodeProto__attribute(const ONNX_NAMESPACE::NodeProto* p, int index) const = 0; virtual ONNX_NAMESPACE::AttributeProto* NodeProto__mutable_attribute(ONNX_NAMESPACE::NodeProto* p, int index) = 0; + virtual ONNX_NAMESPACE::AttributeProto* NodeProto__add_attribute(ONNX_NAMESPACE::NodeProto* p) = 0; // TensorProto virtual std::unique_ptr TensorProto__construct() = 0; @@ -490,6 +506,64 @@ struct ProviderHost { virtual const ONNX_NAMESPACE::ValueInfoProto& ValueInfoProtos__operator_array(const ONNX_NAMESPACE::ValueInfoProtos* p, int index) = 0; + // FunctionProto + virtual std::unique_ptr FunctionProto__construct() = 0; + virtual void FunctionProto__operator_delete(ONNX_NAMESPACE::FunctionProto* p) = 0; + + virtual bool FunctionProto__SerializeToString(const ONNX_NAMESPACE::FunctionProto* p, std::string& string) = 0; + virtual bool FunctionProto__SerializeToOstream(const ONNX_NAMESPACE::FunctionProto* p, std::ostream& output) = 0; + virtual bool FunctionProto__ParseFromString(ONNX_NAMESPACE::FunctionProto* p, const std::string& data) = 0; + virtual std::string FunctionProto__SerializeAsString(const ONNX_NAMESPACE::FunctionProto* p) = 0; + + virtual bool FunctionProto__has_name(const ONNX_NAMESPACE::FunctionProto* p) = 0; + virtual const std::string& FunctionProto__name(const ONNX_NAMESPACE::FunctionProto* p) const = 0; + virtual void FunctionProto__set_name(ONNX_NAMESPACE::FunctionProto* p, const ::std::string& name) = 0; + + virtual bool FunctionProto__has_doc_string(const ONNX_NAMESPACE::FunctionProto* p) = 0; + virtual const std::string& FunctionProto__doc_string(const ONNX_NAMESPACE::FunctionProto* p) const = 0; + virtual void FunctionProto__set_doc_string(ONNX_NAMESPACE::FunctionProto* p, const ::std::string& doc_string) = 0; + + virtual bool FunctionProto__has_domain(const ONNX_NAMESPACE::FunctionProto* p) = 0; + virtual const std::string& FunctionProto__domain(const ONNX_NAMESPACE::FunctionProto* p) const = 0; + virtual void FunctionProto__set_domain(ONNX_NAMESPACE::FunctionProto* p, const ::std::string& domain) = 0; + + virtual const std::string& FunctionProto__input(const ONNX_NAMESPACE::FunctionProto* p, int index) = 0; + virtual std::string* FunctionProto__mutable_input(ONNX_NAMESPACE::FunctionProto* p, int index) = 0; + virtual int FunctionProto__input_size(const ONNX_NAMESPACE::FunctionProto* p) = 0; + virtual void FunctionProto__add_input(ONNX_NAMESPACE::FunctionProto* p, const std::string& value) = 0; + + virtual const std::string& FunctionProto__output(const ONNX_NAMESPACE::FunctionProto* p, int index) = 0; + virtual std::string* FunctionProto__mutable_output(ONNX_NAMESPACE::FunctionProto* p, int index) = 0; + virtual int FunctionProto__output_size(const ONNX_NAMESPACE::FunctionProto* p) = 0; + virtual void FunctionProto__add_output(ONNX_NAMESPACE::FunctionProto* p, const std::string& value) = 0; + + virtual const std::string& FunctionProto__attribute(const ONNX_NAMESPACE::FunctionProto* p, int index) = 0; + virtual std::string* FunctionProto__mutable_attribute(ONNX_NAMESPACE::FunctionProto* p, int index) = 0; + virtual int FunctionProto__attribute_size(const ONNX_NAMESPACE::FunctionProto* p) = 0; + virtual void FunctionProto__add_attribute(ONNX_NAMESPACE::FunctionProto* p, const std::string& value) = 0; + + virtual const ONNX_NAMESPACE::AttributeProto& FunctionProto__attribute_proto(const ONNX_NAMESPACE::FunctionProto* p, int index) = 0; + virtual ONNX_NAMESPACE::AttributeProto* 
FunctionProto__mutable_attribute_proto(ONNX_NAMESPACE::FunctionProto* p, int index) = 0; + virtual int FunctionProto__attribute_proto_size(const ONNX_NAMESPACE::FunctionProto* p) = 0; + virtual ONNX_NAMESPACE::AttributeProto* FunctionProto__add_attribute_proto(ONNX_NAMESPACE::FunctionProto* p) = 0; + + virtual const ONNX_NAMESPACE::NodeProto& FunctionProto__node(const ONNX_NAMESPACE::FunctionProto* p, int index) = 0; + virtual ONNX_NAMESPACE::NodeProto* FunctionProto__mutable_node(ONNX_NAMESPACE::FunctionProto* p, int index) = 0; + virtual int FunctionProto__node_size(const ONNX_NAMESPACE::FunctionProto* p) = 0; + virtual ONNX_NAMESPACE::NodeProto* FunctionProto__add_node(ONNX_NAMESPACE::FunctionProto* p) = 0; + + virtual const ONNX_NAMESPACE::ValueInfoProto& FunctionProto__value_info(const ONNX_NAMESPACE::FunctionProto* p, int index) = 0; + virtual ONNX_NAMESPACE::ValueInfoProtos* FunctionProto__mutable_value_info(ONNX_NAMESPACE::FunctionProto* p) = 0; + virtual ONNX_NAMESPACE::ValueInfoProto* FunctionProto__mutable_value_info(ONNX_NAMESPACE::FunctionProto* p, int index) = 0; + virtual int FunctionProto__value_info_size(const ONNX_NAMESPACE::FunctionProto* p) = 0; + virtual ONNX_NAMESPACE::ValueInfoProto* FunctionProto__add_value_info(ONNX_NAMESPACE::FunctionProto* p) = 0; + + virtual const ONNX_NAMESPACE::StringStringEntryProto& FunctionProto__metadata_props(const ONNX_NAMESPACE::FunctionProto* p, int index) = 0; + virtual ONNX_NAMESPACE::StringStringEntryProtos* FunctionProto__mutable_metadata_props(ONNX_NAMESPACE::FunctionProto* p) = 0; + virtual ONNX_NAMESPACE::StringStringEntryProto* FunctionProto__mutable_metadata_props(ONNX_NAMESPACE::FunctionProto* p, int index) = 0; + virtual int FunctionProto__metadata_props_size(const ONNX_NAMESPACE::FunctionProto* p) = 0; + virtual ONNX_NAMESPACE::StringStringEntryProto* FunctionProto__add_metadata_props(ONNX_NAMESPACE::FunctionProto* p) = 0; + virtual void RegisterSchema(const std::string& domain, const OrtCustomOp* op, int type) = 0; // ConfigOptions @@ -541,6 +615,9 @@ struct ProviderHost { virtual void IndexedSubGraph__SetMetaDef(IndexedSubGraph* p, std::unique_ptr&& meta_def_) = 0; virtual const IndexedSubGraph_MetaDef* IndexedSubGraph__GetMetaDef(const IndexedSubGraph* p) = 0; + virtual void IndexedSubGraph__SetSchemaSource(IndexedSubGraph* p, IndexedSubGraph_SourceOfSchema schema_source) = 0; + virtual IndexedSubGraph_SourceOfSchema IndexedSubGraph__GetSchemaSource(const IndexedSubGraph* p) = 0; + // KernelDef virtual void KernelDef__operator_delete(KernelDef* p) = 0; virtual int KernelDef__ExecQueueId(const KernelDef* p) = 0; diff --git a/onnxruntime/core/providers/shared_library/provider_wrappedtypes.h b/onnxruntime/core/providers/shared_library/provider_wrappedtypes.h index fb3b274d9b80b..de6c1da1d6430 100644 --- a/onnxruntime/core/providers/shared_library/provider_wrappedtypes.h +++ b/onnxruntime/core/providers/shared_library/provider_wrappedtypes.h @@ -80,6 +80,15 @@ struct StringStringEntryProtos final { PROVIDER_DISALLOW_ALL(StringStringEntryProtos) }; + +struct OperatorSetIdProto final { + std::string* mutable_domain() { return g_host->OperatorSetIdProto__mutable_domain(this); } + void set_version(int64_t version) { return g_host->OperatorSetIdProto__set_version(this, version); } + int64_t version() { return g_host->OperatorSetIdProto__version(this); } + + PROVIDER_DISALLOW_ALL(OperatorSetIdProto) +}; + struct AttributeProto final { static std::unique_ptr Create() { return g_host->AttributeProto__construct(); } void 
operator=(const AttributeProto& v) { g_host->AttributeProto__operator_assign(this, v); } @@ -178,6 +187,11 @@ struct ModelProto final { void set_ir_version(int64_t value) { return g_host->ModelProto__set_ir_version(this, value); } + const OperatorSetIdProto& opset_import(int index) const { return g_host->ModelProto__opset_import(this, index); } + OperatorSetIdProto* mutable_opset_import(int index) { return g_host->ModelProto__mutable_opset_import(this, index); } + int opset_import_size() const { return g_host->ModelProto__opset_import_size(this); } + OperatorSetIdProto* add_opset_import() { return g_host->ModelProto__add_opset_import(this); } + ModelProto() = delete; ModelProto(const ModelProto&) = delete; void operator=(const ModelProto&) = delete; @@ -190,6 +204,7 @@ struct NodeProto final { int attribute_size() { return g_host->NodeProto__attribute_size(this); } const AttributeProto& attribute(int index) const { return g_host->NodeProto__attribute(this, index); } AttributeProto* mutable_attribute(int index) { return g_host->NodeProto__mutable_attribute(this, index); } + AttributeProto* add_attribute() { return g_host->NodeProto__add_attribute(this); } NodeProto() = delete; NodeProto(const NodeProto&) = delete; @@ -372,6 +387,69 @@ struct ValueInfoProtos final { PROVIDER_DISALLOW_ALL(ValueInfoProtos) }; + +struct FunctionProto final { + static std::unique_ptr Create() { return g_host->FunctionProto__construct(); } + static void operator delete(void* p) { g_host->FunctionProto__operator_delete(reinterpret_cast(p)); } + + bool SerializeToString(std::string& string) const { return g_host->FunctionProto__SerializeToString(this, string); } + bool SerializeToOstream(std::ostream& output) const { return g_host->FunctionProto__SerializeToOstream(this, output); } + bool ParseFromString(const std::string& data) { return g_host->FunctionProto__ParseFromString(this, data); } + std::string SerializeAsString() const { return g_host->FunctionProto__SerializeAsString(this); } + + bool has_name() const { return g_host->FunctionProto__has_name(this); } + const std::string& name() const { return g_host->FunctionProto__name(this); } + void set_name(const std::string& name) { g_host->FunctionProto__set_name(this, name); } + + bool has_doc_string() const { return g_host->FunctionProto__has_doc_string(this); } + const std::string& doc_string() const { return g_host->FunctionProto__doc_string(this); } + void set_doc_string(const std::string& doc_string) { g_host->FunctionProto__set_doc_string(this, doc_string); } + + bool has_domain() const { return g_host->FunctionProto__has_domain(this); } + const std::string& domain() const { return g_host->FunctionProto__domain(this); } + void set_domain(const std::string& domain) { g_host->FunctionProto__set_domain(this, domain); } + + const std::string& input(int index) const { return g_host->FunctionProto__input(this, index); } + std::string* mutable_input(int index) { return g_host->FunctionProto__mutable_input(this, index); } + int input_size() const { return g_host->FunctionProto__input_size(this); } + void add_input(const std::string& value) { g_host->FunctionProto__add_input(this, value); } + + const std::string& output(int index) const { return g_host->FunctionProto__output(this, index); } + std::string* mutable_output(int index) { return g_host->FunctionProto__mutable_output(this, index); } + int output_size() const { return g_host->FunctionProto__output_size(this); } + void add_output(const std::string& value) { g_host->FunctionProto__add_output(this, value); } + 
+ const std::string& attribute(int index) const { return g_host->FunctionProto__attribute(this, index); } + std::string* mutable_attribute(int index) { return g_host->FunctionProto__mutable_attribute(this, index); } + int attribute_size() const { return g_host->FunctionProto__attribute_size(this); } + void add_attribute(const std::string& value) { g_host->FunctionProto__add_attribute(this, value); } + + const AttributeProto& attribute_proto(int index) const { return g_host->FunctionProto__attribute_proto(this, index); } + AttributeProto* mutable_attribute_proto(int index) { return g_host->FunctionProto__mutable_attribute_proto(this, index); } + int attribute_proto_size() const { return g_host->FunctionProto__attribute_proto_size(this); } + AttributeProto* add_attribute_proto() { return g_host->FunctionProto__add_attribute_proto(this); } + + const NodeProto& node(int index) const { return g_host->FunctionProto__node(this, index); } + NodeProto* mutable_node(int index) { return g_host->FunctionProto__mutable_node(this, index); } + int node_size() const { return g_host->FunctionProto__node_size(this); } + NodeProto* add_node() { return g_host->FunctionProto__add_node(this); } + + const ValueInfoProto& value_info(int index) const { return g_host->FunctionProto__value_info(this, index); } + ValueInfoProtos* mutable_value_info() { return g_host->FunctionProto__mutable_value_info(this); } + ValueInfoProto* mutable_value_info(int index) { return g_host->FunctionProto__mutable_value_info(this, index); } + int value_info_size() const { return g_host->FunctionProto__value_info_size(this); } + ValueInfoProto* add_value_info() { return g_host->FunctionProto__add_value_info(this); } + + const StringStringEntryProto& metadata_props(int index) const { return g_host->FunctionProto__metadata_props(this, index); } + StringStringEntryProtos* mutable_metadata_props() { return g_host->FunctionProto__mutable_metadata_props(this); } + StringStringEntryProto* mutable_metadata_props(int index) { return g_host->FunctionProto__mutable_metadata_props(this, index); } + int metadata_props_size() const { return g_host->FunctionProto__metadata_props_size(this); } + StringStringEntryProto* add_metadata_props() { return g_host->FunctionProto__add_metadata_props(this); } + + FunctionProto() = delete; + FunctionProto(const FunctionProto&) = delete; + void operator=(const FunctionProto&) = delete; +}; } // namespace ONNX_NAMESPACE namespace onnxruntime { @@ -449,6 +527,12 @@ struct IndexedSubGraph_MetaDef final { void operator=(const IndexedSubGraph_MetaDef&) = delete; }; +enum class IndexedSubGraph_SourceOfSchema : uint8_t { + CREATE, + REUSE_OR_CREATE, + EXISTING, +}; + struct IndexedSubGraph final { static std::unique_ptr Create() { return g_host->IndexedSubGraph__construct(); } static void operator delete(void* p) { g_host->IndexedSubGraph__operator_delete(reinterpret_cast(p)); } @@ -458,6 +542,9 @@ struct IndexedSubGraph final { void SetMetaDef(std::unique_ptr&& meta_def_) { return g_host->IndexedSubGraph__SetMetaDef(this, std::move(*reinterpret_cast*>(&meta_def_))); } const IndexedSubGraph_MetaDef* GetMetaDef() const { return reinterpret_cast(g_host->IndexedSubGraph__GetMetaDef(this)); } + void SetSchemaSource(IndexedSubGraph_SourceOfSchema schema_source) { return g_host->IndexedSubGraph__SetSchemaSource(this, schema_source); } + IndexedSubGraph_SourceOfSchema GetSchemaSource() const { return g_host->IndexedSubGraph__GetSchemaSource(this); } + IndexedSubGraph() = delete; IndexedSubGraph(const IndexedSubGraph&) = delete; 
void operator=(const IndexedSubGraph&) = delete; diff --git a/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.cc b/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.cc index 8a601c156bd0a..67cbc8f5d6f13 100644 --- a/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.cc +++ b/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.cc @@ -70,7 +70,14 @@ bool SetDynamicRange(nvinfer1::INetworkDefinition& network, std::unordered_mapgetName(); auto dynamic_range_iter = dynamic_range_map.find(tensor_name); if (dynamic_range_iter != dynamic_range_map.end()) { +#if defined(_MSC_VER) +#pragma warning(push) +#pragma warning(disable : 4996) +#endif if (!network.getInput(i)->setDynamicRange(-dynamic_range_iter->second, dynamic_range_iter->second)) { +#if defined(_MSC_VER) +#pragma warning(pop) +#endif LOGS_DEFAULT(ERROR) << "Failed to set dynamic range for network input " << tensor_name; return false; } @@ -84,7 +91,14 @@ bool SetDynamicRange(nvinfer1::INetworkDefinition& network, std::unordered_mapgetOutput(j)->getName(); auto dynamic_range_iter = dynamic_range_map.find(tensor_name); if (dynamic_range_iter != dynamic_range_map.end()) { +#if defined(_MSC_VER) +#pragma warning(push) +#pragma warning(disable : 4996) +#endif if (!trt_layer->getOutput(j)->setDynamicRange(-dynamic_range_iter->second, dynamic_range_iter->second)) { +#if defined(_MSC_VER) +#pragma warning(pop) +#endif LOGS_DEFAULT(ERROR) << "Failed to set dynamic range for tensor " << tensor_name; return false; } @@ -122,7 +136,14 @@ bool SetDynamicRange(nvinfer1::INetworkDefinition& network, std::unordered_mapgetOutput(j)->setDynamicRange(static_cast(-max_weight), static_cast(max_weight))) { +#if defined(_MSC_VER) +#pragma warning(pop) +#endif LOGS_DEFAULT(ERROR) << "Failed to set dynamic range for layer " << const_layer_name; return false; } @@ -2232,7 +2253,14 @@ SubGraphCollection_t TensorrtExecutionProvider::GetSupportedList(SubGraphCollect auto trt_network = std::unique_ptr(trt_builder->createNetworkV2(network_flags)); auto trt_parser = tensorrt_ptr::unique_pointer(nvonnxparser::createParser(*trt_network, trt_logger)); +#if defined(_MSC_VER) +#pragma warning(push) +#pragma warning(disable : 4996) +#endif trt_parser->supportsModel(string_buf.data(), string_buf.size(), parser_nodes_list, model_path_); +#if defined(_MSC_VER) +#pragma warning(pop) +#endif SubGraphCollection_t next_nodes_list; const std::vector& subgraph_node_index = graph_viewer->GetNodesInTopologicalOrder(1 /*priority-based topological sort*/); @@ -3074,7 +3102,14 @@ Status TensorrtExecutionProvider::CreateNodeComputeInfoFromGraph(const GraphView } else { // Set INT8 per tensor dynamic range if (int8_enable_ && trt_builder->platformHasFastInt8() && int8_calibration_cache_available_) { +#if defined(_MSC_VER) +#pragma warning(push) +#pragma warning(disable : 4996) +#endif trt_config->setInt8Calibrator(nullptr); +#if defined(_MSC_VER) +#pragma warning(pop) +#endif if (!SetDynamicRange(*trt_network, dynamic_range_map)) { return ORT_MAKE_STATUS(ONNXRUNTIME, EP_FAIL, "TensorRT EP could not set INT8 dynamic range for fused node: " + fused_node.Name()); @@ -3193,7 +3228,14 @@ Status TensorrtExecutionProvider::CreateNodeComputeInfoFromGraph(const GraphView // Note: Creating an execution context from an engine is thread safe per TRT doc // https://docs.nvidia.com/deeplearning/tensorrt/developer-guide/index.html#threading if (context_memory_sharing_enable_) { +#if defined(_MSC_VER) +#pragma warning(push) +#pragma warning(disable : 
4996) +#endif size_t mem_size = trt_engine->getDeviceMemorySize(); +#if defined(_MSC_VER) +#pragma warning(pop) +#endif if (mem_size > max_ctx_mem_size_) { max_ctx_mem_size_ = mem_size; } @@ -3466,7 +3508,14 @@ Status TensorrtExecutionProvider::CreateNodeComputeInfoFromGraph(const GraphView // Set INT8 Per Tensor Dynamic range if (trt_state->int8_enable && trt_builder->platformHasFastInt8() && trt_state->int8_calibration_cache_available) { +#if defined(_MSC_VER) +#pragma warning(push) +#pragma warning(disable : 4996) +#endif trt_config->setInt8Calibrator(nullptr); +#if defined(_MSC_VER) +#pragma warning(pop) +#endif if (!SetDynamicRange(*trt_state->network->get(), trt_state->dynamic_range_map)) { return ORT_MAKE_STATUS(ONNXRUNTIME, EP_FAIL, "TensorRT EP failed to set INT8 dynamic range."); } @@ -3734,7 +3783,14 @@ Status TensorrtExecutionProvider::CreateNodeComputeInfoFromGraph(const GraphView // Set execution context memory if (trt_state->context_memory_sharing_enable) { +#if defined(_MSC_VER) +#pragma warning(push) +#pragma warning(disable : 4996) +#endif size_t mem_size = trt_engine->getDeviceMemorySize(); +#if defined(_MSC_VER) +#pragma warning(pop) +#endif if (mem_size > *max_context_mem_size_ptr) { *max_context_mem_size_ptr = mem_size; } @@ -3865,7 +3921,14 @@ Status TensorrtExecutionProvider::CreateNodeComputeInfoFromPrecompiledEngine(con // Note: Creating an execution context from an engine is thread safe per TRT doc // https://docs.nvidia.com/deeplearning/tensorrt/developer-guide/index.html#threading if (context_memory_sharing_enable_) { +#if defined(_MSC_VER) +#pragma warning(push) +#pragma warning(disable : 4996) +#endif size_t mem_size = trt_engine->getDeviceMemorySize(); +#if defined(_MSC_VER) +#pragma warning(pop) +#endif if (mem_size > max_ctx_mem_size_) { max_ctx_mem_size_ = mem_size; } @@ -4038,7 +4101,14 @@ Status TensorrtExecutionProvider::CreateNodeComputeInfoFromPrecompiledEngine(con // Set execution context memory if (trt_state->context_memory_sharing_enable) { +#if defined(_MSC_VER) +#pragma warning(push) +#pragma warning(disable : 4996) +#endif size_t mem_size = trt_engine->getDeviceMemorySize(); +#if defined(_MSC_VER) +#pragma warning(pop) +#endif if (mem_size > *max_context_mem_size_ptr) { *max_context_mem_size_ptr = mem_size; } diff --git a/onnxruntime/core/providers/vitisai/imp/ep_context_utils.cc b/onnxruntime/core/providers/vitisai/imp/ep_context_utils.cc new file mode 100644 index 0000000000000..ab31aa313cf6d --- /dev/null +++ b/onnxruntime/core/providers/vitisai/imp/ep_context_utils.cc @@ -0,0 +1,682 @@ +// Standard headers/libs. +#include +#include +#include +#include + +// 3rd-party headers/libs. 
+#include + +#include "ep_context_utils.h" + +namespace onnxruntime { + +constexpr const char* kVitisAI = "vitisai"; + +std::unique_ptr ConvertIndexedSubGraphToFunctionProto( + const IndexedSubGraph& sub_graph, const Graph& parent_graph) { + auto p_func_proto = ONNX_NAMESPACE::FunctionProto::Create(); + auto* p_meta_def = const_cast(sub_graph.GetMetaDef()); + if (p_meta_def) { + p_func_proto->set_name(p_meta_def->name()); + p_func_proto->set_domain(p_meta_def->domain()); + for (const auto& input : p_meta_def->inputs()) { + p_func_proto->add_input(input); + } + auto* p_metadata_props_0 = p_func_proto->add_metadata_props(); + *(p_metadata_props_0->mutable_key()) = "meta_def_inputs_size"; + *(p_metadata_props_0->mutable_value()) = std::to_string(p_meta_def->inputs().size()); + for (const auto& output : p_meta_def->outputs()) { + p_func_proto->add_output(output); + } + // XXX: SerDes with different fields. + for (const auto& initializer : p_meta_def->constant_initializers()) { + p_func_proto->add_input(initializer); + } + // XXX: SerDes with different numbers of fields. + for (const auto& attr_pair : p_meta_def->attributes()) { + p_func_proto->add_attribute(attr_pair.first); + auto* p_attr_proto = p_func_proto->add_attribute_proto(); + *p_attr_proto = attr_pair.second; + } + p_func_proto->set_doc_string(p_meta_def->doc_string()); + // "since_version" + auto* p_metadata_props_1 = p_func_proto->add_metadata_props(); + *(p_metadata_props_1->mutable_key()) = "meta_def_since_version"; + *(p_metadata_props_1->mutable_value()) = std::to_string(p_meta_def->since_version()); + // "status" + auto* p_metadata_props_2 = p_func_proto->add_metadata_props(); + *(p_metadata_props_2->mutable_key()) = "meta_def_status"; + *(p_metadata_props_2->mutable_value()) = + std::to_string(static_cast(p_meta_def->status())); + // TODO: `MetaDef::type_and_shape_inference_function`. + } + auto p_parent_graph_proto = parent_graph.ToGraphProto(); + for (auto node_index : const_cast(sub_graph).Nodes()) { + auto* p_node_proto = p_parent_graph_proto->mutable_node(static_cast(node_index)); + auto* p_attr_proto = p_node_proto->add_attribute(); + p_attr_proto->set_name("parent_graph_node_index"); + p_attr_proto->set_type(ONNX_NAMESPACE::AttributeProto::INT); + p_attr_proto->set_i(node_index); + *(p_func_proto->add_node()) = *p_node_proto; + } +#if 0 + // Alternative. + for (const auto node_index : sub_graph.Nodes()) { + const auto* p_node = parent_graph.GetNode(node_index); + auto p_node_proto = ONNX_NAMESPACE::NodeProto::Create(); + // XXX + p_node->ToProto(*p_node_proto, true); + auto* p_attr_proto = p_node_proto->add_attribute(); + p_attr_proto->set_name("parent_graph_node_index"); + p_attr_proto->set_type(ONNX_NAMESPACE::AttributeProto::INT); + p_attr_proto->set_i(node_index); + *(p_func_proto.add_node()) = *p_node_proto; + } +#endif + auto* p_metadata_props_3 = p_func_proto->add_metadata_props(); + *(p_metadata_props_3->mutable_key()) = "schema_source"; + *(p_metadata_props_3->mutable_value()) = + std::to_string(static_cast(sub_graph.GetSchemaSource())); + return p_func_proto; +} + +std::unique_ptr ConvertFunctionProtoToIndexedSubGraph( + const std::unique_ptr& p_func_proto) { + auto p_isg = IndexedSubGraph::Create(); + // "meta_def_inputs_size" (optional) and "schema_source". + int func_metadata_props_size = p_func_proto->metadata_props_size(); + // Precisely, func_metadata_props_size == 4, which implies + // `IndexedSubGraph::meta_def_` is not null and `IndexedSubGraph::nodes` > 1. 
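+  // Layout of metadata_props as written by ConvertIndexedSubGraphToFunctionProto():
+  //   [0] "meta_def_inputs_size", [1] "meta_def_since_version", [2] "meta_def_status",
+  //   [last] "schema_source" (the only entry when there is no MetaDef).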
+ if (func_metadata_props_size > 1) { + auto& prop0 = const_cast(p_func_proto->metadata_props(0)); + int isg_meta_def_inputs_size = std::stoi(*(prop0.mutable_value())); + auto p_meta_def = IndexedSubGraph_MetaDef::Create(); + p_meta_def->name() = p_func_proto->name(); + p_meta_def->domain() = p_func_proto->domain(); + auto& prop1 = const_cast(p_func_proto->metadata_props(1)); + p_meta_def->since_version() = std::stoi(*(prop1.mutable_value())); + auto& prop2 = const_cast(p_func_proto->metadata_props(2)); + p_meta_def->status() = static_cast(std::stoi(*(prop2.mutable_value()))); + auto& meta_def_inputs = p_meta_def->inputs(); + for (int i = 0; i < isg_meta_def_inputs_size; i++) { + meta_def_inputs.push_back(p_func_proto->input(i)); + } + auto& meta_def_outputs = p_meta_def->outputs(); + for (int i = 0, l = p_func_proto->output_size(); i < l; i++) { + meta_def_outputs.push_back(p_func_proto->output(i)); + } + auto& meta_def_initializers = p_meta_def->constant_initializers(); + for (int i = isg_meta_def_inputs_size, l = p_func_proto->input_size(); i < l; i++) { + meta_def_initializers.push_back(p_func_proto->input(i)); + } + auto& meta_def_attrs = p_meta_def->attributes(); + for (int i = 0, l = p_func_proto->attribute_size(); i < l; i++) { + meta_def_attrs.emplace(p_func_proto->attribute(i), p_func_proto->attribute_proto(i)); + } + p_meta_def->doc_string() = p_func_proto->doc_string(); + // TODO: `IndexedSubGraph::type_and_shape_inference_function`. + p_isg->SetMetaDef(std::move(p_meta_def)); + } + auto& isg_nodes = p_isg->Nodes(); + for (int i = 0, l = p_func_proto->node_size(); i < l; i++) { + const auto& node_proto = p_func_proto->node(i); + isg_nodes.push_back( + node_proto.attribute(const_cast(node_proto).attribute_size() - 1).i()); + } + auto schema_source = static_cast( + std::stoi(*(const_cast(p_func_proto->metadata_props(func_metadata_props_size - 1)).mutable_value()))); + p_isg->SetSchemaSource(schema_source); + return p_isg; +} + +std::string SerializeCapabilities( + const std::vector>& capability_ptrs, + const Graph& graph) { + std::stringstream ss; + for (const auto& p : capability_ptrs) { + auto& p_subgraph = p->SubGraph(); + auto p_func_proto = ConvertIndexedSubGraphToFunctionProto(*p_subgraph, graph); + std::string func_proto_buf; + p_func_proto->SerializeToString(func_proto_buf); + size_t buf_len = func_proto_buf.length(); + ss.write(reinterpret_cast(&buf_len), sizeof(buf_len)); + ss.write(func_proto_buf.data(), buf_len); + } + if (!ss.good()) { + ORT_THROW("Serialization stream bad"); + } + return ss.str(); +} + +void DeserializeCapabilities(const std::string& ser_capabilities, + std::vector>& capability_ptrs) { + std::istringstream ss(ser_capabilities); + while (!ss.eof()) { + size_t buf_len; + ss.read(reinterpret_cast(&buf_len), sizeof(buf_len)); + std::string buf(buf_len, '\0'); + ss.read(&buf[0], buf_len); + auto p_func_proto = ONNX_NAMESPACE::FunctionProto::Create(); + p_func_proto->ParseFromString(buf); + auto p_subgraph = ConvertFunctionProtoToIndexedSubGraph(p_func_proto); + capability_ptrs.push_back(ComputeCapability::Create(std::move(p_subgraph))); + } +} + +std::string SerializeOrigialGraph(const GraphViewer& graph_viewer) { + // XXX: Will Steps 1/2/3 suffice for restoring a model/graph later? + // Any information loss or mismatch? 
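+  // The serialized result is a JSON object of the rough form (a sketch, not a stable schema):
+  //   { "<domain>": "<opset version>", ..., "orig_graph_name": "...", "orig_model_path": "...",
+  //     "orig_model_proto_ser_str": "<serialized ModelProto bytes>" }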
+  // Step 1
+  const Graph& orig_graph = graph_viewer.GetGraph();
+  // Step 2
+  const Model& orig_model = orig_graph.GetModel();
+  // Step 3
+  auto p_orig_model_proto = const_cast(orig_model).ToProto();
+  if (p_orig_model_proto->opset_import_size() == 0) {
+    for (const auto& it : graph_viewer.DomainToVersionMap()) {
+      auto* p_opset_import = p_orig_model_proto->add_opset_import();
+      *(p_opset_import->mutable_domain()) = it.first;
+      p_opset_import->set_version(it.second);
+    }
+  }
+
+  nlohmann::json j_obj;
+  if (p_orig_model_proto->opset_import_size() > 0) {
+    for (int i = 0, n = p_orig_model_proto->opset_import_size(); i < n; ++i) {
+      auto& op_set_id_proto = const_cast(p_orig_model_proto->opset_import(i));
+      j_obj[*op_set_id_proto.mutable_domain()] = std::to_string(op_set_id_proto.version());
+    }
+  }
+  j_obj["orig_graph_name"] = graph_viewer.Name();
+  // TODO: platform dependency (Linux vs Windows).
+  j_obj["orig_model_path"] = graph_viewer.ModelPath().string();
+
+  // XXX: `ModelProto::SerializeToString` will lose some info,
+  // e.g., ModelProto.opset_import.
+  std::string ser_buf;
+  p_orig_model_proto->SerializeToString(ser_buf);
+  j_obj["orig_model_proto_ser_str"] = ser_buf;
+
+  return j_obj.dump(-1, ' ', false, nlohmann::json::error_handler_t::replace);
+}
+
+// Ref.: `CreateEpContextModel()` in the file "graph_partitioner.cc".
+ONNX_NAMESPACE::ModelProto* CreateEPContexModel(
+    const GraphViewer& graph_viewer,
+    const std::string& serialized_ctx_cache,
+    const std::string& ctx_cache_file_loc,
+    const int64_t embed_mode,
+    const std::string& backend_cache_dir,
+    const std::string& backend_cache_key,
+    bool saving_orig_graph,
+    const logging::Logger* p_logger) {
+  LOGS_DEFAULT(VERBOSE) << "[VitisAI EP] Creating EP context node";
+  // Create a new graph/model, reusing the graph name,
+  // the op-domain-to-opset-version map,
+  // and the op schema registry of the current graph.
+  // XXX: This approach (immediately below) has a memory fault issue (std::bad_alloc).
+  // auto& ep_ctx_graph = graph_viewer.CreateModel(*p_logger)->MainGraph();
+  // This approach (immediately below) has no memory fault issue.
+  auto p_temp_model = graph_viewer.CreateModel(*p_logger);
+  auto& ep_ctx_graph = p_temp_model->MainGraph();
+
+  const auto& graph_inputs = graph_viewer.GetInputs();
+  std::vector input_node_arg_ptrs;
+  input_node_arg_ptrs.reserve(graph_inputs.size());
+  // XXX: vs `GraphViewer::GetInputsIncludingInitializers()`.
+  for (const auto* p_node_arg : graph_inputs) {
+    auto& temp_node_arg = ep_ctx_graph.GetOrCreateNodeArg(
+        p_node_arg->Name(), p_node_arg->TypeAsProto());
+    input_node_arg_ptrs.push_back(&temp_node_arg);
+  }
+  const auto& graph_outputs = graph_viewer.GetOutputs();
+  std::vector output_node_arg_ptrs;
+  output_node_arg_ptrs.reserve(graph_outputs.size());
+  for (const auto* p_node_arg : graph_outputs) {
+    auto& temp_node_arg = ep_ctx_graph.GetOrCreateNodeArg(p_node_arg->Name(), p_node_arg->TypeAsProto());
+    output_node_arg_ptrs.push_back(&temp_node_arg);
+  }
+
+  // Attr "embed_mode".
+  auto p_attr_0 = ONNX_NAMESPACE::AttributeProto::Create();
+  p_attr_0->set_name(kEmbedModeAttr);
+  // p_attr_0->set_type(onnx::AttributeProto_AttributeType_INT);
+  p_attr_0->set_type(ONNX_NAMESPACE::AttributeProto::INT);
+  p_attr_0->set_i(embed_mode);
+  // Attr "ep_cache_context".
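+  // embed_mode == 0: the attribute stores the file name of an external cache file located next
+  // to the EP context model; otherwise the serialized cache payload itself is embedded here.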
+ auto p_attr_1 = ONNX_NAMESPACE::AttributeProto::Create(); + p_attr_1->set_name(kEPCacheContextAttr); + // p_attr_1->set_type(onnx::AttributeProto_AttributeType_STRING); + p_attr_1->set_type(ONNX_NAMESPACE::AttributeProto::STRING); + // Relative to the ONNX model file. + p_attr_1->set_s( + embed_mode == 0 ? fs::path(ctx_cache_file_loc).filename().string() : serialized_ctx_cache); + // Attr "source". + auto p_attr_2 = ONNX_NAMESPACE::AttributeProto::Create(); + p_attr_2->set_name(kSourceAttr); + // p_attr_2->set_type(onnx::AttributeProto_AttributeType_STRING); + p_attr_2->set_type(ONNX_NAMESPACE::AttributeProto::STRING); + p_attr_2->set_s(kVitisAIExecutionProvider); + // Attr "onnx_model_filename". + auto p_attr_3 = ONNX_NAMESPACE::AttributeProto::Create(); + p_attr_3->set_name(kONNXModelFileNameAttr); + // p_attr_3->set_type(onnx::AttributeProto_AttributeType_STRING); + p_attr_3->set_type(ONNX_NAMESPACE::AttributeProto::STRING); + p_attr_3->set_s(graph_viewer.ModelPath().filename().string()); + // Attr "notes". + auto p_attr_4 = ONNX_NAMESPACE::AttributeProto::Create(); + p_attr_4->set_name(kNotesAttr); + // p_attr_4->set_type(onnx::AttributeProto_AttributeType_STRING); + p_attr_4->set_type(ONNX_NAMESPACE::AttributeProto::STRING); + // FIXME: 2G-limit of ProtoBuf. + if (saving_orig_graph) { + p_attr_4->set_s(SerializeOrigialGraph(graph_viewer)); + } else { + nlohmann::json j_obj; + j_obj["backend_cache_dir"] = backend_cache_dir; + j_obj["backend_cache_key"] = backend_cache_key; + p_attr_4->set_s(j_obj.dump(-1, ' ', false, nlohmann::json::error_handler_t::replace)); + } + + auto p_node_attrs = NodeAttributes::Create(); + constexpr int num_attrs = 5; + p_node_attrs->reserve(num_attrs); + p_node_attrs->emplace(kEmbedModeAttr, *p_attr_0); + p_node_attrs->emplace(kEPCacheContextAttr, *p_attr_1); + p_node_attrs->emplace(kSourceAttr, *p_attr_2); + p_node_attrs->emplace(kONNXModelFileNameAttr, *p_attr_3); + p_node_attrs->emplace(kNotesAttr, *p_attr_4); + + // Since we don't implement `IExecutionProvider::GetEpContextNodes()` and + // thus don't leverage `CreateEpContextModel()` in the file "graph_partitioner.cc", + // we specify a brand-new node name here. + ep_ctx_graph.AddNode(kEPContextOpName, kEPContextOp, "", input_node_arg_ptrs, output_node_arg_ptrs, p_node_attrs.get(), kEPContextOpDomain); + + auto res_status = ep_ctx_graph.Resolve(); + ORT_ENFORCE(res_status.IsOK(), res_status.ErrorMessage()); + LOGS_DEFAULT(VERBOSE) << "Created EP context model graph resolved"; + + auto p_ep_ctx_graph_viewer = ep_ctx_graph.CreateGraphViewer(); + auto p_temp_model_2 = p_ep_ctx_graph_viewer->CreateModel(*p_logger); + auto p_ep_ctx_model_proto = p_temp_model_2->ToProto(); + p_ep_ctx_graph_viewer->ToProto(*p_ep_ctx_model_proto->mutable_graph(), true, true); + p_ep_ctx_model_proto->set_ir_version(ONNX_NAMESPACE::Version::IR_VERSION); + + return p_ep_ctx_model_proto.release(); +} + +// Ref.: `static common::Status Save(Model& model, int fd)` in the file "model.h". +void DumpEPContextModel( + const std::unique_ptr& p_model_proto, const std::string& ep_ctx_model_file_loc) { + std::fstream dump_stream(ep_ctx_model_file_loc, std::ios::out | std::ios::trunc | std::ios::binary); + p_model_proto->SerializeToOstream(dump_stream); + LOGS_DEFAULT(VERBOSE) << "[VitisAI EP] Dumped " << ep_ctx_model_file_loc; +} + +const Node* GetEPContextNodePtr(const Graph& graph) { + // TODO: Support for multi-node EP context model. 
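+  // Returns the first node whose op type is "EPContext", or nullptr if the graph has none.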
+ for (const auto* p_node : graph.Nodes()) { + if (p_node->OpType() == kEPContextOp) { + return p_node; + } + } + return nullptr; +} + +bool ValidateEPContextNode(const Graph& graph) { + // TODO: Support for multi-node EP context model. + const auto* p_node = GetEPContextNodePtr(graph); + assert(p_node != nullptr); + auto& attrs = p_node->GetAttributes(); + assert(attrs.count(kEmbedModeAttr) > 0); + assert(attrs.count(kEPCacheContextAttr) > 0); + assert(attrs.count(kSourceAttr) > 0); + const auto& source_val = attrs.at(kSourceAttr).s(); + if (source_val == kVitisAIExecutionProvider) { + return true; + } + size_t vitisai_len = std::strlen(kVitisAI); + assert(source_val.length() == vitisai_len); + for (size_t i = 0; i < vitisai_len; ++i) { + assert(static_cast(std::tolower(source_val[i])) == kVitisAI[i]); + } + return true; +} + +// Ref.: `CreateEpContextModel()` in the file "graph_partitioner.cc". +void CreateEPContexNodes( + Graph* p_ep_ctx_graph, + const std::vector& fused_nodes_and_graphs, + const std::string& serialized_ctx_cache, + const std::string& ctx_cache_file_loc, + const int64_t embed_mode, + const std::string& backend_cache_dir, + const std::string& backend_cache_key, + bool saving_orig_graph, + const logging::Logger* p_logger) { + LOGS_DEFAULT(VERBOSE) << "[VitisAI EP]Creating EP context nodes"; + int fused_index = 0; + for (const auto& fused_node_graph : fused_nodes_and_graphs) { + Node& fused_node = fused_node_graph.fused_node; + const auto& fused_name = fused_node.Name(); + const GraphViewer& graph_viewer = fused_node_graph.filtered_graph; + // FIXME + const auto& graph_inputs = graph_viewer.GetInputs(); + std::vector input_node_arg_ptrs; + input_node_arg_ptrs.reserve(graph_inputs.size()); + // XXX: vs `GraphViewer::GetInputsIncludingInitializers()`. + for (const auto* p_node_arg : graph_inputs) { + auto& temp_node_arg = p_ep_ctx_graph->GetOrCreateNodeArg( + p_node_arg->Name(), p_node_arg->TypeAsProto()); + input_node_arg_ptrs.push_back(&temp_node_arg); + } + const auto& graph_outputs = graph_viewer.GetOutputs(); + std::vector output_node_arg_ptrs; + output_node_arg_ptrs.reserve(graph_outputs.size()); + for (const auto* p_node_arg : graph_outputs) { + auto& temp_node_arg = p_ep_ctx_graph->GetOrCreateNodeArg(p_node_arg->Name(), p_node_arg->TypeAsProto()); + output_node_arg_ptrs.push_back(&temp_node_arg); + } + + auto p_node_attrs = NodeAttributes::Create(); + if (fused_index == 0) { + p_node_attrs->reserve(7); + // Attr "ep_cache_context". + auto p_attr_1 = ONNX_NAMESPACE::AttributeProto::Create(); + p_attr_1->set_name(kEPCacheContextAttr); + p_attr_1->set_type(ONNX_NAMESPACE::AttributeProto::STRING); + // Relative to the ONNX model file. + p_attr_1->set_s( + embed_mode == 0 ? fs::path(ctx_cache_file_loc).filename().string() : serialized_ctx_cache); + p_node_attrs->emplace(kEPCacheContextAttr, *p_attr_1); + // Attr "notes". + auto p_attr_4 = ONNX_NAMESPACE::AttributeProto::Create(); + p_attr_4->set_name(kNotesAttr); + p_attr_4->set_type(ONNX_NAMESPACE::AttributeProto::STRING); + // FIXME: 2G-limit of ProtoBuf. + if (saving_orig_graph) { + p_attr_4->set_s(SerializeOrigialGraph(graph_viewer)); + } else { + nlohmann::json j_obj; + j_obj["backend_cache_dir"] = backend_cache_dir; + j_obj["backend_cache_key"] = backend_cache_key; + p_attr_4->set_s(j_obj.dump(-1, ' ', false, nlohmann::json::error_handler_t::replace)); + } + p_node_attrs->emplace(kNotesAttr, *p_attr_4); + // Attr "main_context". 
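+      // Only the first fused node (fused_index == 0) carries the cache payload and the notes;
+      // it is marked as the main context node (main_context = 1), all others get main_context = 0.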
+ auto p_attr_5 = ONNX_NAMESPACE::AttributeProto::Create(); + p_attr_5->set_name(kMainContextAttr); + p_attr_5->set_type(ONNX_NAMESPACE::AttributeProto::INT); + p_attr_5->set_i(1); + p_node_attrs->emplace(kMainContextAttr, *p_attr_5); + } else { + p_node_attrs->reserve(5); + // Attr "main_context". + auto p_attr_5 = ONNX_NAMESPACE::AttributeProto::Create(); + p_attr_5->set_name(kMainContextAttr); + p_attr_5->set_type(ONNX_NAMESPACE::AttributeProto::INT); + p_attr_5->set_i(0); + p_node_attrs->emplace(kMainContextAttr, *p_attr_5); + } + // Attr "embed_mode". + auto p_attr_0 = ONNX_NAMESPACE::AttributeProto::Create(); + p_attr_0->set_name(kEmbedModeAttr); + p_attr_0->set_type(ONNX_NAMESPACE::AttributeProto::INT); + p_attr_0->set_i(embed_mode); + p_node_attrs->emplace(kEmbedModeAttr, *p_attr_0); + // Attr "source". + auto p_attr_2 = ONNX_NAMESPACE::AttributeProto::Create(); + p_attr_2->set_name(kSourceAttr); + p_attr_2->set_type(ONNX_NAMESPACE::AttributeProto::STRING); + p_attr_2->set_s(kVitisAIExecutionProvider); + p_node_attrs->emplace(kSourceAttr, *p_attr_2); + // Attr "onnx_model_filename". + auto p_attr_3 = ONNX_NAMESPACE::AttributeProto::Create(); + p_attr_3->set_name(kONNXModelFileNameAttr); + p_attr_3->set_type(ONNX_NAMESPACE::AttributeProto::STRING); + p_attr_3->set_s(graph_viewer.ModelPath().filename().string()); + p_node_attrs->emplace(kONNXModelFileNameAttr, *p_attr_3); + // Attr "partition_name". + auto p_attr_6 = ONNX_NAMESPACE::AttributeProto::Create(); + p_attr_6->set_name(kPartitionNameAttr); + p_attr_6->set_type(ONNX_NAMESPACE::AttributeProto::STRING); + p_attr_6->set_s(fused_name); + p_node_attrs->emplace(kPartitionNameAttr, *p_attr_6); + + p_ep_ctx_graph->AddNode(fused_name, kEPContextOp, "", input_node_arg_ptrs, output_node_arg_ptrs, p_node_attrs.get(), kEPContextOpDomain); + + ++fused_index; + } + auto res_status = p_ep_ctx_graph->Resolve(); + ORT_ENFORCE(res_status.IsOK(), res_status.ErrorMessage()); + LOGS_DEFAULT(VERBOSE) << "Created EP context model graph resolved"; +} + +std::string RetrieveEPContextCache( + const Graph& graph, const PathString& ep_ctx_model_loc, bool binary_mode) { + // TODO: Support for multi-node EP context model. + const auto* p_node = GetEPContextNodePtr(graph); + const auto& attrs = p_node->GetAttributes(); + int64_t embed_mode = attrs.at(kEmbedModeAttr).i(); + const std::string& ep_ctx_cache = attrs.at(kEPCacheContextAttr).s(); + if (embed_mode) { + return ep_ctx_cache; + } + fs::path ep_ctx_fs_path(ep_ctx_model_loc); + // Attr "ep_cache_context" stores a relative path. + ep_ctx_fs_path.replace_filename(fs::path(ep_ctx_cache)); + // TODO: Validaion of the file location to make sure security is met. + if (!fs::exists(ep_ctx_fs_path) || !fs::is_regular_file(ep_ctx_fs_path)) { + ORT_THROW("File for EP context cache is missing"); + } + auto open_mode = binary_mode ? 
(std::ios::in | std::ios::binary) : std::ios::in;
+  std::ifstream ifs(ep_ctx_fs_path.string().c_str(), open_mode);
+  if (!ifs.is_open()) {
+    ORT_THROW("Exception opening EP context cache file");
+  }
+  ifs.seekg(0, ifs.end);
+  std::streampos cache_len = ifs.tellg();
+  if (cache_len == -1) {
+    ifs.close();
+    ORT_THROW("Error when operating EP context cache file");
+  } else if (cache_len == 0) {
+    ifs.close();
+    LOGS_DEFAULT(WARNING) << "Empty EP context cache file: " << ep_ctx_fs_path.string();
+    return "";
+  }
+  ifs.seekg(0, ifs.beg);
+  char* buf = new char[static_cast<size_t>(cache_len)];
+  ifs.read(buf, cache_len);
+  if (!ifs.good()) {
+    ifs.close();
+    ORT_THROW("Exception reading EP context cache file");
+  }
+  ifs.close();
+  // Construct with an explicit length: the cache may be binary data that is not NUL-terminated.
+  std::string cache_payload(buf, static_cast<size_t>(cache_len));
+  delete[] buf;
+  return cache_payload;
+}
+
+void RetrieveBackendCacheInfo(const Graph& graph, std::string& cache_dir, std::string& cache_key) {
+  // TODO: Support for multi-node EP context model.
+  const auto* p_node = GetEPContextNodePtr(graph);
+  if (p_node == nullptr) {
+    LOGS_DEFAULT(WARNING) << "Failed to retrieve cache info due to no EP context nodes";
+    return;
+  }
+  const auto& attrs = p_node->GetAttributes();
+  const auto& notes_str = attrs.at(kNotesAttr).s();
+  nlohmann::json j_obj = nlohmann::json::parse(notes_str);
+  cache_dir = j_obj["backend_cache_dir"].get();
+  cache_key = j_obj["backend_cache_key"].get();
+  if (cache_dir.empty()) {
+    LOGS_DEFAULT(WARNING) << "Retrieved backend cache dir empty";
+  }
+  if (cache_key.empty()) {
+    LOGS_DEFAULT(WARNING) << "Retrieved backend cache key empty";
+  }
+}
+
+std::unique_ptr RetrieveOriginalGraph(const Graph& ep_ctx_graph) {
+  // TODO: Support for multi-node EP context model.
+  const auto* p_node = GetEPContextNodePtr(ep_ctx_graph);
+  const auto& attrs = p_node->GetAttributes();
+  const auto& notes_str = attrs.at(kNotesAttr).s();
+  nlohmann::json j_obj = nlohmann::json::parse(notes_str);
+
+  const auto& orig_model_path = j_obj["orig_model_path"].get();
+  bool model_loaded = false;
+  auto p_model_proto = ONNX_NAMESPACE::ModelProto::Create();
+  if (!orig_model_path.empty() && fs::exists(orig_model_path) && fs::is_regular_file(orig_model_path)) {
+    auto load_status = Model::Load(ToPathString(orig_model_path), *p_model_proto);
+    model_loaded = load_status.IsOK();
+  }
+  if (!model_loaded) {
+    p_model_proto->ParseFromString(j_obj["orig_model_proto_ser_str"].get());
+    if (p_model_proto->opset_import_size() == 0) {
+      for (auto& elem : j_obj.items()) {
+        if (elem.key() == "orig_model_path" || elem.key() == "orig_graph_name" || elem.key() == "orig_model_proto_ser_str") {
+          continue;
+        }
+        auto* p_op_set_id_proto = p_model_proto->add_opset_import();
+        *(p_op_set_id_proto->mutable_domain()) = elem.key();
+        p_op_set_id_proto->set_version(std::stoll(elem.value().get()));
+      }
+    }
+  }
+  auto& logger = logging::LoggingManager::DefaultLogger();
+  auto p_model = Model::Create(std::move(*p_model_proto), ToPathString(orig_model_path), nullptr, logger);
+  auto& graph = p_model->MainGraph();
+  graph.ToGraphProto()->set_name(j_obj["orig_graph_name"].get());
+
+  return graph.CreateGraphViewer();
+}
+
+bool GraphHasEPContextNode(const Graph& graph) {
+  size_t vitisai_len = std::strlen(kVitisAI);
+  for (const auto* p_node : graph.Nodes()) {
+    if (p_node->OpType() != kEPContextOp) {
+      continue;
+    }
+    const auto& attrs = p_node->GetAttributes();
+    if (attrs.count(kSourceAttr) == 0) {
+      continue;
+    }
+    const auto& source_val = attrs.at(kSourceAttr).s();
+    if (source_val == kVitisAIExecutionProvider) {
+      return true;
+ } + if (source_val.length() != vitisai_len) { + continue; + } + size_t j = 0; + do { + if (static_cast(std::tolower(source_val[j])) != kVitisAI[j]) { + break; + } + ++j; + } while (j < vitisai_len); + if (j == vitisai_len) { + return true; + } + } + return false; +} + +bool FusedGraphHasEPContextNode( + const std::vector& fused_nodes_and_graphs) { + for (const auto& fused_node_graph : fused_nodes_and_graphs) { + bool has_node = GraphHasEPContextNode(fused_node_graph.filtered_graph.get().GetGraph()); + if (has_node) { + return true; + } + } + return false; +} + +const fs::path& GetTopLevelModelPath(const GraphViewer& graph_viewer) { + const auto& graph = graph_viewer.GetGraph(); + const Graph* p_graph = &graph; + while (p_graph->IsSubgraph()) { + p_graph = p_graph->ParentGraph(); + } + return p_graph->ModelPath(); +} + +bool GetEPContextModelFileLocation( + const std::string& ep_ctx_model_path_cfg, + const PathString& model_path_str, + bool is_ep_ctx_model, + PathString& ep_ctx_model_file_loc) { + if (!ep_ctx_model_file_loc.empty()) { + return true; + } + if (!ep_ctx_model_path_cfg.empty()) { + ep_ctx_model_file_loc = ToPathString(ep_ctx_model_path_cfg); + } else if (!model_path_str.empty()) { + if (is_ep_ctx_model) { + ep_ctx_model_file_loc = model_path_str; + } else { + // Two alternatives for this case. + // Alternative 1: + // 1) Implement/override the method `IExecutionProvider::GetEpContextNodes()`. + // 2) And follow how the default path is implemented in `CreateEpContextModel()` + // in the file "graph_partitioner.cc". + // 3) Model dump is not required. + // Alternative 2: + // 1) Do NOT implement/override `IExecutionProvider::GetEpContextNodes()`. + // 2) No need to follow `CreateEpContextModel()` in the file "graph_partitioner.cc", + // freely implement what the default path is like. + // 3) Model dump is required. +#if 0 + ep_ctx_model_file_loc = model_path_str + ToPathString("_ctx.onnx"); +#endif +#if 1 + fs::path model_fs_path(model_path_str); + fs::path ep_ctx_model_fs_path(model_fs_path.parent_path() / model_fs_path.stem()); + ep_ctx_model_fs_path += fs::path("_ctx.onnx"); + ep_ctx_model_file_loc = ToPathString(ep_ctx_model_fs_path.string()); +#endif + } + } + return !ep_ctx_model_file_loc.empty(); +} + +// The file for EP context cache is in the same folder as the EP context model file. +PathString GetEPContextCacheFileLocation( + const PathString& ep_ctx_model_file_loc, const PathString& model_path_str) { + if (!ep_ctx_model_file_loc.empty()) { + fs::path ep_ctx_model_fs_path(ep_ctx_model_file_loc); + fs::path ep_ctx_cache_fs_path(ep_ctx_model_fs_path.parent_path() / ep_ctx_model_fs_path.stem()); + ep_ctx_cache_fs_path += fs::path("__ep_ctx_cache.bin"); + return ToPathString(ep_ctx_cache_fs_path.string()); + } + fs::path model_fs_path(model_path_str); + fs::path ep_ctx_cache_fs_path(model_fs_path.parent_path() / model_fs_path.stem()); + ep_ctx_cache_fs_path += fs::path("__ep_ctx_cache.bin"); + return ToPathString(ep_ctx_cache_fs_path.string()); +} + +std::string Slurp(const fs::path& file_location, bool binary_mode) { + // std::filesystem::value_type == onnxruntime::PathChar == ORTCHAR_T + // std::filesystem::string_type == onnxruntime::PathString + // const char* location_str = PathToUTF8String(file_location.native()).c_str(); + std::ifstream ifs; + ifs.exceptions(std::ifstream::failbit | std::ifstream::badbit); + std::stringstream ss; + try { + auto open_mode = binary_mode ? 
(std::ios::in | std::ios::binary) : std::ios::in; + ifs.open(file_location.string().c_str(), open_mode); + ss << ifs.rdbuf(); + if (!ss.good()) { + LOGS_DEFAULT(WARNING) << "Failed to write to stream"; + } + ifs.close(); + } catch (std::system_error& se) { + LOGS_DEFAULT(WARNING) << "Failed to read " << file_location << ": " << se.code().message(); + } + return ss.str(); +} + +} // namespace onnxruntime diff --git a/onnxruntime/core/providers/vitisai/imp/global_api.cc b/onnxruntime/core/providers/vitisai/imp/global_api.cc index e9ae93ded40c7..8c1dce0d3dc1a 100644 --- a/onnxruntime/core/providers/vitisai/imp/global_api.cc +++ b/onnxruntime/core/providers/vitisai/imp/global_api.cc @@ -53,6 +53,8 @@ struct OrtVitisAIEpAPI { std::vector>* (*compile_onnx_model_with_options)( const std::string& model_path, const onnxruntime::Graph& graph, const onnxruntime::ProviderOptions& options); uint32_t (*vaip_get_version)(); + void (*get_backend_compilation_cache)(const std::string& model_path, const onnxruntime::Graph& graph, const char* json_config, uint8_t compiler_codes, std::string& cache_dir, std::string& cache_key, std::string& cache_data); + void (*restore_backend_compilation_cache)(const std::string& cache_dir, const std::string& cache_key, const std::string& cache_data, const std::string& model_path); void Ensure() { if (handle_) return; @@ -77,6 +79,8 @@ struct OrtVitisAIEpAPI { } std::ignore = env.GetSymbolFromLibrary(handle_, "vaip_get_version", (void**)&vaip_get_version); + ORT_THROW_IF_ERROR(env.GetSymbolFromLibrary(handle_, "get_compilation_cache", (void**)&get_backend_compilation_cache)); + ORT_THROW_IF_ERROR(env.GetSymbolFromLibrary(handle_, "restore_compilation_cache", (void**)&restore_backend_compilation_cache)); } private: @@ -122,13 +126,7 @@ static std::string config_to_json_str(const onnxruntime::ProviderOptions& config vaip_core::DllSafe>> compile_onnx_model( const onnxruntime::GraphViewer& graph_viewer, const logging::Logger& logger, const ProviderOptions& options) { -#ifndef _WIN32 auto model_path = graph_viewer.ModelPath().string(); -#else - using convert_t = std::codecvt_utf8; - std::wstring_convert strconverter; - auto model_path = strconverter.to_bytes(graph_viewer.ModelPath().string()); -#endif if (s_library_vitisaiep.compile_onnx_model_with_options) { return vaip_core::DllSafe(s_library_vitisaiep.compile_onnx_model_with_options(model_path, graph_viewer.GetGraph(), options)); } else { @@ -137,6 +135,17 @@ vaip_core::DllSafe>> c } } +void get_backend_compilation_cache(const onnxruntime::PathString& model_path_str, const onnxruntime::GraphViewer& graph_viewer, const onnxruntime::ProviderOptions& options, uint8_t compiler_codes, std::string& cache_dir, std::string& cache_key, std::string& cache_data) { + const std::string& model_path = PathToUTF8String(model_path_str); + const onnxruntime::Graph& graph = graph_viewer.GetGraph(); + const auto json_str = config_to_json_str(options); + s_library_vitisaiep.get_backend_compilation_cache(model_path, graph, json_str.c_str(), compiler_codes, cache_dir, cache_key, cache_data); +} + +void restore_backend_compilation_cache(const std::string& cache_dir, const std::string& cache_key, const std::string& cache_data, const std::string& model_path) { + s_library_vitisaiep.restore_backend_compilation_cache(cache_dir, cache_key, cache_data, model_path); +} + struct MyCustomOpKernel : OpKernel { MyCustomOpKernel(const OpKernelInfo& info, const OrtCustomOp& op) : OpKernel(info), op_(op) { op_kernel_ = @@ -218,9 +227,9 @@ vaip_core::OrtApiForVaip* 
create_org_api_hook() { auto& logger = logging::LoggingManager::DefaultLogger(); auto& model = const_cast(const_model); auto model_proto = model.ToProto(); - auto file_path = model.MainGraph().ModelPath().string(); + auto file_path = model.MainGraph().ModelPath(); auto local_registries = IOnnxRuntimeOpSchemaRegistryList{model.MainGraph().GetSchemaRegistry()}; - auto ret = Model::Create(std::move(*model_proto), file_path, &local_registries, logger); + auto ret = Model::Create(std::move(*model_proto), ToPathString(file_path), &local_registries, logger); auto status = ret->MainGraph().Resolve(); vai_assert(status.IsOK(), status.ErrorMessage()); return ret.release(); diff --git a/onnxruntime/core/providers/vitisai/imp/graph.cc b/onnxruntime/core/providers/vitisai/imp/graph.cc index 40b396fda6135..3f46fbde8c714 100644 --- a/onnxruntime/core/providers/vitisai/imp/graph.cc +++ b/onnxruntime/core/providers/vitisai/imp/graph.cc @@ -107,12 +107,11 @@ void graph_save(const Graph& graph, const std::string& filename, const std::stri auto graph_proto_subgraph = graph.ToGraphProto(); *model_proto->mutable_graph() = *graph_proto_subgraph; auto& logger = logging::LoggingManager::DefaultLogger(); - auto filename_data_relative_path = std::filesystem::path(); auto model = Model::Create(std::move(*model_proto), ToPathString(filename), nullptr, logger); if (initializer_size_threshold == std::numeric_limits::max()) { model_proto = model->ToProto(); } else { - model_proto = model->ToGraphProtoWithExternalInitializers(filename_dat, graph.ModelPath(), initializer_size_threshold); + model_proto = model->ToGraphProtoWithExternalInitializers(ToPathString(filename_dat), ToPathString(filename), initializer_size_threshold); } auto& metadata = model->MetaData(); if (!metadata.empty()) { @@ -124,7 +123,7 @@ void graph_save(const Graph& graph, const std::string& filename, const std::stri *prop->mutable_value() = m.second; } } - std::fstream output(filename, std::ios::out | std::ios::trunc | std::ios::binary); + std::fstream output(ToPathString(filename), std::ios::out | std::ios::trunc | std::ios::binary); bool result = model_proto->SerializeToOstream(output); output << std::flush; vai_assert(result, "model serialize to ostream error"); diff --git a/onnxruntime/core/providers/vitisai/include/ep_context_utils.h b/onnxruntime/core/providers/vitisai/include/ep_context_utils.h new file mode 100644 index 0000000000000..61a595cf1ae15 --- /dev/null +++ b/onnxruntime/core/providers/vitisai/include/ep_context_utils.h @@ -0,0 +1,81 @@ +#pragma once + +// Standard headers/libs. +#include +#include +#include +#include + +// 1st-party headers/libs. 
+#include "core/providers/shared_library/provider_api.h" + +namespace fs = std::filesystem; + +namespace onnxruntime { + +constexpr const uint8_t kXCCode = 1; +constexpr const uint8_t kDDCode = 2; +constexpr const uint8_t kVCode = 4; + +static constexpr const char* kEPContextOp = "EPContext"; +static constexpr const char* kMainContextAttr = "main_context"; +static constexpr const char* kEPCacheContextAttr = "ep_cache_context"; +static constexpr const char* kEmbedModeAttr = "embed_mode"; +static constexpr const char* kPartitionNameAttr = "partition_name"; +static constexpr const char* kSourceAttr = "source"; +static constexpr const char* kEPSDKVersionAttr = "ep_sdk_version"; +static constexpr const char* kONNXModelFileNameAttr = "onnx_model_filename"; +static constexpr const char* kNotesAttr = "notes"; +static constexpr const char* kEPContextOpDomain = "com.microsoft"; +static constexpr const char* kEPContextOpName = "VitisAIEPContextOp"; + +std::unique_ptr +ConvertIndexedSubGraphToFunctionProto(const IndexedSubGraph&, const Graph&); + +std::unique_ptr ConvertFunctionProtoToIndexedSubGraph( + const std::unique_ptr&); + +std::string SerializeCapabilities( + const std::vector>&, const Graph&); + +void DeserializeCapabilities( + const std::string&, std::vector>&); + +std::string SerializeOrigialGraph(const GraphViewer&); + +// Ref.: `CreateEpContextModel()` in the file "graph_partitioner.cc". +ONNX_NAMESPACE::ModelProto* CreateEPContexModel(const GraphViewer&, const std::string&, const std::string&, const int64_t, + const std::string&, const std::string&, bool, const logging::Logger*); + +// Ref.: `static common::Status Save(Model& model, int fd)` in the file "model.h". +void DumpEPContextModel(const std::unique_ptr&, const std::string&); + +const Node* GetEPContextNodePtr(const Graph&); + +bool ValidateEPContextNode(const Graph&); + +void CreateEPContexNodes(Graph*, const std::vector&, const std::string&, const std::string&, + const int64_t, const std::string&, const std::string&, bool, const logging::Logger*); + +std::string RetrieveEPContextCache(const Graph&, const PathString&, bool binary_mode = true); + +void RetrieveBackendCacheInfo(const Graph&, std::string&, std::string&); + +std::unique_ptr RetrieveOriginalGraph(const Graph&); + +bool GraphHasEPContextNode(const Graph&); + +bool FusedGraphHasEPContextNode( + const std::vector&); + +const fs::path& GetTopLevelModelPath(const GraphViewer&); + +bool GetEPContextModelFileLocation( + const std::string&, const PathString&, bool, PathString&); + +// The file for EP context cache is in the same folder as the EP context model file. 
+PathString GetEPContextCacheFileLocation(const PathString&, const PathString&); + +std::string Slurp(const fs::path&, bool binary_mode = false); + +} // namespace onnxruntime diff --git a/onnxruntime/core/providers/vitisai/include/vaip/global_api.h b/onnxruntime/core/providers/vitisai/include/vaip/global_api.h index 1f8b8802e86b4..3fdbc60bb0ee6 100644 --- a/onnxruntime/core/providers/vitisai/include/vaip/global_api.h +++ b/onnxruntime/core/providers/vitisai/include/vaip/global_api.h @@ -14,3 +14,5 @@ void initialize_vitisai_ep(); vaip_core::DllSafe>> compile_onnx_model(const onnxruntime::GraphViewer& graph_viewer, const onnxruntime::logging::Logger& logger, const onnxruntime::ProviderOptions& options); std::shared_ptr get_kernel_registry_vitisaiep(); const std::vector& get_domains_vitisaiep(); +void get_backend_compilation_cache(const onnxruntime::PathString& model_path_str, const onnxruntime::GraphViewer& graph_viewer, const onnxruntime::ProviderOptions& options, uint8_t compiler_codes, std::string& cache_dir, std::string& cache_key, std::string& cache_data); +void restore_backend_compilation_cache(const std::string& cache_dir, const std::string& cache_key, const std::string& cache_data, const std::string& model_path); diff --git a/onnxruntime/core/providers/vitisai/vitisai_execution_provider.cc b/onnxruntime/core/providers/vitisai/vitisai_execution_provider.cc index 6fc09f3495aa1..f45b89649bfcb 100644 --- a/onnxruntime/core/providers/vitisai/vitisai_execution_provider.cc +++ b/onnxruntime/core/providers/vitisai/vitisai_execution_provider.cc @@ -2,22 +2,43 @@ // Licensed under the MIT License. #include "vitisai_execution_provider.h" +// Standard headers/libs. #include #include #include +#include + +// 1st-party headers/libs. +#include "core/platform/env_var_utils.h" +#include "core/common/exceptions.h" #include "vaip/capability.h" #include "vaip/global_api.h" +#include "ep_context_utils.h" using namespace ONNX_NAMESPACE; +namespace fs = std::filesystem; + namespace onnxruntime { constexpr const char* VITISAI = "VITISAI"; VitisAIExecutionProvider::VitisAIExecutionProvider( const ProviderOptions& info) + // const ProviderOptions& info, const SessionOptions* p_sess_opts) : IExecutionProvider{onnxruntime::kVitisAIExecutionProvider}, info_(info) { CreateKernelRegistry(); + + auto it = info_.find("ep_context_enable"); + ep_ctx_enabled_ = it != info_.end() && it->second == "1"; + it = info_.find("ep_context_embed_mode"); + ep_ctx_embed_mode_ = it != info_.end() && it->second != "0"; + // ep_ctx_embed_mode_ = it == info_.end() || it->second != "0"; + it = info_.find("ep_context_file_path"); + ep_ctx_model_path_cfg_ = it == info_.end() ? "" : it->second; + LOGS_DEFAULT(VERBOSE) << "EP Context cache enabled: " << ep_ctx_enabled_; + LOGS_DEFAULT(VERBOSE) << "EP context cache embed mode: " << ep_ctx_embed_mode_; + LOGS_DEFAULT(VERBOSE) << "User specified EP context cache path: " << ep_ctx_model_path_cfg_; } void VitisAIExecutionProvider::CreateKernelRegistry() { @@ -30,9 +51,115 @@ void VitisAIExecutionProvider::CreateKernelRegistry() { std::shared_ptr VitisAIExecutionProvider::GetKernelRegistry() const { return get_kernel_registry_vitisaiep(); } +// This method is called after both `GetComputeCapabilityOps()` and `Compile()`. +// This timing is required to work with both compilation-based EPs and non-compilation-based EPs. +const InlinedVector VitisAIExecutionProvider::GetEpContextNodes() const { + InlinedVector ep_context_node_ptrs; + // All preconditions are supposed to have happened. 
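+  // That is, p_ep_ctx_model_ was created in PrepareEPContextEnablement() and its EPContext
+  // node(s) were added by FulfillEPContextEnablement() via CreateEPContexNodes().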
+  if (p_ep_ctx_model_) {
+    auto& graph = p_ep_ctx_model_->MainGraph();
+    for (const auto* p_node : graph.Nodes()) {
+      ep_context_node_ptrs.push_back(p_node);
+    }
+  }
+  return ep_context_node_ptrs;
+}
+
+void VitisAIExecutionProvider::LoadEPContexModelFromFile() const {
+  // XXX: should "p_ep_ctx_model_" be checked or not?
+  if (!p_ep_ctx_model_ && !ep_ctx_model_file_loc_.empty()) {
+    auto status = Model::Load(ep_ctx_model_file_loc_, *p_ep_ctx_model_proto_);
+    if (!status.IsOK()) {
+      ORT_THROW("Loading EP context model failed from ", PathToUTF8String(ep_ctx_model_file_loc_));
+    }
+    p_ep_ctx_model_ = Model::Create(std::move(*p_ep_ctx_model_proto_), ep_ctx_model_file_loc_, nullptr, *GetLogger());
+    LOGS_DEFAULT(VERBOSE) << "Loaded EP context model from: " << PathToUTF8String(ep_ctx_model_file_loc_);
+  } else if (ep_ctx_model_file_loc_.empty()) {
+    LOGS_DEFAULT(WARNING) << "Cannot load an EP-context model due to bad file path";
+  }
+}
+
+void VitisAIExecutionProvider::PrepareEPContextEnablement(
+    const onnxruntime::GraphViewer& graph_viewer) const {
+  if (model_path_str_.empty()) {
+    // TODO: platform dependency (Linux vs Windows).
+    model_path_str_ = ToPathString(GetTopLevelModelPath(graph_viewer).string());
+  }
+  std::string backend_cache_dir, backend_cache_key;
+  get_backend_compilation_cache(model_path_str_, graph_viewer, info_, kXCCode, backend_cache_dir, backend_cache_key, backend_cache_data_);
+  info_["cacheDir"] = backend_cache_dir;
+  info_["cacheKey"] = backend_cache_key;
+  // Create a new model, reusing the graph name, the op-domain-to-opset-version map,
+  // the op schema registry of the current graph, etc.
+  p_ep_ctx_model_ = graph_viewer.CreateModel(*GetLogger());
+  LOGS_DEFAULT(VERBOSE) << "Container model created";
+}
+
+void VitisAIExecutionProvider::FulfillEPContextEnablement(
+    const std::vector& fused_nodes_and_graphs) {
+  auto& ep_ctx_graph = p_ep_ctx_model_->MainGraph();
+  if (!ep_ctx_embed_mode_) {
+    auto ep_ctx_cache_path_str = GetEPContextCacheFileLocation(ep_ctx_model_file_loc_, model_path_str_);
+    std::ofstream ep_ctx_cache_ofs(ep_ctx_cache_path_str.c_str(), std::ios::trunc);
+    if (!ep_ctx_cache_ofs.is_open()) {
+      ORT_THROW("Failed to open a file to write EP context cache: ", ep_ctx_cache_path_str.c_str());
+    }
+    ep_ctx_cache_ofs.write(backend_cache_data_.c_str(), backend_cache_data_.length());
+    if (!ep_ctx_cache_ofs.good()) {
+      ep_ctx_cache_ofs.close();
+      ORT_THROW("Exception writing EP context cache file: ", ep_ctx_cache_path_str.c_str());
+    }
+    ep_ctx_cache_ofs.close();
+    CreateEPContexNodes(&ep_ctx_graph, fused_nodes_and_graphs, "", PathToUTF8String(ep_ctx_cache_path_str), 0, info_.at("cacheDir"), info_.at("cacheKey"), false, GetLogger());
+  } else {
+    CreateEPContexNodes(&ep_ctx_graph, fused_nodes_and_graphs, backend_cache_data_, "", 1, info_["cacheDir"], info_["cacheKey"], false, GetLogger());
+  }
+  if (GraphHasEPContextNode(ep_ctx_graph)) {
+    LOGS_DEFAULT(VERBOSE) << "Created model has EP context nodes";
+  } else {
+    LOGS_DEFAULT(WARNING) << "No EP context nodes created";
+  }
+}
+
 std::vector> VitisAIExecutionProvider::GetCapability(
-    const onnxruntime::GraphViewer& graph, const IKernelLookup& /*kernel_lookup*/) const {
-  if (graph.IsSubgraph()) {
+    const onnxruntime::GraphViewer& graph_viewer, const IKernelLookup& /*kernel_lookup*/) const {
+  bool is_ep_ctx_model = GraphHasEPContextNode(graph_viewer.GetGraph());
+  // TODO: platform dependency (Linux vs Windows).
+ model_path_str_ = ToPathString(GetTopLevelModelPath(graph_viewer).string()); + if (GetEPContextModelFileLocation( + ep_ctx_model_path_cfg_, model_path_str_, is_ep_ctx_model, ep_ctx_model_file_loc_)) { + if (is_ep_ctx_model) { + LOGS_DEFAULT(VERBOSE) << "An EP context model passed in"; + ValidateEPContextNode(graph_viewer.GetGraph()); + std::string cache_dir, cache_key; + RetrieveBackendCacheInfo(graph_viewer.GetGraph(), cache_dir, cache_key); + info_["cacheDir"] = cache_dir; + info_["cacheKey"] = cache_key; + LOGS_DEFAULT(VERBOSE) << "Trying getting compilation cache from " << PathToUTF8String(ep_ctx_model_file_loc_); + auto ep_ctx_payload = RetrieveEPContextCache(graph_viewer.GetGraph(), ep_ctx_model_file_loc_, false); + restore_backend_compilation_cache(cache_dir, cache_key, ep_ctx_payload, graph_viewer.ModelPath().string()); + } else { + if (fs::exists(ep_ctx_model_file_loc_) && fs::is_regular_file(ep_ctx_model_file_loc_) && ep_ctx_enabled_) { + ORT_THROW("The inference session was created with a normal ONNX model but a model file with EP context cache exists at ", + PathToUTF8String(ep_ctx_model_file_loc_), ". Please remove the EP context model manually if you want to re-generate it."); + // Disable the flexibility implemented below by throwing an exception. + // Now the code below is unreachable but DCE will take care of it. + // We might want to re-enable it in future, so we keep it as is. + LoadEPContexModelFromFile(); + ValidateEPContextNode(p_ep_ctx_model_->MainGraph()); + std::string cache_dir, cache_key; + RetrieveBackendCacheInfo(p_ep_ctx_model_->MainGraph(), cache_dir, cache_key); + info_["cacheDir"] = cache_dir; + info_["cacheKey"] = cache_key; + auto ep_ctx_payload = RetrieveEPContextCache(p_ep_ctx_model_->MainGraph(), ep_ctx_model_file_loc_, false); + restore_backend_compilation_cache(cache_dir, cache_key, ep_ctx_payload, graph_viewer.ModelPath().string()); + } + } + } else { + LOGS_DEFAULT(WARNING) << "Failed to get EP context model file location"; + } + + if (graph_viewer.IsSubgraph()) { // VITIS AI EP not support sungraph. Assigned to CPU. return {}; } @@ -40,13 +167,16 @@ std::vector> VitisAIExecutionProvider::GetCap // Only compiling a model once is currently supported return {}; } - execution_providers_ = std::make_unique(compile_onnx_model(graph, *GetLogger(), info_)); - auto result = vaip::GetComputeCapabilityOps(graph, execution_providers_.get(), vitisai_optypes_); + execution_providers_ = std::make_unique(compile_onnx_model(graph_viewer, *GetLogger(), info_)); + auto result = vaip::GetComputeCapabilityOps(graph_viewer, execution_providers_.get(), vitisai_optypes_); size_t index = 0u; for (auto& ep : **execution_providers_) { - result.emplace_back(vaip::XirSubgraphToComputeCapability1(graph, ep.get(), index)); + result.emplace_back(vaip::XirSubgraphToComputeCapability1(graph_viewer, ep.get(), index)); index = index + 1; } + if (ep_ctx_enabled_ && !is_ep_ctx_model) { + PrepareEPContextEnablement(graph_viewer); + } return result; } @@ -74,6 +204,10 @@ common::Status VitisAIExecutionProvider::Compile(const std::vector #include #include #include #include +// 1st-party headers/libs. 
+// #include "core/framework/session_options.h" #include "core/providers/shared_library/provider_api.h" #include "core/session/onnxruntime_c_api.h" +#include "core/common/inlined_containers_fwd.h" // we cannot include vaip/vaip.hpp here because header file referred by // onnxruntime_pybind_state_common.cc @@ -24,9 +28,11 @@ namespace onnxruntime { class VitisAIExecutionProvider : public IExecutionProvider { public: explicit VitisAIExecutionProvider(const ProviderOptions& info); + // explicit VitisAIExecutionProvider(const ProviderOptions& info, + // const SessionOptions* p_sess_opts = nullptr); ~VitisAIExecutionProvider() = default; - std::vector> GetCapability(const onnxruntime::GraphViewer& graph, + std::vector> GetCapability(const onnxruntime::GraphViewer& graph_viewer, const IKernelLookup& /*kernel_lookup*/) const override; int GetDeviceId() const { return 0; } @@ -35,16 +41,34 @@ class VitisAIExecutionProvider : public IExecutionProvider { std::vector& node_compute_funcs) override; std::shared_ptr GetKernelRegistry() const override; + // This method is called after both `GetComputeCapabilityOps()` and `Compile()`. + // This timing is required to work with both compliation-based EPs and non-compilation-based EPs. + const InlinedVector GetEpContextNodes() const override; + private: void CreateKernelRegistry(); using my_ep_t = vaip_core::DllSafe>>; using my_ep_uptr_t = std::shared_ptr; // we have to hide the implementation by forward declaration. mutable my_ep_uptr_t execution_providers_; - ProviderOptions info_; + mutable ProviderOptions info_; std::vector custom_op_domains_; std::shared_ptr registry_; std::set vitisai_optypes_; + // EP context related. + bool ep_ctx_enabled_ = false; + bool ep_ctx_embed_mode_ = true; + std::string ep_ctx_model_path_cfg_{""}; + mutable std::string backend_cache_data_{""}; + mutable PathString model_path_str_{}; + mutable PathString ep_ctx_model_file_loc_{}; + mutable std::unique_ptr p_ep_ctx_model_; + mutable std::unique_ptr p_ep_ctx_model_proto_; + // It might need to be called before loading + // the EP context model that is compiled AOT/offline. + void LoadEPContexModelFromFile() const; + void PrepareEPContextEnablement(const onnxruntime::GraphViewer&) const; + void FulfillEPContextEnablement(const std::vector&); }; } // namespace onnxruntime diff --git a/onnxruntime/core/providers/vitisai/vitisai_provider_factory.cc b/onnxruntime/core/providers/vitisai/vitisai_provider_factory.cc old mode 100755 new mode 100644 diff --git a/onnxruntime/core/session/inference_session.cc b/onnxruntime/core/session/inference_session.cc index 3ef6490a56ded..f0eed91d70440 100644 --- a/onnxruntime/core/session/inference_session.cc +++ b/onnxruntime/core/session/inference_session.cc @@ -881,8 +881,6 @@ common::Status InferenceSession::RegisterGraphTransformer( } common::Status InferenceSession::SaveToOrtFormat(const std::filesystem::path& filepath) const { - ORT_RETURN_IF_NOT(FLATBUFFERS_LITTLEENDIAN, "ort format only supports little-endian machines"); - // Get the byte size of the ModelProto and round it to the next MB and use it as flatbuffers' init_size // TODO: Investigate whether we should set a max size, and clarify the cost of having a buffer smaller than // what the total flatbuffers serialized size will be. 
@@ -1390,8 +1388,6 @@ Status InferenceSession::LoadOrtModel(const void* model_data, int model_data_len } Status InferenceSession::LoadOrtModelWithLoader(std::function load_ort_format_model_bytes) { - static_assert(FLATBUFFERS_LITTLEENDIAN, "ORT format only supports little-endian machines"); - std::lock_guard l(session_mutex_); if (is_model_loaded_) { // already loaded diff --git a/onnxruntime/core/session/provider_bridge_ort.cc b/onnxruntime/core/session/provider_bridge_ort.cc index b53e70926cd5d..4f9669a7dcc4c 100644 --- a/onnxruntime/core/session/provider_bridge_ort.cc +++ b/onnxruntime/core/session/provider_bridge_ort.cc @@ -28,6 +28,7 @@ #include "core/session/inference_session.h" #include "core/session/abi_session_options_impl.h" #include "core/session/ort_apis.h" +#include "core/session/onnxruntime_session_options_config_keys.h" #include "core/session/provider_bridge_ort.h" #include "core/util/math.h" #include "core/framework/sparse_utils.h" @@ -68,10 +69,12 @@ using StringStringEntryProtos = google::protobuf::RepeatedPtrField; using TensorShapeProto_Dimensions = google::protobuf::RepeatedPtrField; using ValueInfoProtos = google::protobuf::RepeatedPtrField; +using FunctionProtos = google::protobuf::RepeatedPtrField; } // namespace ONNX_NAMESPACE namespace onnxruntime { using IndexedSubGraph_MetaDef = IndexedSubGraph::MetaDef; +using IndexedSubGraph_SourceOfSchema = IndexedSubGraph::SourceOfSchema; } // namespace onnxruntime #include "core/common/cpuid_info.h" @@ -132,6 +135,8 @@ ProviderInfo_Dnnl& GetProviderInfo_Dnnl(); ProviderInfo_ROCM* TryGetProviderInfo_ROCM(); ProviderInfo_ROCM& GetProviderInfo_ROCM(); ProviderHostCPU& GetProviderHostCPU(); +ProviderInfo_MIGraphX* TryGetProviderInfo_MIGraphX(); +ProviderInfo_MIGraphX& GetProviderInfo_MIGraphX(); ONNX_NAMESPACE::OpSchema CreateSchema(const std::string& domain, const std::vector& ops); struct TensorShapeProto_Dimension_Iterator_Impl : TensorShapeProto_Dimension_Iterator { TensorShapeProto_Dimension_Iterator_Impl(google::protobuf::internal::RepeatedPtrIterator&& v) : v_{std::move(v)} {} @@ -243,6 +248,11 @@ struct ProviderHostImpl : ProviderHost { void CudaCall_true(int retCode, const char* exprString, const char* libName, int successCode, const char* msg, const char* file, const int line) override { GetProviderInfo_CUDA().CudaCall_true(retCode, exprString, libName, successCode, msg, file, line); } #endif +#ifdef USE_MIGRAPHX + std::unique_ptr CreateMIGraphXAllocator(int16_t device_id, const char* name) override { return GetProviderInfo_MIGraphX().CreateMIGraphXAllocator(device_id, name); } + std::unique_ptr CreateMIGraphXPinnedAllocator(int16_t device_id, const char* name) override { return GetProviderInfo_MIGraphX().CreateMIGraphXPinnedAllocator(device_id, name); } +#endif + #ifdef USE_ROCM std::unique_ptr CreateROCMAllocator(int16_t device_id, const char* name) override { return GetProviderInfo_ROCM().CreateROCMAllocator(device_id, name); } std::unique_ptr CreateROCMPinnedAllocator(const char* name) override { return GetProviderInfo_ROCM().CreateROCMPinnedAllocator(name); } @@ -393,6 +403,11 @@ struct ProviderHostImpl : ProviderHost { int StringStringEntryProtos__size(ONNX_NAMESPACE::StringStringEntryProtos* p) override { return p->size(); } ONNX_NAMESPACE::StringStringEntryProto& StringStringEntryProtos__at(ONNX_NAMESPACE::StringStringEntryProtos* p, int index) override { return p->at(index); }; + // OperatorSetIdProto + std::string* OperatorSetIdProto__mutable_domain(ONNX_NAMESPACE::OperatorSetIdProto* p) override { return 
p->mutable_domain(); } + void OperatorSetIdProto__set_version(ONNX_NAMESPACE::OperatorSetIdProto* p, int64_t version) override { return p->set_version(version); } + int64_t OperatorSetIdProto__version(const ONNX_NAMESPACE::OperatorSetIdProto* p) override { return p->version(); } + #if !defined(DISABLE_OPTIONAL_TYPE) // TypeProto_Optional (wrapped) const ONNX_NAMESPACE::TypeProto& TypeProto_Optional__elem_type(const ONNX_NAMESPACE::TypeProto_Optional* p) override { return p->elem_type(); } @@ -521,6 +536,11 @@ struct ProviderHostImpl : ProviderHost { void ModelProto__set_ir_version(ONNX_NAMESPACE::ModelProto* p, int64_t value) override { p->set_ir_version(value); } ONNX_NAMESPACE::StringStringEntryProtos* ModelProto__mutable_metadata_props(ONNX_NAMESPACE::ModelProto* p) override { return p->mutable_metadata_props(); }; + const ONNX_NAMESPACE::OperatorSetIdProto& ModelProto__opset_import(const ONNX_NAMESPACE::ModelProto* p, int index) override { return p->opset_import(index); } + ONNX_NAMESPACE::OperatorSetIdProto* ModelProto__mutable_opset_import(ONNX_NAMESPACE::ModelProto* p, int index) override { return p->mutable_opset_import(index); } + int ModelProto__opset_import_size(const ONNX_NAMESPACE::ModelProto* p) override { return p->opset_import_size(); } + ONNX_NAMESPACE::OperatorSetIdProto* ModelProto__add_opset_import(ONNX_NAMESPACE::ModelProto* p) override { return p->add_opset_import(); } + // NodeProto (wrapped) std::unique_ptr NodeProto__construct() override { return std::make_unique(); } void NodeProto__operator_delete(ONNX_NAMESPACE::NodeProto* p) override { delete p; } @@ -528,6 +548,7 @@ struct ProviderHostImpl : ProviderHost { int NodeProto__attribute_size(ONNX_NAMESPACE::NodeProto* p) override { return p->attribute_size(); } const ONNX_NAMESPACE::AttributeProto& NodeProto__attribute(const ONNX_NAMESPACE::NodeProto* p, int index) const override { return p->attribute(index); } ONNX_NAMESPACE::AttributeProto* NodeProto__mutable_attribute(ONNX_NAMESPACE::NodeProto* p, int index) override { return p->mutable_attribute(index); } + ONNX_NAMESPACE::AttributeProto* NodeProto__add_attribute(ONNX_NAMESPACE::NodeProto* p) override { return p->add_attribute(); } // TensorProto (wrapped) std::unique_ptr TensorProto__construct() override { return std::make_unique(); } @@ -602,6 +623,64 @@ struct ProviderHostImpl : ProviderHost { const ONNX_NAMESPACE::ValueInfoProto& ValueInfoProtos__operator_array(const ONNX_NAMESPACE::ValueInfoProtos* p, int index) override { return (*p)[index]; } + // FunctionProto (wrapped) + std::unique_ptr FunctionProto__construct() override { return std::make_unique(); } + void FunctionProto__operator_delete(ONNX_NAMESPACE::FunctionProto* p) override { delete p; } + + bool FunctionProto__SerializeToString(const ONNX_NAMESPACE::FunctionProto* p, std::string& string) override { return p->SerializeToString(&string); } + bool FunctionProto__SerializeToOstream(const ONNX_NAMESPACE::FunctionProto* p, std::ostream& output) override { return p->SerializeToOstream(&output); } + bool FunctionProto__ParseFromString(ONNX_NAMESPACE::FunctionProto* p, const std::string& data) override { return p->ParseFromString(data); } + std::string FunctionProto__SerializeAsString(const ONNX_NAMESPACE::FunctionProto* p) override { return p->SerializeAsString(); } + + bool FunctionProto__has_name(const ONNX_NAMESPACE::FunctionProto* p) override { return p->has_name(); } + const std::string& FunctionProto__name(const ONNX_NAMESPACE::FunctionProto* p) const override { return p->name(); } + void 
FunctionProto__set_name(ONNX_NAMESPACE::FunctionProto* p, const std::string& name) override { p->set_name(name); } + + bool FunctionProto__has_doc_string(const ONNX_NAMESPACE::FunctionProto* p) override { return p->has_doc_string(); } + const std::string& FunctionProto__doc_string(const ONNX_NAMESPACE::FunctionProto* p) const override { return p->doc_string(); } + void FunctionProto__set_doc_string(ONNX_NAMESPACE::FunctionProto* p, const std::string& doc_string) override { p->set_doc_string(doc_string); } + + bool FunctionProto__has_domain(const ONNX_NAMESPACE::FunctionProto* p) override { return p->has_domain(); } + const std::string& FunctionProto__domain(const ONNX_NAMESPACE::FunctionProto* p) const override { return p->domain(); } + void FunctionProto__set_domain(ONNX_NAMESPACE::FunctionProto* p, const std::string& domain) override { p->set_domain(domain); } + + const std::string& FunctionProto__input(const ONNX_NAMESPACE::FunctionProto* p, int index) override { return p->input(index); } + std::string* FunctionProto__mutable_input(ONNX_NAMESPACE::FunctionProto* p, int index) override { return p->mutable_input(index); } + int FunctionProto__input_size(const ONNX_NAMESPACE::FunctionProto* p) override { return p->input_size(); } + void FunctionProto__add_input(ONNX_NAMESPACE::FunctionProto* p, const std::string& value) override { p->add_input(value); } + + const std::string& FunctionProto__output(const ONNX_NAMESPACE::FunctionProto* p, int index) override { return p->output(index); } + std::string* FunctionProto__mutable_output(ONNX_NAMESPACE::FunctionProto* p, int index) override { return p->mutable_output(index); } + int FunctionProto__output_size(const ONNX_NAMESPACE::FunctionProto* p) override { return p->output_size(); } + void FunctionProto__add_output(ONNX_NAMESPACE::FunctionProto* p, const std::string& value) override { p->add_output(value); } + + const std::string& FunctionProto__attribute(const ONNX_NAMESPACE::FunctionProto* p, int index) override { return p->attribute(index); } + std::string* FunctionProto__mutable_attribute(ONNX_NAMESPACE::FunctionProto* p, int index) override { return p->mutable_attribute(index); } + int FunctionProto__attribute_size(const ONNX_NAMESPACE::FunctionProto* p) override { return p->attribute_size(); } + void FunctionProto__add_attribute(ONNX_NAMESPACE::FunctionProto* p, const std::string& value) override { p->add_attribute(value); } + + const ONNX_NAMESPACE::AttributeProto& FunctionProto__attribute_proto(const ONNX_NAMESPACE::FunctionProto* p, int index) override { return p->attribute_proto(index); } + ONNX_NAMESPACE::AttributeProto* FunctionProto__mutable_attribute_proto(ONNX_NAMESPACE::FunctionProto* p, int index) override { return p->mutable_attribute_proto(index); } + int FunctionProto__attribute_proto_size(const ONNX_NAMESPACE::FunctionProto* p) override { return p->attribute_proto_size(); } + ONNX_NAMESPACE::AttributeProto* FunctionProto__add_attribute_proto(ONNX_NAMESPACE::FunctionProto* p) override { return p->add_attribute_proto(); } + + const ONNX_NAMESPACE::NodeProto& FunctionProto__node(const ONNX_NAMESPACE::FunctionProto* p, int index) override { return p->node(index); } + ONNX_NAMESPACE::NodeProto* FunctionProto__mutable_node(ONNX_NAMESPACE::FunctionProto* p, int index) override { return p->mutable_node(index); } + int FunctionProto__node_size(const ONNX_NAMESPACE::FunctionProto* p) override { return p->node_size(); } + ONNX_NAMESPACE::NodeProto* FunctionProto__add_node(ONNX_NAMESPACE::FunctionProto* p) override { return 
p->add_node(); } + + const ONNX_NAMESPACE::ValueInfoProto& FunctionProto__value_info(const ONNX_NAMESPACE::FunctionProto* p, int index) override { return p->value_info(index); } + ONNX_NAMESPACE::ValueInfoProto* FunctionProto__mutable_value_info(ONNX_NAMESPACE::FunctionProto* p, int index) override { return p->mutable_value_info(index); } + ONNX_NAMESPACE::ValueInfoProtos* FunctionProto__mutable_value_info(ONNX_NAMESPACE::FunctionProto* p) override { return p->mutable_value_info(); } + int FunctionProto__value_info_size(const ONNX_NAMESPACE::FunctionProto* p) override { return p->value_info_size(); } + ONNX_NAMESPACE::ValueInfoProto* FunctionProto__add_value_info(ONNX_NAMESPACE::FunctionProto* p) override { return p->add_value_info(); } + + const ONNX_NAMESPACE::StringStringEntryProto& FunctionProto__metadata_props(const ONNX_NAMESPACE::FunctionProto* p, int index) override { return p->metadata_props(index); } + ONNX_NAMESPACE::StringStringEntryProto* FunctionProto__mutable_metadata_props(ONNX_NAMESPACE::FunctionProto* p, int index) override { return p->mutable_metadata_props(index); } + ONNX_NAMESPACE::StringStringEntryProtos* FunctionProto__mutable_metadata_props(ONNX_NAMESPACE::FunctionProto* p) override { return p->mutable_metadata_props(); } + int FunctionProto__metadata_props_size(const ONNX_NAMESPACE::FunctionProto* p) override { return p->metadata_props_size(); } + ONNX_NAMESPACE::StringStringEntryProto* FunctionProto__add_metadata_props(ONNX_NAMESPACE::FunctionProto* p) override { return p->add_metadata_props(); } + static int32_t convert_elem_type(const ONNX_NAMESPACE::AttributeProto* data_type) { int32_t elemType = 0; if (data_type->s() == "float32") { @@ -784,9 +863,12 @@ struct ProviderHostImpl : ProviderHost { std::vector& IndexedSubGraph__Nodes(IndexedSubGraph* p) override { return p->nodes; } - void IndexedSubGraph__SetMetaDef(IndexedSubGraph* p, std::unique_ptr&& meta_def_) override { return p->SetMetaDef(std::move(meta_def_)); } + void IndexedSubGraph__SetMetaDef(IndexedSubGraph* p, std::unique_ptr&& meta_def_) override { p->SetMetaDef(std::move(meta_def_)); } const IndexedSubGraph_MetaDef* IndexedSubGraph__GetMetaDef(const IndexedSubGraph* p) override { return p->GetMetaDef(); } + void IndexedSubGraph__SetSchemaSource(IndexedSubGraph* p, IndexedSubGraph_SourceOfSchema schema_source) override { p->schema_source = schema_source; } + IndexedSubGraph_SourceOfSchema IndexedSubGraph__GetSchemaSource(const IndexedSubGraph* p) override { return p->schema_source; } + // KernelDef (wrapped) void KernelDef__operator_delete(KernelDef* p) override { delete p; } void KernelDef__SinceVersion(const KernelDef* p, int* start, int* end) override { return p->SinceVersion(start, end); } @@ -1954,6 +2036,20 @@ ProviderInfo_ROCM& GetProviderInfo_ROCM() { ORT_THROW("ROCM Provider not available, can't get interface for it"); } +ProviderInfo_MIGraphX* TryGetProviderInfo_MIGraphX() try { + return reinterpret_cast(s_library_migraphx.Get().GetInfo()); +} catch (const std::exception& exception) { + LOGS_DEFAULT(ERROR) << exception.what(); + return nullptr; +} + +ProviderInfo_MIGraphX& GetProviderInfo_MIGraphX() { + if (auto* info = TryGetProviderInfo_MIGraphX()) + return *info; + + ORT_THROW("MIGraphX Provider not available, can't get interface for it"); +} + void CopyGpuToCpu( void* dst_ptr, const void* src_ptr, @@ -2821,6 +2917,11 @@ ORT_API_STATUS_IMPL(OrtApis::SessionOptionsAppendExecutionProvider_VitisAI, _In_ provider_options[provider_options_keys[i]] = provider_options_values[i]; } + // EP 
context related session config options. + provider_options["ep_context_enable"] = options->value.config_options.GetConfigOrDefault(kOrtSessionOptionEpContextEnable, "0"); + provider_options["ep_context_embed_mode"] = options->value.config_options.GetConfigOrDefault(kOrtSessionOptionEpContextEmbedMode, "1"); + provider_options["ep_context_file_path"] = options->value.config_options.GetConfigOrDefault(kOrtSessionOptionEpContextFilePath, ""); + auto factory = onnxruntime::VitisAIProviderFactoryCreator::Create(provider_options); if (!factory) { return OrtApis::CreateStatus(ORT_FAIL, "SessionOptionsAppendExecutionProvider_VitisAI: Failed to load shared library"); diff --git a/onnxruntime/python/onnxruntime_pybind_quant.cc b/onnxruntime/python/onnxruntime_pybind_quant.cc index 5e8e5c1a2a2fc..51a52af1b151e 100644 --- a/onnxruntime/python/onnxruntime_pybind_quant.cc +++ b/onnxruntime/python/onnxruntime_pybind_quant.cc @@ -67,7 +67,7 @@ void QuantizeMatMul4BitsBlockwise( } template -void QuantizeQDQMatMul4BitsBlockwise( +bool QuantizeQDQMatMul4BitsBlockwise( py::array_t dst, // shape: [K, N / 2] py::array_t src, // shape: [K, N] py::array_t scale, // shape: [block_per_K, N] @@ -85,7 +85,7 @@ void QuantizeQDQMatMul4BitsBlockwise( py::buffer_info scale_buf = scale.request(); py::buffer_info zp_buf = zero_points.request(); - MlasQDQQuantizeBlockwise( + return MlasQDQQuantizeBlockwise( reinterpret_cast(src_buf.ptr), reinterpret_cast(scale_buf.ptr), is_symmetric ? nullptr : reinterpret_cast(zp_buf.ptr), diff --git a/onnxruntime/python/onnxruntime_pybind_state.cc b/onnxruntime/python/onnxruntime_pybind_state.cc index e539614fd6d1d..e13285c60e69f 100644 --- a/onnxruntime/python/onnxruntime_pybind_state.cc +++ b/onnxruntime/python/onnxruntime_pybind_state.cc @@ -1114,6 +1114,9 @@ std::unique_ptr CreateExecutionProviderInstance( if (it != provider_options_map.end()) { info = it->second; } + info["ep_context_enable"] = session_options.config_options.GetConfigOrDefault(kOrtSessionOptionEpContextEnable, "0"); + info["ep_context_embed_mode"] = session_options.config_options.GetConfigOrDefault(kOrtSessionOptionEpContextEmbedMode, "1"); + info["ep_context_file_path"] = session_options.config_options.GetConfigOrDefault(kOrtSessionOptionEpContextFilePath, ""); return onnxruntime::VitisAIProviderFactoryCreator::Create(info)->CreateProvider(); #endif } else if (type == kAclExecutionProvider) { diff --git a/onnxruntime/python/tools/quantization/matmul_4bits_quantizer.py b/onnxruntime/python/tools/quantization/matmul_4bits_quantizer.py index 11a830dc6d7f5..40a4a4d26dc1c 100644 --- a/onnxruntime/python/tools/quantization/matmul_4bits_quantizer.py +++ b/onnxruntime/python/tools/quantization/matmul_4bits_quantizer.py @@ -18,31 +18,36 @@ from onnx.onnx_pb import GraphProto, ModelProto, NodeProto, TensorProto from packaging import version -from onnxruntime.capi._pybind_state import quantize_matmul_4bits +from onnxruntime.capi._pybind_state import quantize_matmul_4bits, quantize_qdq_matmul_4bits from .calibrate import CalibrationDataReader from .onnx_model import ONNXModel -from .quant_utils import attribute_to_kwarg +from .quant_utils import QuantFormat, attribute_to_kwarg logging.basicConfig(format="%(asctime)s %(name)s [%(levelname)s] - %(message)s", level=logging.INFO) logger = logging.getLogger(__name__) class WeightOnlyQuantConfig: - def __init__(self, algorithm): + def __init__(self, algorithm, quant_format): """This is the Base class for Weight Only Quant Configuration. 
Args: algorithm: weight only quantize algorithm name. + quant_format: QuantFormat{QOperator, QDQ}. + QOperator format quantizes the model with quantized operators directly. + QDQ format quantizes the model by inserting QuantizeLinear/DeQuantizeLinear on the tensor. """ self.algorithm = algorithm + self.quant_format = quant_format class RTNWeightOnlyQuantConfig(WeightOnlyQuantConfig): def __init__( self, ratios=None, + quant_format=QuantFormat.QOperator, ): """ This is a class for round-to-nearest (RTN) algorithm Weight Only Quant Configuration. @@ -51,11 +56,18 @@ def __init__( Args: ratios: percentile of clip. Defaults to {}. + quant_format (QuantFormat{QOperator, QDQ}, optional): + QOperator format quantizes the model with quantized operators directly. + QDQ format quantizes the model by inserting QuantizeLinear/DeQuantizeLinear on the tensor. + Defaults to QuantFormat.QOperator. """ + assert quant_format == QuantFormat.QOperator, "RTN only supports QOperator format" + if ratios is None: ratios = {} super().__init__( algorithm="RTN", + quant_format=quant_format, ) self.ratios = ratios @@ -69,6 +81,7 @@ def __init__( actorder=False, mse=False, perchannel=True, + quant_format=QuantFormat.QOperator, ): """ This is a class for GPTQ algorithm Weight Only Quant Configuration. @@ -87,9 +100,16 @@ def __init__( whether get scale and zero point with mse error. perchannel (bool, optional): whether quantize weight per-channel. + quant_format (QuantFormat{QOperator, QDQ}, optional): + QOperator format quantizes the model with quantized operators directly. + QDQ format quantizes the model by inserting QuantizeLinear/DeQuantizeLinear on the tensor. + Defaults to QuantFormat.QOperator. """ + assert quant_format == QuantFormat.QOperator, "GPTQ only supports QOperator format" + super().__init__( algorithm="GPTQ", + quant_format=quant_format, ) self.calibration_data_reader = calibration_data_reader self.percdamp = percdamp @@ -105,6 +125,7 @@ def __init__( block_size=128, bits=4, axis=1, + quant_format=QuantFormat.QOperator, ): """ This is a class for HQQ algorithm Weight Only Quant Configuration. @@ -112,14 +133,21 @@ def __init__( Args: block_size (int, optional): - channel number in one block to execute a GPTQ quantization iteration. + channel number in one block to execute a HQQ quantization iteration. bits (int, optional): how many bits to represent weight. axis (int, optional): 0 or 1. which axis to quantize. https://arxiv.org/pdf/2309.15531.pdf + quant_format (QuantFormat{QOperator, QDQ}, optional): + QOperator format quantizes the model with quantized operators directly. + QDQ format quantizes the model by inserting QuantizeLinear/DeQuantizeLinear on the tensor. + Defaults to QuantFormat.QOperator. """ + assert quant_format == QuantFormat.QOperator, "HQQ only supports QOperator format" + super().__init__( algorithm="HQQ", + quant_format=quant_format, ) self.block_size = block_size self.bits = bits @@ -132,8 +160,26 @@ def __init__( block_size: int = 128, is_symmetric: bool = False, accuracy_level: int | None = None, + quant_format=QuantFormat.QOperator, ): - super().__init__(algorithm="DEFAULT") + """ + This is a class for weight only affine quantization configuration. + + Args: + block_size (int, optional): + channel number in one block to execute an affine quantization iteration. + is_symmetric (bool, optional): + whether quantize weight symmetrically. + accuracy_level (int, optional): + Accuracy level of the 4-bit quantized MatMul computation. 
+ Refer to the MatMulNBits contrib op's 'accuracy_level' attribute for details. + (https://github.com/microsoft/onnxruntime/blob/main/docs/ContribOperators.md#commicrosoftmatmulnbits) + quant_format (QuantFormat{QOperator, QDQ}, optional): + QOperator format quantizes the model with quantized operators directly. + QDQ format quantizes the model by inserting QuantizeLinear/DeQuantizeLinear on the tensor. + Defaults to QuantFormat.QOperator. + """ + super().__init__(algorithm="DEFAULT", quant_format=quant_format) self.block_size = block_size self.is_symmetric = is_symmetric self.bits = 4 @@ -287,23 +333,26 @@ def quantize_internal( return w_q, scale.to(tensor.dtype), zero.to(tensor.dtype) - def quantize(self, node: NodeProto, graph_stack: list[GraphProto]): - """If the node is MatMul with fp32 const weight, quantize the weight with int4, and return the new node""" + def quantize(self, node: NodeProto, graph_stack: list[GraphProto]) -> list[NodeProto]: + """ + If the node is MatMul with fp32 const weight, quantize the weight with int4, and return the new node. + If QOperator format, return MatMulNBits. If QDQ format, return DeQuantizeLinear + MatMul. + """ if node.op_type != "MatMul": - return node # only care about MatMul for now + return [node] # only care about MatMul for now import torch logger.info(f"start to quantize {node.name} ...") - inputB = node.input[1] # noqa: N806 - b_pb, bs_graph = get_initializer(inputB, graph_stack) + input_b = node.input[1] + b_pb, bs_graph = get_initializer(input_b, graph_stack) if b_pb is None: logger.info("MatMul doesn't have const weight. Skip to quantize") - return node # only care about constant weight + return [node] # only care about constant weight b_array = onnx.numpy_helper.to_array(b_pb) if len(b_array.shape) != 2: logger.info("MatMul weight is not 2D. 
Skip to quantize") - return node # can only process 2-D matrix + return [node] # can only process 2-D matrix b_array_torch = torch.from_numpy(b_array) if torch.cuda.is_available(): b_array_torch = b_array_torch.cuda() @@ -334,7 +383,7 @@ def quantize(self, node: NodeProto, graph_stack: list[GraphProto]): b_quant = onnx.numpy_helper.from_array(packed_torch.cpu().numpy()) b_quant.name = b_pb.name + "_Q4" for input in bs_graph.input: - if input.name == inputB: + if input.name == input_b: bs_graph.input.remove(input) break @@ -366,7 +415,7 @@ def quantize(self, node: NodeProto, graph_stack: list[GraphProto]): logger.info(f"complete quantization of {node.name} ...") - return matmul_q4_node + return [matmul_q4_node] def get_initializer(name, graph_path: list[GraphProto]) -> tuple[TensorProto, GraphProto]: @@ -382,7 +431,7 @@ class DefaultWeightOnlyQuantizer: def __init__(self, config: DefaultWeightOnlyQuantConfig): self.config = config - def int4_block_quant(self, fp32weight: npt.ArrayLike) -> np.ndarray: + def int4_block_quant(self, fp32weight: npt.ArrayLike) -> tuple[np.ndarray, np.ndarray, np.ndarray]: """4b quantize fp32 weight to a blob""" if len(fp32weight.shape) != 2: @@ -390,83 +439,136 @@ def int4_block_quant(self, fp32weight: npt.ArrayLike) -> np.ndarray: rows, cols = fp32weight.shape block_size = self.config.block_size - blob_size = block_size // 2 k_blocks = (rows + block_size - 1) // block_size - padded_rows = k_blocks * block_size - pad_len = padded_rows - rows - if pad_len > 0: - fp32weight = np.pad(fp32weight, ((0, pad_len), (0, 0)), "constant") - # block wise quantization, each block comes from a single column - packed = np.zeros((cols, k_blocks, blob_size), dtype="uint8") - scales = np.zeros((cols * k_blocks), dtype=fp32weight.dtype) - zero_point = np.zeros(cols * ((k_blocks + 1) // 2), dtype="uint8") - quantize_matmul_4bits(packed, fp32weight, scales, zero_point, block_size, cols, rows, self.config.is_symmetric) + if self.config.quant_format == QuantFormat.QOperator: + blob_size = block_size // 2 + padded_rows = k_blocks * block_size + pad_len = padded_rows - rows + if pad_len > 0: + fp32weight = np.pad(fp32weight, ((0, pad_len), (0, 0)), "constant") + + # block wise quantization, each block comes from a single column + packed = np.zeros((cols, k_blocks, blob_size), dtype="uint8") + zero_point = np.zeros(cols * ((k_blocks + 1) // 2), dtype="uint8") + scales = np.zeros((cols * k_blocks), dtype=fp32weight.dtype) + quantize_matmul_4bits( + packed, fp32weight, scales, zero_point, block_size, cols, rows, self.config.is_symmetric + ) + else: + packed = np.zeros((rows * cols + 1) // 2, dtype="uint8") + zero_point = np.zeros((cols * k_blocks + 1) // 2, dtype="uint8") + scales = np.zeros((k_blocks, cols), dtype=fp32weight.dtype) + quantize_qdq_matmul_4bits( + packed, fp32weight, scales, zero_point, block_size, cols, rows, self.config.is_symmetric + ) return (packed, scales, zero_point) - def quantize(self, node: NodeProto, graph_stack: list[GraphProto]) -> NodeProto: - """If the node is MatMul with fp32 const weight, quantize the weight with int4, and return the new node""" + def quantize(self, node: NodeProto, graph_stack: list[GraphProto]) -> list[NodeProto]: + """ + If the node is MatMul with fp32 const weight, quantize the weight with int4, and return the new node. + If QOperator format, return MatMulNbits. If QDQ format, return DeQuantizeLinear + MatMul. 
+ """ if node.op_type != "MatMul": - return node # only care about MatMul for now + return [node] # only care about MatMul for now logger.info(f"start to quantize {node.name} ...") - inputB = node.input[1] # noqa: N806 - B, Bs_graph = get_initializer(inputB, graph_stack) # noqa: N806 - if B is None: + qtype = TensorProto.INT4 if self.config.is_symmetric else TensorProto.UINT4 + input_b = node.input[1] + b_tensor, b_graph = get_initializer(input_b, graph_stack) + if b_tensor is None: logger.info("MatMul doesn't have const weight. Skip to quantize") - return node # only care about constant weight + return [node] # only care about constant weight - B_array = onnx.numpy_helper.to_array(B) # noqa: N806 - if len(B_array.shape) != 2: + b_ndarray = onnx.numpy_helper.to_array(b_tensor) + if len(b_ndarray.shape) != 2: logger.info("MatMul weight is not 2D. Skip to quantize") - return node # can only process 2-D matrix - - packed, scales, zero_points = self.int4_block_quant(B_array) - B_quant = onnx.numpy_helper.from_array(packed) # noqa: N806 - B_quant.name = B.name + "_Q4" - for input in Bs_graph.input: - if input.name == inputB: - Bs_graph.input.remove(input) - break + return [node] # can only process 2-D matrix - scales_tensor = onnx.numpy_helper.from_array(scales) - scales_tensor.name = B.name + "_scales" - Bs_graph.initializer.extend([B_quant, scales_tensor]) + packed, scales, zero_points = self.int4_block_quant(b_ndarray) - input_names = [node.input[0], B_quant.name, scales_tensor.name] - if not self.config.is_symmetric: - zp_tensor = onnx.numpy_helper.from_array(zero_points) - zp_tensor.name = B.name + "_zero_points" - Bs_graph.initializer.extend([zp_tensor]) - input_names.append(zp_tensor.name) + if self.config.quant_format == QuantFormat.QOperator: + b_quant = onnx.numpy_helper.from_array(packed, b_tensor.name + "_Q4") + scales_tensor = onnx.numpy_helper.from_array(scales, b_tensor.name + "_scales") + else: + b_quant = onnx.helper.make_tensor(b_tensor.name + "_DQ_Q4", qtype, b_ndarray.shape, packed.tobytes(), True) + scales_tensor = onnx.numpy_helper.from_array(scales, b_tensor.name + "_DQ_scales") - kwargs = {} - rows, cols = B_array.shape - kwargs["K"] = rows - kwargs["N"] = cols - kwargs["bits"] = 4 - kwargs["block_size"] = self.config.block_size - if self.config.accuracy_level is not None: - kwargs["accuracy_level"] = self.config.accuracy_level + for input in b_graph.input: + if input.name == input_b: + b_graph.input.remove(input) + break - matmul_q4_node = onnx.helper.make_node( - "MatMulNBits", - inputs=input_names, - outputs=[node.output[0]], - name=node.name + "_Q4" if node.name else "", - domain="com.microsoft", - **kwargs, - ) + b_graph.initializer.extend([b_quant, scales_tensor]) + + output_nodes = [] + + if self.config.quant_format == QuantFormat.QOperator: + input_names = [node.input[0], b_quant.name, scales_tensor.name] + if not self.config.is_symmetric: + zp_tensor = onnx.numpy_helper.from_array(zero_points, b_tensor.name + "_zero_points") + input_names.append(zp_tensor.name) + b_graph.initializer.extend([zp_tensor]) + kwargs = {} + rows, cols = b_ndarray.shape + kwargs["K"] = rows + kwargs["N"] = cols + kwargs["bits"] = 4 + kwargs["block_size"] = self.config.block_size + if self.config.accuracy_level is not None: + kwargs["accuracy_level"] = self.config.accuracy_level + + matmul_q4_node = onnx.helper.make_node( + "MatMulNBits", + inputs=input_names, + outputs=[node.output[0]], + name=node.name + "_Q4" if node.name else "", + domain="com.microsoft", + **kwargs, + ) - 
logger.info(f"complete quantization of {node.name} ...") + output_nodes.append(matmul_q4_node) + else: + dq_input_names = [b_quant.name, scales_tensor.name] + dq_output_names = [b_quant.name + "_output"] + matmul_input_names = [node.input[0], dq_output_names[0]] + matmul_output_names = [node.output[0]] + if not self.config.is_symmetric: + zp_tensor = onnx.helper.make_tensor( + b_tensor.name + "_DQ_zero_points", qtype, scales.shape, zero_points.tobytes(), True + ) + dq_input_names.append(zp_tensor.name) + b_graph.initializer.extend([zp_tensor]) + dq_kwargs = {"axis": 0, "block_size": self.config.block_size} + dq_node = onnx.helper.make_node( + "DequantizeLinear", + inputs=dq_input_names, + outputs=dq_output_names, + name=node.name + "_DQ_Q4" if node.name else "", + **dq_kwargs, + ) + matmul_node = onnx.helper.make_node( + "MatMul", + inputs=matmul_input_names, + outputs=matmul_output_names, + name=node.name + "_matmul_Q4" if node.name else "", + ) + output_nodes.extend([dq_node, matmul_node]) - return matmul_q4_node + logger.info(f"complete quantization of {node.name} ...") + return output_nodes class MatMul4BitsQuantizer: - """Perform 4b quantization of constant MatMul weights""" + """ + Perform 4b quantization of constant MatMul weights. + If algo_config.quant_format is QOperator, the quantized weight is stored in a MatMulNBits node, which relaces the + MatMul node. + If algo_config.quant_format is QDQ, the quantized weight is stored in a DeQuantizeLinear node. The MatMul node is + replaced by the DequantizeLinear + MatMul nodes. + """ def __init__( self, @@ -475,7 +577,8 @@ def __init__( is_symmetric: bool = False, accuracy_level: int | None = None, nodes_to_exclude=None, - algo_config: WeightOnlyQuantConfig = None, + quant_format=QuantFormat.QOperator, + algo_config: WeightOnlyQuantConfig | None = None, ): if nodes_to_exclude is None: nodes_to_exclude = [] @@ -488,7 +591,10 @@ def __init__( self.node_quantizer = None if algo_config is None: algo_config = DefaultWeightOnlyQuantConfig( - block_size=block_size, is_symmetric=is_symmetric, accuracy_level=accuracy_level + block_size=block_size, + is_symmetric=is_symmetric, + accuracy_level=accuracy_level, + quant_format=quant_format, ) self.algo_config = algo_config if algo_config.algorithm == "HQQ": @@ -526,15 +632,15 @@ def _process_subgraph(self, graph_stack: list[GraphProto]): node = onnx.helper.make_node( # noqa: PLW2901 node.op_type, node.input, node.output, name=node.name, **kwargs ) - out_node = None + out_nodes = [] if node.name in self.nodes_to_exclude: logger.info(f"exclude to quantize {node.name} as specified by nodes_to_exclude...") - out_node = node + out_nodes = [node] elif self.algo_config is not None and self.algo_config.algorithm == "HQQ": - out_node = self.node_quantizer.quantize(node, graph_stack) + out_nodes = self.node_quantizer.quantize(node, graph_stack) else: - out_node = self.node_quantizer.quantize(node, graph_stack) - new_nodes.append(out_node) + out_nodes = self.node_quantizer.quantize(node, graph_stack) + new_nodes.extend(out_nodes) graph.ClearField("node") graph.node.extend(new_nodes) @@ -688,6 +794,15 @@ def parse_args(): default=[], help="Specify the nodes to be excluded from quantization with node names", ) + parser.add_argument( + "--quant_format", + default="QOperator", + type=QuantFormat, + choices=list(QuantFormat), + help="QuantFormat {QOperator, QDQ}" + "QOperator format quantizes the model with quantized operators directly." 
+ "QDQ format quantize the model by inserting DeQuantizeLinear before the MatMul.", + ) return parser.parse_args() @@ -699,6 +814,7 @@ def parse_args(): input_model_path = args.input_model output_model_path = args.output_model + quant_format = args.quant_format if os.path.exists(output_model_path): logger.error(f"file {output_model_path} already exists") @@ -713,7 +829,10 @@ def parse_args(): quant_config = HQQWeightOnlyQuantConfig(block_size=args.block_size, bits=args.bits) elif args.quant_method == "default": quant_config = DefaultWeightOnlyQuantConfig( - block_size=args.block_size, is_symmetric=args.symmetric, accuracy_level=args.accuracy_level + block_size=args.block_size, + is_symmetric=args.symmetric, + accuracy_level=args.accuracy_level, + quant_format=quant_format, ) elif args.quant_method == "rtn": quant_config = RTNWeightOnlyQuantConfig() diff --git a/onnxruntime/python/tools/tensorrt/perf/build/build_image.py b/onnxruntime/python/tools/tensorrt/perf/build/build_image.py index 9ee8f27df5c99..2f335009b59c6 100644 --- a/onnxruntime/python/tools/tensorrt/perf/build/build_image.py +++ b/onnxruntime/python/tools/tensorrt/perf/build/build_image.py @@ -15,12 +15,10 @@ from typing import List, Optional TRT_DOCKER_FILES = { - "8.4.cuda_11_6_cudnn_8": "tools/ci_build/github/linux/docker/Dockerfile.ubuntu_cuda11_6_tensorrt8_4", - "8.5.cuda_11_8_cudnn_8": "tools/ci_build/github/linux/docker/Dockerfile.ubuntu_cuda11_8_tensorrt8_5", "8.6.cuda_11_8_cudnn_8": "tools/ci_build/github/linux/docker/Dockerfile.ubuntu_cuda11_8_tensorrt8_6", "8.6.cuda_12_3_cudnn_9": "tools/ci_build/github/linux/docker/Dockerfile.ubuntu_cuda12_3_tensorrt8_6", - "10.0.cuda_11_8_cudnn_8": "tools/ci_build/github/linux/docker/Dockerfile.ubuntu_cuda11_8_tensorrt10_0", - "10.0.cuda_12_4_cudnn_9": "tools/ci_build/github/linux/docker/Dockerfile.ubuntu_cuda12_4_tensorrt10_0", + "10.2.cuda_11_8_cudnn_8": "tools/ci_build/github/linux/docker/Dockerfile.ubuntu_cuda11_tensorrt10", + "10.2.cuda_12_5_cudnn_9": "tools/ci_build/github/linux/docker/Dockerfile.ubuntu_cuda12_tensorrt10", "BIN": "tools/ci_build/github/linux/docker/Dockerfile.ubuntu_tensorrt_bin", } diff --git a/onnxruntime/python/tools/transformers/models/llama/README.md b/onnxruntime/python/tools/transformers/models/llama/README.md index 6fba98c14e792..cd8a8756d681e 100644 --- a/onnxruntime/python/tools/transformers/models/llama/README.md +++ b/onnxruntime/python/tools/transformers/models/llama/README.md @@ -27,8 +27,6 @@ Please note the package versions needed for using LLaMA-2 in the `requirements.t - Note that `torch` with CUDA enabled is not installed automatically. This is because `torch` should be installed with the CUDA version used on your machine. Please visit [the PyTorch website](https://pytorch.org/get-started/locally/) to download the `torch` version that is used with the CUDA version installed on your machine and satisfies the requirement listed in the file. 
- `requirements-quant.txt` - For running the SmoothQuant algorithm using [Intel's Neural Compressor](https://github.com/intel/neural-compressor) -- `requirements-70b-model.txt` - - For running the LLaMA-2 70B model on multiple GPUs - `requirements.txt` - Package versions needed in each of the above files @@ -221,18 +219,6 @@ $ python3 -m models.llama.convert_to_onnx -m meta-llama/Llama-2-7b-hf --output l $ python3 -m onnxruntime.transformers.models.llama.convert_to_onnx -m meta-llama/Llama-2-7b-hf --output llama2-7b-int4-cpu --precision int4 --quantization_method blockwise --execution_provider cpu --use_gqa ``` -Export LLaMA-2 70B sharded model into 4 partitions -``` -# From source: -# 1. Install necessary packages from requirements-70b-model.txt -$ pip install -r requirements-70b-model.txt - -# 2. Build ONNX Runtime from source with NCCL enabled. Here is a sample command: -$ ./build.sh --config Release --use_cuda --cuda_home /usr/local/cuda-12.2 --cudnn_home /usr/local/cuda-12.2 --build_wheel --cuda_version=12.2 --parallel --skip_tests --enable_nccl --nccl_home /usr/local/cuda-12.2 --use_mpi --mpi_home=/usr/lib/x86_64-linux-gnu/ - -# 3. Shard and export the LLaMA-2 70B model. With FP16, you will need at least 140GB of GPU memory to load the model. Therefore, you will need at least 4 40GB A100 GPUs or 2 80GB A100 GPUs to shard the PyTorch model and export each shard to ONNX. Here is an example command: -$ CUDA_VISIBLE_DEVICES=0,1,2,3 bash convert_70b_model.sh 4 -m meta-llama/Llama-2-70b-hf --output llama2-70b-distributed --precision fp16 --execution_provider cuda --use_gqa -``` ## Parity Checking LLaMA-2 @@ -395,18 +381,6 @@ CUDA_VISIBLE_DEVICES=4 python3 -m models.llama.benchmark \ --device cuda ``` -9. ONNX Runtime, FP16, convert_to_onnx, LLaMA-2 70B shard to 4 GPUs -``` -CUDA_VISIBLE_DEVICES=4,5,6,7 bash benchmark_70b_model.sh 4 \ - --benchmark-type ort-convert-to-onnx \ - --ort-model-path ./llama2-70b-dis/rank_{}_Llama-2-70b-hf_decoder_merged_model_fp16.onnx \ - --model-name meta-llama/Llama-2-70b-hf \ - --cache-dir ./model_cache \ - --precision fp16 \ - --device cuda \ - --warmup-runs 5 \ - --num-runs 100 -``` You can profile a variant by adding the `--profile` flag and providing one batch size and sequence length combination. 
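As a companion to the `quant_format` option wired through `matmul_4bits_quantizer.py` above, here is a minimal sketch of driving the 4-bit weight-only quantizer from Python with the new QDQ format. The model paths and block size are placeholders, not part of this change; the API calls mirror those exercised by the updated unit tests.

```python
from pathlib import Path

from onnxruntime.quantization import matmul_4bits_quantizer, quant_utils

# Placeholder paths; point these at a real fp32 ONNX model (e.g. an exported LLaMA-2 decoder).
# Note: the QDQ path emits block-quantized DequantizeLinear, which needs opset 21+ in the model.
model_fp32_path = "llama2-7b-fp32/model.onnx"
model_int4_path = "llama2-7b-int4-qdq/model.onnx"

model = quant_utils.load_model_with_shape_infer(Path(model_fp32_path))

# QDQ stores the 4-bit weight behind a DequantizeLinear feeding a plain MatMul,
# instead of fusing everything into a single MatMulNBits node (QOperator).
quant_config = matmul_4bits_quantizer.DefaultWeightOnlyQuantConfig(
    block_size=32,
    is_symmetric=True,
    quant_format=quant_utils.QuantFormat.QDQ,
)

quant = matmul_4bits_quantizer.MatMul4BitsQuantizer(model, algo_config=quant_config)
quant.process()
quant.model.save_model_to_file(model_int4_path, True)  # second arg: save weights as external data
```

The resulting graph should contain DequantizeLinear + MatMul pairs rather than MatMulNBits nodes, which is what the new `test_quantize_matmul_int4_*_qdq` tests below verify.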
diff --git a/onnxruntime/python/tools/transformers/models/llama/benchmark_70b_model.sh b/onnxruntime/python/tools/transformers/models/llama/benchmark_70b_model.sh deleted file mode 100644 index 38f1916456658..0000000000000 --- a/onnxruntime/python/tools/transformers/models/llama/benchmark_70b_model.sh +++ /dev/null @@ -1,12 +0,0 @@ -#!/bin/bash - -NUM_GPUS=${1:-1} - -MPI="mpirun --allow-run-as-root - -mca btl_openib_warn_no_device_params_found 0 -mca pml ob1 -mca btl ^openib -mca btl_tcp_if_include eth0 - --tag-output --npernode $NUM_GPUS --bind-to numa - -x MIOPEN_FIND_MODE=1" - -CMD="$MPI python benchmark.py ${@:2}" - -$CMD \ No newline at end of file diff --git a/onnxruntime/python/tools/transformers/models/llama/convert_70b_model.sh b/onnxruntime/python/tools/transformers/models/llama/convert_70b_model.sh deleted file mode 100644 index 637d15c10e0c7..0000000000000 --- a/onnxruntime/python/tools/transformers/models/llama/convert_70b_model.sh +++ /dev/null @@ -1,12 +0,0 @@ -#!/bin/bash - -NUM_GPUS=${1:-1} - -MPI="mpirun --allow-run-as-root - -mca btl_openib_warn_no_device_params_found 0 -mca pml ob1 -mca btl ^openib -mca btl_tcp_if_include eth0 - --tag-output --npernode $NUM_GPUS --bind-to numa - -x MIOPEN_FIND_MODE=1" - -CMD="$MPI python convert_to_onnx.py ${@:2}" - -$CMD \ No newline at end of file diff --git a/onnxruntime/python/tools/transformers/models/llama/requirements-70b-model.txt b/onnxruntime/python/tools/transformers/models/llama/requirements-70b-model.txt deleted file mode 100644 index 572cfdb71be4a..0000000000000 --- a/onnxruntime/python/tools/transformers/models/llama/requirements-70b-model.txt +++ /dev/null @@ -1,4 +0,0 @@ --r requirements.txt -git+https://github.com/frankdongms/transformers.git@frdong/shard_llama -mpi4py -psutil \ No newline at end of file diff --git a/onnxruntime/test/flatbuffers/flatbuffer_utils_test.cc b/onnxruntime/test/flatbuffers/flatbuffer_utils_test.cc index 32f2da806be3b..467c5e773589a 100644 --- a/onnxruntime/test/flatbuffers/flatbuffer_utils_test.cc +++ b/onnxruntime/test/flatbuffers/flatbuffer_utils_test.cc @@ -12,7 +12,6 @@ #include "core/graph/graph_flatbuffers_utils.h" #include "core/framework/tensorprotoutils.h" #include "core/providers/cpu/cpu_execution_provider.h" - #include "test/flatbuffers/flatbuffers_utils_test.fbs.h" #include "test/util/include/asserts.h" @@ -116,6 +115,10 @@ ONNX_NAMESPACE::TensorProto CreateInitializer(const std::string& name, ORT_THROW("Unsupported data type: ", data_type); } + if constexpr (endian::native != endian::little) { + utils::ConvertRawDataInTensorProto(&tp); + } + return tp; } @@ -258,6 +261,9 @@ TEST(FlatbufferUtilsTest, ExternalWriteReadWithLoadInitializers) { for (const auto* fbs_tensor : *fbs_tensors2) { ONNX_NAMESPACE::TensorProto initializer; ASSERT_STATUS_OK(LoadInitializerOrtFormat(*fbs_tensor, initializer, options, reader)); + if constexpr (endian::native != endian::little) { + utils::ConvertRawDataInTensorProto(&initializer); + } loaded_initializers.emplace_back(std::move(initializer)); // also check that the loaded flatbuffer tensors have accurately written to the external_data_offset field if (fbs_tensor->data_type() != fbs::TensorDataType::STRING && fbs_tensor->name()->str() != "tensor_32_small") { diff --git a/onnxruntime/test/framework/sparse_kernels_test.cc b/onnxruntime/test/framework/sparse_kernels_test.cc index fa42bb6e96cd5..7bd6b47f52b7d 100644 --- a/onnxruntime/test/framework/sparse_kernels_test.cc +++ b/onnxruntime/test/framework/sparse_kernels_test.cc @@ -705,6 +705,9 @@ struct 
InsertIndices { // Conversion on the fly to the target data type std::vector indices(indices_data.cbegin(), indices_data.cend()); indices_tp.mutable_raw_data()->assign(reinterpret_cast(indices.data()), indices.size() * sizeof(T)); + if constexpr (endian::native != endian::little) { + utils::ConvertRawDataInTensorProto((ONNX_NAMESPACE::TensorProto*)&indices_tp); + } } } }; @@ -837,7 +840,7 @@ static void TestConversion( template static void RawDataWriter(const std::vector& values, TensorProto& tp, TensorProto_DataType datatype) { tp.set_data_type(datatype); - tp.set_raw_data(values.data(), values.size() * sizeof(T)); + utils::SetRawDataInTensorProto(tp, values.data(), values.size() * sizeof(T)); } int64_t ActualSize(const TensorProto& actual) { diff --git a/onnxruntime/test/framework/tensorutils_test.cc b/onnxruntime/test/framework/tensorutils_test.cc index 05bdb3a9a033d..6821f582ce2de 100644 --- a/onnxruntime/test/framework/tensorutils_test.cc +++ b/onnxruntime/test/framework/tensorutils_test.cc @@ -30,7 +30,7 @@ void TestUnpackFloatTensor(TensorProto_DataType type, const std::filesystem::pat for (int i = 0; i < 4; ++i) { memcpy(rawdata + i * sizeof(T), &(f[i]), sizeof(T)); } - float_tensor_proto.set_raw_data(rawdata, len); + utils::SetRawDataInTensorProto(float_tensor_proto, rawdata, len); T float_data2[4]; auto status = UnpackTensor(float_tensor_proto, model_path, float_data2, 4); EXPECT_TRUE(status.IsOK()) << status.ErrorMessage(); @@ -102,8 +102,25 @@ std::vector CreateValues() { return {BFloat16(0.f), BFloat16(1.f), BFloat16(2.f), BFloat16(3.f)}; } +template +void ConvertEndianessForVector(const std::vector& test_data) { + const size_t element_size = sizeof(T); + const size_t num_elements = test_data.size(); + char* bytes = reinterpret_cast(const_cast(test_data.data())); + for (size_t i = 0; i < num_elements; ++i) { + char* start_byte = bytes + i * element_size; + char* end_byte = start_byte + element_size - 1; + for (size_t count = 0; count < element_size / 2; ++count) { + std::swap(*start_byte++, *end_byte--); + } + } +} + template void WriteDataToFile(FILE* fp, const std::vector& test_data) { + if constexpr (endian::native != endian::little) { + ConvertEndianessForVector(test_data); + } size_t size_in_bytes = test_data.size() * sizeof(T); ASSERT_EQ(size_in_bytes, fwrite(test_data.data(), 1, size_in_bytes, fp)); } @@ -147,6 +164,9 @@ void UnpackAndValidate(const TensorProto& tensor_proto, const std::filesystem::p std::vector val(test_data.size()); auto st = utils::UnpackTensor(tensor_proto, model_path, val.data(), test_data.size()); ASSERT_TRUE(st.IsOK()) << st.ErrorMessage(); + if constexpr (endian::native != endian::little) { + ConvertEndianessForVector(val); + } // Validate data for (size_t i = 0; i < test_data.size(); i++) { @@ -325,6 +345,9 @@ static void TestConstantNodeConversionWithExternalData(TensorProto_DataType type std::vector val(test_data.size()); auto st = utils::UnpackTensor(tp, model_path, val.data(), test_data.size()); ASSERT_TRUE(st.IsOK()) << st.ErrorMessage(); + if constexpr (endian::native != endian::little) { + ConvertEndianessForVector(val); + } for (size_t i = 0; i < test_data.size(); i++) { ASSERT_EQ(val[i], test_data[i]); } diff --git a/onnxruntime/test/framework/test_tensor_loader.cc b/onnxruntime/test/framework/test_tensor_loader.cc index 17edad73085c9..73bf351b6c556 100644 --- a/onnxruntime/test/framework/test_tensor_loader.cc +++ b/onnxruntime/test/framework/test_tensor_loader.cc @@ -104,6 +104,18 @@ static void run_external_data_test() { 
std::unique_ptr file_deleter(const_cast(filename.c_str()), DeleteFileFromDisk); float test_data[] = {1.0f, 2.2f, 3.5f}; + if constexpr (endian::native != endian::little) { + const int element_size = sizeof(float); + char* bytes = reinterpret_cast(test_data); + const size_t num_elements = std::size(test_data); + for (size_t i = 0; i < num_elements; ++i) { + char* start_byte = bytes + i * element_size; + char* end_byte = start_byte + element_size - 1; + for (size_t count = 0; count < element_size / 2; ++count) { + std::swap(*start_byte++, *end_byte--); + } + } + } ASSERT_EQ(sizeof(test_data), fwrite(test_data, 1, sizeof(test_data), fp)); ASSERT_EQ(0, fclose(fp)); // construct a tensor proto @@ -128,8 +140,12 @@ static void run_external_data_test() { len = GetCurrentDirectoryW(len, (ORTCHAR_T*)cwd.data()); ASSERT_NE(len, (DWORD)0); cwd.append(ORT_TSTR("\\fake.onnx")); +#else +#if defined(_AIX) + char* p = getcwd(nullptr, PATH_MAX); #else char* p = getcwd(nullptr, 0); +#endif ASSERT_NE(p, nullptr); cwd = p; free(p); diff --git a/onnxruntime/test/mlas/bench/bench_q4dq.cpp b/onnxruntime/test/mlas/bench/bench_q4dq.cpp index 00234ecfd2ce2..9d15c9a6bf994 100644 --- a/onnxruntime/test/mlas/bench/bench_q4dq.cpp +++ b/onnxruntime/test/mlas/bench/bench_q4dq.cpp @@ -69,6 +69,7 @@ static void BM_QDQBlockwiseQuantizer_TransposeColumnwise(benchmark::State& state int N = state.range(1); int quant_block_size = state.range(2); int threads = state.range(3); + bool add8 = state.range(4) != 0; int quant_num_M = (M + quant_block_size - 1) / quant_block_size; int blob_size = (quant_block_size + 1) / 2; size_t scale_size = quant_num_M * N; @@ -87,12 +88,22 @@ static void BM_QDQBlockwiseQuantizer_TransposeColumnwise(benchmark::State& state onnxruntime::concurrency::CreateThreadPool(&onnxruntime::Env::Default(), tpo, onnxruntime::concurrency::ThreadPoolType::INTRA_OP)); - for (auto _ : state) { - benchmark::DoNotOptimize(dst.data()); - MlasQDQTransposeBlockwiseQuantized( - dst.data(), scales.data(), zero_points.data(), dst_T.data(), scales_T.data(), zero_points_T.data(), - true, M, N, quant_block_size, tp.get()); - benchmark::ClobberMemory(); + if (add8) { + for (auto _ : state) { + benchmark::DoNotOptimize(dst.data()); + MlasQDQTransposeBlockwiseQuantized( + dst.data(), scales.data(), zero_points.data(), dst_T.data(), scales_T.data(), zero_points_T.data(), + true, M, N, quant_block_size, tp.get()); + benchmark::ClobberMemory(); + } + } else { + for (auto _ : state) { + benchmark::DoNotOptimize(dst.data()); + MlasQDQTransposeBlockwiseQuantized( + dst.data(), scales.data(), zero_points.data(), dst_T.data(), scales_T.data(), zero_points_T.data(), + true, M, N, quant_block_size, tp.get()); + benchmark::ClobberMemory(); + } } } @@ -113,6 +124,6 @@ BENCHMARK(BM_MlasQuantizeBlockwise) BENCHMARK(BM_QDQBlockwiseQuantizer_TransposeColumnwise) ->UseRealTime() ->Apply([](benchmark::internal::Benchmark* b) { - b->ArgNames({"M", "N", "quant_block_size", "threads"}); - b->ArgsProduct({{1024, 4096}, {4096, 4095}, {64, 128}, {2, 8, 16}}); + b->ArgNames({"M", "N", "quant_block_size", "threads", "add8"}); + b->ArgsProduct({{1024, 4096}, {4096, 4095}, {64, 128}, {2, 8, 16}, {0, 1}}); }); diff --git a/onnxruntime/test/mlas/unittest/test_blockq4.cpp b/onnxruntime/test/mlas/unittest/test_blockq4.cpp index b466e883059f4..f75002f715154 100644 --- a/onnxruntime/test/mlas/unittest/test_blockq4.cpp +++ b/onnxruntime/test/mlas/unittest/test_blockq4.cpp @@ -127,13 +127,22 @@ class MlasBlockwiseQdqTest : public MlasTestBase { columnwise, rows, 
columns, columns, threadpool_ptr); if (columnwise) { - MlasQDQQuantizeBlockwise( + bool signed_quant = MlasQDQQuantizeBlockwise( transposed, qdq_scales, qdq_zp, qdq_weights, true, rows, columns, block_size, threadpool_ptr); - MlasQDQTransposeBlockwiseQuantized( - qdq_weights, qdq_scales, qdq_zp, qdq_weights_T, qdq_scales_T, qdq_zp_T, - true, rows, columns, block_size, threadpool_ptr); + ASSERT_EQ(symmetric, signed_quant) << "symmetric quantization should be signed"; + + if (symmetric) { + MlasQDQTransposeBlockwiseQuantized( + qdq_weights, qdq_scales, qdq_zp, qdq_weights_T, qdq_scales_T, qdq_zp_T, + true, rows, columns, block_size, threadpool_ptr); + + } else { + MlasQDQTransposeBlockwiseQuantized( + qdq_weights, qdq_scales, qdq_zp, qdq_weights_T, qdq_scales_T, qdq_zp_T, + true, rows, columns, block_size, threadpool_ptr); + } } for (int c = 0; c < columns; c++) { diff --git a/onnxruntime/test/onnx/TestCase.h b/onnxruntime/test/onnx/TestCase.h index 0cb92056d378e..745a1fe9eeb50 100644 --- a/onnxruntime/test/onnx/TestCase.h +++ b/onnxruntime/test/onnx/TestCase.h @@ -53,7 +53,8 @@ class TestModelInfo { public: virtual const std::filesystem::path& GetModelUrl() const = 0; virtual std::filesystem::path GetDir() const { - return GetModelUrl().parent_path(); + const auto& p = GetModelUrl(); + return p.has_parent_path() ? p.parent_path() : std::filesystem::current_path(); } virtual const std::string& GetNodeName() const = 0; virtual const ONNX_NAMESPACE::ValueInfoProto* GetInputInfoFromModel(size_t i) const = 0; diff --git a/onnxruntime/test/onnx/main.cc b/onnxruntime/test/onnx/main.cc index fc29756a1ff98..9886d98dcc6d6 100644 --- a/onnxruntime/test/onnx/main.cc +++ b/onnxruntime/test/onnx/main.cc @@ -8,6 +8,8 @@ #include #ifdef _WIN32 #include "getopt.h" +#elif defined(_AIX) +#include #else #include #include diff --git a/onnxruntime/test/onnx/tensorprotoutils.cc b/onnxruntime/test/onnx/tensorprotoutils.cc index 5df055f862a86..50ab2290c6456 100644 --- a/onnxruntime/test/onnx/tensorprotoutils.cc +++ b/onnxruntime/test/onnx/tensorprotoutils.cc @@ -6,6 +6,7 @@ #include #include #include +#include #include "mem_buffer.h" #include "core/common/safeint.h" @@ -68,11 +69,22 @@ static void UnpackTensorWithRawData(const void* raw_data, size_t raw_data_length ORT_CXX_API_THROW(MakeString("UnpackTensor: the pre-allocated size does not match the raw data size, expected ", expected_size_in_bytes, ", got ", raw_data_length), OrtErrorCode::ORT_FAIL); + memcpy(p_data, raw_data, raw_data_length); if constexpr (endian::native != endian::little) { - ORT_CXX_API_THROW("UnpackTensorWithRawData only handles little-endian native byte order for now.", - OrtErrorCode::ORT_NOT_IMPLEMENTED); + /* Convert Endianness */ + char* bytes = reinterpret_cast(p_data); + size_t element_size = sizeof(T); + size_t num_elements = raw_data_length / element_size; + + for (size_t i = 0; i < num_elements; ++i) { + char* start_byte = bytes + i * element_size; + char* end_byte = start_byte + element_size - 1; + /* keep swapping */ + for (size_t count = 0; count < element_size / 2; ++count) { + std::swap(*start_byte++, *end_byte--); + } + } } - memcpy(p_data, raw_data, raw_data_length); } template <> diff --git a/onnxruntime/test/optimizer/graph_transform_test.cc b/onnxruntime/test/optimizer/graph_transform_test.cc index 2bfa57a2ceb9e..3e4e845440117 100755 --- a/onnxruntime/test/optimizer/graph_transform_test.cc +++ b/onnxruntime/test/optimizer/graph_transform_test.cc @@ -4972,8 +4972,8 @@ TEST_F(GraphTransformationTests, CseWithConstantOfShape) 
{ TensorProto value_tensor; value_tensor.add_dims(1); float value = 2.333f; - value_tensor.set_raw_data(reinterpret_cast(&value), sizeof(float)); value_tensor.set_data_type(ONNX_NAMESPACE::TensorProto_DataType_FLOAT); + utils::SetRawDataInTensorProto(value_tensor, reinterpret_cast(&value), sizeof(float)); builder.AddNode("ConstantOfShape", {shape_out_1}, {constant_of_shape_out_1}).AddAttribute("value", value_tensor); builder.AddNode("ConstantOfShape", {shape_out_2}, {constant_of_shape_out_2}).AddAttribute("value", value_tensor); builder.AddNode("Mul", {input_arg, constant_of_shape_out_1}, {mul_out_1}); diff --git a/onnxruntime/test/optimizer/graph_transform_test_builder.cc b/onnxruntime/test/optimizer/graph_transform_test_builder.cc index 73c8b3f119103..2cbfbbb317642 100644 --- a/onnxruntime/test/optimizer/graph_transform_test_builder.cc +++ b/onnxruntime/test/optimizer/graph_transform_test_builder.cc @@ -61,7 +61,7 @@ NodeArg* ModelTestBuilder::MakeInitializer(gsl::span shape, ONNX_NAMESPACE::TensorProto tensor_proto; tensor_proto.set_name(name); tensor_proto.set_data_type(elem_type); - tensor_proto.set_raw_data(raw_data.data(), raw_data.size()); + utils::SetRawDataInTensorProto(tensor_proto, raw_data.data(), raw_data.size()); for (auto& dim : shape) { tensor_proto.add_dims(dim); diff --git a/onnxruntime/test/optimizer/graph_transform_test_builder.h b/onnxruntime/test/optimizer/graph_transform_test_builder.h index 0282d09f340b2..6214094a26c4f 100644 --- a/onnxruntime/test/optimizer/graph_transform_test_builder.h +++ b/onnxruntime/test/optimizer/graph_transform_test_builder.h @@ -13,6 +13,7 @@ #include "core/framework/int4.h" #include "core/optimizer/graph_transformer_level.h" #include "core/graph/onnx_protobuf.h" +#include "core/framework/tensorprotoutils.h" #include "test/framework/test_utils.h" #include "test/common/tensor_op_test_utils.h" #include "test/framework/test_utils.h" @@ -249,7 +250,7 @@ class ModelTestBuilder { tensor_proto.set_data_type(utils::ToTensorProtoElementType()); std::unique_ptr data_buffer = std::make_unique(data.size()); for (size_t i = 0; i < data.size(); ++i) data_buffer[i] = data[i]; - tensor_proto.set_raw_data(data_buffer.get(), data.size()); + utils::SetRawDataInTensorProto(tensor_proto, data_buffer.get(), data.size()); for (auto& dim : shape) { tensor_proto.add_dims(dim); diff --git a/onnxruntime/test/optimizer/initializer_test.cc b/onnxruntime/test/optimizer/initializer_test.cc index 522e96e762d5a..391942acfca35 100644 --- a/onnxruntime/test/optimizer/initializer_test.cc +++ b/onnxruntime/test/optimizer/initializer_test.cc @@ -163,8 +163,8 @@ void TestInitializerRawData() { tensor_proto.set_name("OptimizerInitializerTest_RawData"); tensor_proto.add_dims(3); tensor_proto.add_dims(4); - tensor_proto.set_raw_data(data.data(), data.size() * sizeof(T)); + utils::SetRawDataInTensorProto(tensor_proto, data.data(), data.size() * sizeof(T)); const Initializer init(tensor_proto, std::filesystem::path()); for (size_t idx = 0; idx < data.size(); idx++) { diff --git a/onnxruntime/test/optimizer/nchwc_optimizer_test.cc b/onnxruntime/test/optimizer/nchwc_optimizer_test.cc index 8e4edc9e0abbb..538f60040418c 100644 --- a/onnxruntime/test/optimizer/nchwc_optimizer_test.cc +++ b/onnxruntime/test/optimizer/nchwc_optimizer_test.cc @@ -6,6 +6,7 @@ #include "core/mlas/inc/mlas.h" #include "core/session/environment.h" #include "core/session/inference_session.h" +#include "core/framework/tensorprotoutils.h" #include "test/compare_ortvalue.h" #include "test/test_environment.h" 
#include "test/framework/test_utils.h" @@ -62,7 +63,7 @@ struct NchwcTestHelper { ONNX_NAMESPACE::TensorProto tensor_proto; tensor_proto.set_name(name); tensor_proto.set_data_type(utils::ToTensorProtoElementType()); - tensor_proto.set_raw_data(data.data(), data.size() * sizeof(T)); + utils::SetRawDataInTensorProto(tensor_proto, data.data(), data.size() * sizeof(T)); for (auto& dim : shape) { tensor_proto.add_dims(dim); diff --git a/onnxruntime/test/providers/base_tester.cc b/onnxruntime/test/providers/base_tester.cc index 1db8616c85daa..01de15e6f8ec8 100644 --- a/onnxruntime/test/providers/base_tester.cc +++ b/onnxruntime/test/providers/base_tester.cc @@ -73,7 +73,7 @@ void BaseTester::AddInitializers(onnxruntime::Graph& graph) { } } else { auto buffer_size = tensor.DataType()->Size() * shape.Size(); - tensor_proto.set_raw_data(tensor.DataRaw(), buffer_size); + utils::SetRawDataInTensorProto(tensor_proto, tensor.DataRaw(), buffer_size); } // 4. name diff --git a/onnxruntime/test/providers/cpu/generator/random_test.cc b/onnxruntime/test/providers/cpu/generator/random_test.cc index be049d1cf0ce3..ec9b1614488a7 100644 --- a/onnxruntime/test/providers/cpu/generator/random_test.cc +++ b/onnxruntime/test/providers/cpu/generator/random_test.cc @@ -256,7 +256,7 @@ TEST(Random, MultinomialGoodCase) { const std::vector output_dims{batch_size, num_samples}; #ifdef _WIN32 const std::vector expected_output{2, 0, 0, 2, 2, 2, 0, 2, 2, 1, 1, 2, 1, 1, 1, 1, 2, 1, 2, 0}; -#elif defined(__MACH__) || defined(__ANDROID__) || defined(__FreeBSD__) || defined(__wasm__) +#elif defined(__MACH__) || defined(__ANDROID__) || defined(__FreeBSD__) || defined(__wasm__) || defined(_AIX) const std::vector expected_output{1, 1, 2, 2, 0, 2, 2, 2, 0, 2, 1, 1, 2, 0, 2, 2, 0, 2, 1, 1}; #else const std::vector expected_output{2, 0, 0, 1, 0, 1, 2, 0, 1, 0, 0, 1, 1, 0, 1, 0, 2, 0, 2, 0}; @@ -294,7 +294,7 @@ TEST(Random, MultinomialDefaultDType) { #ifdef _WIN32 const std::vector expected_output_1{2, 0, 0, 2, 2, 2, 0, 2, 2, 1, 1, 2, 1, 1, 1, 1, 2, 1, 2, 0}; const std::vector expected_output_2{0, 0, 1, 0, 2, 2, 2, 0, 2, 1, 2, 1, 0, 2, 0, 2, 2, 1, 2, 1}; -#elif defined(__MACH__) || defined(__ANDROID__) || defined(__FreeBSD__) || defined(__wasm__) +#elif defined(__MACH__) || defined(__ANDROID__) || defined(__FreeBSD__) || defined(__wasm__) || defined(_AIX) const std::vector expected_output_1{1, 1, 2, 2, 0, 2, 2, 2, 0, 2, 1, 1, 2, 0, 2, 2, 0, 2, 1, 1}; const std::vector expected_output_2{1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 2, 0, 1, 1, 0, 2, 2, 2, 1}; #else diff --git a/onnxruntime/test/providers/cpu/tensor/isinf_test.cc b/onnxruntime/test/providers/cpu/tensor/isinf_test.cc index bd97306142f18..4fc2e6c7c909b 100644 --- a/onnxruntime/test/providers/cpu/tensor/isinf_test.cc +++ b/onnxruntime/test/providers/cpu/tensor/isinf_test.cc @@ -18,13 +18,17 @@ constexpr double DOUBLE_NINF = -std::numeric_limits::infinity(); constexpr double DOUBLE_NAN = std::numeric_limits::quiet_NaN(); template -void run_is_inf_test(int opset, int64_t detect_positive, int64_t detect_negative, const std::initializer_list& input, const std::initializer_list& output) { +void run_is_inf_test(int opset, int64_t detect_positive, int64_t detect_negative, const std::initializer_list& input, const std::initializer_list& output, bool skip_trt = false) { OpTester test("IsInf", opset); test.AddAttribute("detect_positive", detect_positive); test.AddAttribute("detect_negative", detect_negative); test.AddInput("X", {onnxruntime::narrow(input.size())}, input); test.AddOutput("Y", 
{onnxruntime::narrow(output.size())}, output); - test.Run(); + if (skip_trt) { + test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider}); + } else { + test.Run(); + } } TEST(IsInfTest, test_isinf_float10) { @@ -124,7 +128,7 @@ TEST(IsInfTest, test_isinf_bfloat16) { std::initializer_list input = {BFloat16{-1.7f}, BFloat16::NaN, BFloat16::Infinity, 3.6_bfp16, BFloat16::NegativeInfinity, BFloat16::Infinity}; std::initializer_list output = {false, false, true, false, true, true}; - run_is_inf_test(20, 1, 1, input, output); + run_is_inf_test(20, 1, 1, input, output, true); // Skip as TRT10 supports BF16 but T4 GPU run on TRT CIs doesn't } TEST(IsInfTest, test_isinf_positive_bfloat16) { @@ -146,7 +150,7 @@ TEST(IsInfTest, test_Float8E4M3FN) { std::initializer_list input = { Float8E4M3FN(-1.0f), Float8E4M3FN(FLOAT_NAN, false), Float8E4M3FN(1.0f), Float8E4M3FN(FLOAT_NINF, false), Float8E4M3FN(FLOAT_NINF, false), Float8E4M3FN(FLOAT_INF, false)}; std::initializer_list output = {false, false, false, false, false, false}; - run_is_inf_test(20, 1, 1, input, output); + run_is_inf_test(20, 1, 1, input, output, true); // Skip as TRT10.1 supports Float8 but T4 GPU run on TRT CIs doesn't } TEST(IsInfTest, test_Float8E4M3FNUZ) { diff --git a/onnxruntime/test/providers/cuda/test_cases/blkq4_fp16_gemm_sm80.h b/onnxruntime/test/providers/cuda/test_cases/blkq4_fp16_gemm_sm80.h index fa1c739c04e3a..f96c8ce9ce729 100644 --- a/onnxruntime/test/providers/cuda/test_cases/blkq4_fp16_gemm_sm80.h +++ b/onnxruntime/test/providers/cuda/test_cases/blkq4_fp16_gemm_sm80.h @@ -13,7 +13,7 @@ */ #pragma once - +#if defined(CUDA_VERSION) && CUDA_VERSION <= 12030 #include "test/cuda_host/blkq4_fp16_quant_sm80.h" #include @@ -197,3 +197,4 @@ void run_blkq4_small_gemm(int m, int n, int k); } // namespace test } // namespace cuda } // namespace onnxruntime +#endif diff --git a/onnxruntime/test/providers/cuda/test_cases/blkq4_fp16_gemm_sm80_test.cc b/onnxruntime/test/providers/cuda/test_cases/blkq4_fp16_gemm_sm80_test.cc index b95e093e41eab..3fcb9045ee7e6 100644 --- a/onnxruntime/test/providers/cuda/test_cases/blkq4_fp16_gemm_sm80_test.cc +++ b/onnxruntime/test/providers/cuda/test_cases/blkq4_fp16_gemm_sm80_test.cc @@ -10,7 +10,7 @@ * This part requires gtest header files, which do not play * well with CUTLASS headers. */ - +#if defined(CUDA_VERSION) && CUDA_VERSION <= 12030 #include "blkq4_fp16_gemm_sm80.h" #include "gtest/gtest.h" @@ -341,3 +341,4 @@ TEST(BlkQ4_GEMM, Sm80SmallTileKernelTest) { } // namespace test } // namespace onnxruntime +#endif diff --git a/onnxruntime/test/providers/cuda/test_cases/blkq4_fp16_gemm_sm80_testcu.cu b/onnxruntime/test/providers/cuda/test_cases/blkq4_fp16_gemm_sm80_testcu.cu index f5600ca9885a3..8b27c3d8c3aed 100644 --- a/onnxruntime/test/providers/cuda/test_cases/blkq4_fp16_gemm_sm80_testcu.cu +++ b/onnxruntime/test/providers/cuda/test_cases/blkq4_fp16_gemm_sm80_testcu.cu @@ -11,6 +11,9 @@ * well with gtest headers. 
*/ +// This test has build error with cuda 12.5 +#if defined(CUDA_VERSION) && CUDA_VERSION <= 12030 + #include "blkq4_fp16_gemm_sm80.h" #include @@ -532,3 +535,5 @@ template void run_blkq4_small_gemm<128, false, false>(int m, int n, int k); } // namespace test } // namespace cuda } // namespace onnxruntime + +#endif diff --git a/onnxruntime/test/python/quantization/test_op_matmul_4bits.py b/onnxruntime/test/python/quantization/test_op_matmul_4bits.py index 88e5052db4e2e..4cc8a0c151d14 100644 --- a/onnxruntime/test/python/quantization/test_op_matmul_4bits.py +++ b/onnxruntime/test/python/quantization/test_op_matmul_4bits.py @@ -14,7 +14,7 @@ import numpy as np import onnx from onnx import TensorProto, helper -from op_test_utils import TestDataFeeds, check_model_correctness, check_op_type_count +from op_test_utils import TestDataFeeds, check_model_correctness, check_op_type_count, check_qtype_by_node_type from onnxruntime.quantization import quant_utils @@ -105,8 +105,9 @@ def make_matmul( [output_tensor], initializer=initializers, ) - model = helper.make_model(graph, opset_imports=[helper.make_opsetid("", 13)]) - model.ir_version = 7 # use stable onnx ir version + # blocked quantization requires DQ op set >= 21 + model = helper.make_model(graph, opset_imports=[helper.make_opsetid("", 21)]) + model.ir_version = 10 # use stable onnx ir version onnx.save(model, output_model_path) @@ -116,9 +117,12 @@ def quant_test( data_reader: TestDataFeeds, block_size: int, is_symmetric: bool, + quant_format: quant_utils.QuantFormat = quant_utils.QuantFormat.QOperator, ): + use_qdq = quant_format == quant_utils.QuantFormat.QDQ + name_prefix = "DQ_MatMul" if use_qdq else "MatMulNBits" model_int4_path = str( - Path(self._tmp_model_dir.name).joinpath(f"MatMulNBits_{block_size}_{is_symmetric}.onnx").absolute() + Path(self._tmp_model_dir.name).joinpath(f"{name_prefix}_{block_size}_{is_symmetric}.onnx").absolute() ) # Quantize fp32 model to int4 model @@ -126,15 +130,33 @@ def quant_test( model = quant_utils.load_model_with_shape_infer(Path(model_fp32_path)) quant_config = matmul_4bits_quantizer.DefaultWeightOnlyQuantConfig( - block_size=block_size, is_symmetric=is_symmetric + block_size=block_size, is_symmetric=is_symmetric, quant_format=quant_format ) quant = matmul_4bits_quantizer.MatMul4BitsQuantizer(model, algo_config=quant_config) quant.process() quant.model.save_model_to_file(model_int4_path, False) - quant_nodes = {"MatMulNBits": 1} + quant_nodes = {"DequantizeLinear": 1, "MatMul": 1} if use_qdq else {"MatMulNBits": 1} check_op_type_count(self, model_int4_path, **quant_nodes) + if use_qdq: + dq_qtype = TensorProto.INT4 if is_symmetric else TensorProto.UINT4 + dqnode_io_qtypes = ( + { + "DequantizeLinear": [ + ["i", 0, dq_qtype], + ] + } + if is_symmetric + else { + "DequantizeLinear": [ + ["i", 0, dq_qtype], + ["i", 2, dq_qtype], + ] + } + ) + check_qtype_by_node_type(self, model_int4_path, dqnode_io_qtypes) + data_reader.rewind() try: @@ -211,6 +233,26 @@ def test_quantize_matmul_int4_offsets(self): data_reader = self.input_feeds(1, {"input": [100, 52]}) self.quant_test(model_fp32_path, data_reader, 32, False) + @unittest.skipIf( + find_spec("onnxruntime.training"), "Skip because training package doesn't has quantize_matmul_4bits" + ) + def test_quantize_matmul_int4_symmetric_qdq(self): + np.random.seed(13) + + model_fp32_path = str(Path(self._tmp_model_dir.name).joinpath("matmul_fp32_symmetric.onnx").absolute()) + self.construct_model_matmul(model_fp32_path, symmetric=True) + data_reader = 
self.input_feeds(1, {"input": [100, 52]}) + self.quant_test(model_fp32_path, data_reader, 32, True, quant_utils.QuantFormat.QDQ) + + @unittest.skipIf( + find_spec("onnxruntime.training"), "Skip because training package doesn't has quantize_matmul_4bits" + ) + def test_quantize_matmul_int4_offsets_qdq(self): + model_fp32_path = str(Path(self._tmp_model_dir.name).joinpath("matmul_fp32_offset.onnx").absolute()) + self.construct_model_matmul(model_fp32_path, symmetric=False) + data_reader = self.input_feeds(1, {"input": [100, 52]}) + self.quant_test(model_fp32_path, data_reader, 32, False, quant_utils.QuantFormat.QDQ) + @unittest.skipIf( find_spec("onnxruntime.training"), "Skip because training package doesn't has quantize_matmul_4bits" ) diff --git a/onnxruntime/test/python/transformers/test_flash_attn_rocm.py b/onnxruntime/test/python/transformers/test_flash_attn_rocm.py index fe7e39722237f..880f4175e00b7 100644 --- a/onnxruntime/test/python/transformers/test_flash_attn_rocm.py +++ b/onnxruntime/test/python/transformers/test_flash_attn_rocm.py @@ -35,8 +35,8 @@ def test_gqa_no_past_flash_attention(self, _, config, local, rotary, rotary_inte rotary=rotary, rotary_interleaved=rotary_interleaved, packed=packed, - rtol=0.002, - atol=0.002, + rtol=0.001, + atol=0.005, ) parity_check_gqa_prompt_no_buff( config, @@ -45,8 +45,8 @@ def test_gqa_no_past_flash_attention(self, _, config, local, rotary, rotary_inte rotary=rotary, rotary_interleaved=rotary_interleaved, packed=packed, - rtol=0.002, - atol=0.002, + rtol=0.001, + atol=0.005, ) @parameterized.expand(gqa_past_flash_attention_test_cases()) @@ -67,8 +67,8 @@ def test_gqa_past_flash_attention(self, _, config, local, rotary, rotary_interle rotary=rotary, rotary_interleaved=rotary_interleaved, packed=packed, - rtol=0.002, - atol=0.002, + rtol=0.001, + atol=0.005, ) parity_check_gqa_past_no_buff( config, @@ -77,8 +77,8 @@ def test_gqa_past_flash_attention(self, _, config, local, rotary, rotary_interle rotary=rotary, rotary_interleaved=rotary_interleaved, packed=packed, - rtol=0.002, - atol=0.002, + rtol=0.001, + atol=0.005, ) diff --git a/orttraining/tools/ci_test/results/ci-mi200.huggingface.bert-large-rocm6.1.json b/orttraining/tools/ci_test/results/ci-mi200.huggingface.bert-large-rocm6.1.json new file mode 100644 index 0000000000000..05fcf08cd3232 --- /dev/null +++ b/orttraining/tools/ci_test/results/ci-mi200.huggingface.bert-large-rocm6.1.json @@ -0,0 +1,57 @@ +{ + "steps": [ + { + "step": 20, + "loss": 2.0136 + }, + { + "step": 40, + "loss": 1.8466 + }, + { + "step": 60, + "loss": 1.7525 + }, + { + "step": 80, + "loss": 1.6682 + }, + { + "step": 100, + "loss": 1.658 + }, + { + "step": 120, + "loss": 1.6749 + }, + { + "step": 140, + "loss": 1.6263 + }, + { + "step": 160, + "loss": 1.6828 + }, + { + "step": 180, + "loss": 1.6145 + }, + { + "step": 200, + "loss": 1.6197 + }, + { + "step": 220, + "loss": 1.6353 + }, + { + "step": 240, + "loss": 1.5266 + }, + { + "step": 260, + "loss": 1.5441 + } + ], + "samples_per_second": 34.561 +} diff --git a/setup.py b/setup.py index 5750833ce35de..51feedcfd3286 100644 --- a/setup.py +++ b/setup.py @@ -56,6 +56,7 @@ def parse_arg_remove_string(argv, arg_name_equal): cuda_version = None rocm_version = None +is_migraphx = False is_rocm = False is_openvino = False # The following arguments are mutually exclusive @@ -64,8 +65,9 @@ def parse_arg_remove_string(argv, arg_name_equal): cuda_version = parse_arg_remove_string(sys.argv, "--cuda_version=") elif parse_arg_remove_boolean(sys.argv, "--use_rocm"): is_rocm = 
True - package_name = "onnxruntime-rocm" if not nightly_build else "ort-rocm-nightly" rocm_version = parse_arg_remove_string(sys.argv, "--rocm_version=") +elif parse_arg_remove_boolean(sys.argv, "--use_migraphx"): + is_migraphx = True elif parse_arg_remove_boolean(sys.argv, "--use_openvino"): is_openvino = True package_name = "onnxruntime-openvino" @@ -87,6 +89,9 @@ def parse_arg_remove_string(argv, arg_name_equal): elif parse_arg_remove_boolean(sys.argv, "--use_qnn"): package_name = "onnxruntime-qnn" +if is_rocm or is_migraphx: + package_name = "onnxruntime-rocm" if not nightly_build else "ort-rocm-nightly" + # PEP 513 defined manylinux1_x86_64 and manylinux1_i686 # PEP 571 defined manylinux2010_x86_64 and manylinux2010_i686 # PEP 599 defines the following platform tags: @@ -280,10 +285,21 @@ def finalize_options(self): return ret -providers_cuda_or_rocm = "libonnxruntime_providers_" + ("rocm.so" if is_rocm else "cuda.so") -providers_tensorrt_or_migraphx = "libonnxruntime_providers_" + ("migraphx.so" if is_rocm else "tensorrt.so") -providers_openvino = "libonnxruntime_providers_openvino.so" -providers_cann = "libonnxruntime_providers_cann.so" +providers_cuda_or_rocm = "onnxruntime_providers_" + ("rocm" if is_rocm else "cuda") +providers_tensorrt_or_migraphx = "onnxruntime_providers_" + ("migraphx" if is_migraphx else "tensorrt") +providers_openvino = "onnxruntime_providers_openvino" +providers_cann = "onnxruntime_providers_cann" + +if platform.system() == "Linux": + providers_cuda_or_rocm = "lib" + providers_cuda_or_rocm + ".so" + providers_tensorrt_or_migraphx = "lib" + providers_tensorrt_or_migraphx + ".so" + providers_openvino = "lib" + providers_openvino + ".so" + providers_cann = "lib" + providers_cann + ".so" +elif platform.system() == "Windows": + providers_cuda_or_rocm = providers_cuda_or_rocm + ".dll" + providers_tensorrt_or_migraphx = providers_tensorrt_or_migraphx + ".dll" + providers_openvino = providers_openvino + ".dll" + providers_cann = providers_cann + ".dll" # Additional binaries dl_libs = [] @@ -335,6 +351,9 @@ def finalize_options(self): "dnnl.dll", "mklml.dll", "libiomp5md.dll", + providers_cuda_or_rocm, + providers_tensorrt_or_migraphx, + providers_cann, "onnxruntime.dll", ] # DNNL, TensorRT & OpenVINO EPs are built as shared libs diff --git a/tools/ci_build/build.py b/tools/ci_build/build.py index ae4c9b27544ba..75fbf5d0851ae 100644 --- a/tools/ci_build/build.py +++ b/tools/ci_build/build.py @@ -613,6 +613,7 @@ def convert_arg_line_to_args(self, arg_line): "MinGW Makefiles", "Ninja", "NMake Makefiles", + "NMake Makefiles JOM", "Unix Makefiles", "Visual Studio 17 2022", "Xcode", @@ -2211,6 +2212,7 @@ def build_python_wheel( use_cuda, cuda_version, use_rocm, + use_migraphx, rocm_version, use_dnnl, use_tensorrt, @@ -2262,6 +2264,8 @@ def build_python_wheel( args.append("--use_rocm") if rocm_version: args.append(f"--rocm_version={rocm_version}") + elif use_migraphx: + args.append("--use_migraphx") elif use_openvino: args.append("--use_openvino") elif use_dnnl: @@ -2587,9 +2591,6 @@ def main(): if args.use_tensorrt: args.use_cuda = True - if args.use_migraphx: - args.use_rocm = True - if args.build_wheel or args.gen_doc or args.use_tvm or args.enable_training: args.enable_pybind = True @@ -2885,7 +2886,8 @@ def main(): # fail unexpectedly. Similar, if your packaging step forgot to copy a file into the package, we don't know it # either. 
if args.build: - # TODO: find asan DLL and copy it to onnxruntime/capi folder when args.enable_address_sanitizer is True and the target OS is Windows + # TODO: find asan DLL and copy it to onnxruntime/capi folder when args.enable_address_sanitizer is True and + # the target OS is Windows if args.build_wheel: nightly_build = bool(os.getenv("NIGHTLY_BUILD") == "1") default_training_package_device = bool(os.getenv("DEFAULT_TRAINING_PACKAGE_DEVICE") == "1") @@ -2896,6 +2898,7 @@ def main(): args.use_cuda, args.cuda_version, args.use_rocm, + args.use_migraphx, args.rocm_version, args.use_dnnl, args.use_tensorrt, diff --git a/tools/ci_build/github/apple/coreml_supported_mlprogram_ops.md b/tools/ci_build/github/apple/coreml_supported_mlprogram_ops.md index dec05ae066a4a..1bbb933f66ba4 100644 --- a/tools/ci_build/github/apple/coreml_supported_mlprogram_ops.md +++ b/tools/ci_build/github/apple/coreml_supported_mlprogram_ops.md @@ -18,3 +18,6 @@ Keep in sync with doco generated from /docs/execution-providers/CoreML-Execution |ai.onnx:Relu|| |ai.onnx:Reshape|| |ai.onnx:Sub|| +|ai.onnx:Sigmoid|| +|ai.onnx:Tanh|| +|ai.onnx:Transpose|| diff --git a/tools/ci_build/github/apple/test_ios_framework_build_settings.json b/tools/ci_build/github/apple/test_ios_framework_build_settings.json new file mode 100644 index 0000000000000..0572df6ecf72e --- /dev/null +++ b/tools/ci_build/github/apple/test_ios_framework_build_settings.json @@ -0,0 +1,30 @@ +{ + "build_osx_archs": { + "iphoneos": [ + "arm64" + ], + "iphonesimulator": [ + "arm64", + "x86_64" + ] + }, + "build_params": { + "base": [ + "--parallel", + "--use_xcode", + "--build_apple_framework", + "--use_coreml", + "--use_xnnpack", + "--skip_tests", + "--cmake_extra_defines=onnxruntime_BUILD_UNIT_TESTS=OFF" + ], + "iphoneos": [ + "--ios", + "--apple_deploy_target=13.0" + ], + "iphonesimulator": [ + "--ios", + "--apple_deploy_target=13.0" + ] + } +} diff --git a/tools/ci_build/github/azure-pipelines/android-x86_64-crosscompile-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/android-x86_64-crosscompile-ci-pipeline.yml index 72f236ec2e6cc..10d9a9a24d88a 100644 --- a/tools/ci_build/github/azure-pipelines/android-x86_64-crosscompile-ci-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/android-x86_64-crosscompile-ci-pipeline.yml @@ -48,12 +48,12 @@ parameters: stages: # Separate stage for building CPU vs NNAPI as we only want CodeQL to run on one of them so we don't get duplicate # issues for code that is built in both. We pick NNAPI as that includes the NNAPI EP code. 
-- stage: BUILD_CPU_STAGE +- stage: BUILD_AND_TEST_CPU dependsOn: [] variables: Codeql.Enabled: false jobs: - - job: Build_CPU_EP + - job: BUILD_AND_TEST_CPU pool: onnxruntime-Ubuntu2204-AMD-CPU workspace: clean: all @@ -78,12 +78,14 @@ stages: - script: sudo apt-get update -y && sudo apt-get install -y coreutils ninja-build displayName: Install coreutils and ninja - - template: "templates/use-android-ndk.yml" - + - template: templates/use-android-ndk.yml + - template: templates/use-android-emulator.yml + parameters: + create: true + start: true - script: | env | grep ANDROID displayName: View Android ENVs - - script: | python3 tools/ci_build/build.py \ --enable_lto \ @@ -96,42 +98,17 @@ stages: --skip_submodule_sync \ --parallel \ --cmake_generator=Ninja \ - --build_java \ - --skip_tests - displayName: CPU EP, Build - - - task: CopyFiles@2 - displayName: Copy apks - inputs: - contents: 'build/**/*.apk' - targetFolder: $(Build.ArtifactStagingDirectory) - overWrite: true - - - task: CopyFiles@2 - displayName: Copy test data - inputs: - contents: 'build/**/testdata/**' - targetFolder: $(Build.ArtifactStagingDirectory) - overWrite: true - - - task: CopyFiles@2 - displayName: Copy test executables - inputs: - contents: | - build/Debug/* - build/Debug/java/androidtest/android/** - targetFolder: $(Build.ArtifactStagingDirectory) - overWrite: true - - - task: PublishBuildArtifacts@1 - inputs: - pathToPublish: $(Build.ArtifactStagingDirectory) - artifactName: CPUBuildOutput + --build_java + displayName: CPU EP, Build and Test + - template: templates/use-android-emulator.yml + parameters: + stop: true - template: templates/clean-agent-build-directory-step.yml -- stage: BUILD_NNAPI_STAGE +- stage: BUILD_AND_TEST_NNAPI_EP dependsOn: [] + condition: notIn(variables['Build.Reason'], 'IndividualCI', 'BatchedCI') variables: Codeql.ProjectConfigPath: .github/workflows Codeql.Enabled: true @@ -140,14 +117,12 @@ stages: JobsTimeout: 120 ${{ else }}: JobsTimeout: 60 - jobs: - - job: Build_NNAPI_EP + - job: BUILD_AND_TEST_NNAPI_EP pool: onnxruntime-Ubuntu2204-AMD-CPU timeoutInMinutes: ${{ variables.JobsTimeout }} workspace: clean: all - condition: notIn(variables['Build.Reason'], 'IndividualCI', 'BatchedCI') steps: - task: UsePythonVersion@0 displayName: Use Python $(pythonVersion) @@ -163,8 +138,10 @@ stages: - script: sudo apt-get update -y && sudo apt-get install -y coreutils ninja-build displayName: Install coreutils and ninja - - - template: "templates/use-android-ndk.yml" + - template: templates/use-android-emulator.yml + parameters: + create: true + start: true - script: | env | grep ANDROID @@ -172,194 +149,31 @@ stages: - script: | python3 tools/ci_build/build.py \ - --enable_lto \ - --android \ - --build_dir build_nnapi \ - --android_sdk_path $ANDROID_HOME \ - --android_ndk_path $ANDROID_NDK_HOME \ - --android_abi=x86_64 \ - --android_api=29 \ - --skip_submodule_sync \ - --parallel \ - --use_nnapi \ - --cmake_generator=Ninja \ - --build_java \ - --skip_tests - displayName: NNAPI EP, Build - - - task: CopyFiles@2 - displayName: Copy apks - inputs: - contents: 'build_nnapi/**/*.apk' - targetFolder: $(Build.ArtifactStagingDirectory) - overWrite: true - - - task: CopyFiles@2 - displayName: Copy test data - inputs: - contents: 'build_nnapi/**/testdata/**' - targetFolder: $(Build.ArtifactStagingDirectory) - overWrite: true - - - task: CopyFiles@2 - displayName: Copy Test Executables - inputs: - contents: | - build_nnapi/Debug/* - build_nnapi/Debug/java/androidtest/android/** - targetFolder: 
$(Build.ArtifactStagingDirectory) - overWrite: true - - - task: PublishBuildArtifacts@1 - inputs: - pathToPublish: $(Build.ArtifactStagingDirectory) - artifactName: NNAPIBuildOutput + --enable_lto \ + --android \ + --build_dir build_nnapi \ + --android_sdk_path $ANDROID_HOME \ + --android_ndk_path $ANDROID_NDK_HOME \ + --android_abi=x86_64 \ + --android_api=29 \ + --skip_submodule_sync \ + --parallel \ + --use_nnapi \ + --build_shared_lib \ + --cmake_generator=Ninja \ + --build_java + displayName: NNAPI EP, Build, Test on Android Emulator + + - script: /bin/bash tools/ci_build/github/linux/ort_minimal/nnapi_minimal_build_minimal_ort_and_run_tests.sh $(pwd) + # Build Minimal ORT with NNAPI and reduced Ops, run unit tests on Android Emulator + displayName: Build Minimal ORT with NNAPI and run tests + + - template: templates/use-android-emulator.yml + parameters: + stop: true - template: templates/clean-agent-build-directory-step.yml -- stage: TEST_STAGE - dependsOn: [BUILD_CPU_STAGE, BUILD_NNAPI_STAGE] - jobs: - - job: Test_CPU_EP - pool: - # We need macOS-12 to run the Android emulator for now. - # https://github.com/actions/runner-images/issues/7671 - vmImage: 'macOS-12' - workspace: - clean: all - condition: succeeded() - steps: - - script: | - set -ex - system_profiler SPSoftwareDataType SPHardwareDataType - displayName: 'Mac Agent Info' - - - task: DownloadPipelineArtifact@2 - inputs: - ${{ if eq(parameters.specificArtifact, true) }}: - source: 'specific' - project: 'onnxruntime' - pipeline: $(Build.DefinitionName) - runVersion: 'specific' - runId: ${{ parameters.runId }} - ${{ if ne(parameters.specificArtifact, true) }}: - source: 'current' - artifact: 'CPUBuildOutput' - path: $(Build.SourcesDirectory) - - - task: UsePythonVersion@0 - displayName: Use Python $(pythonVersion) - inputs: - versionSpec: $(pythonVersion) - - - task: JavaToolInstaller@0 - displayName: Use jdk 11 - inputs: - versionSpec: '11' - jdkArchitectureOption: 'x64' - jdkSourceOption: 'PreInstalled' - - - template: "templates/use-android-ndk.yml" - - - template: templates/use-android-emulator.yml - parameters: - create: true - start: true - - - script: | - python3 tools/ci_build/build.py \ - --enable_lto \ - --android \ - --build_dir build \ - --android_sdk_path $ANDROID_HOME \ - --android_ndk_path $ANDROID_NDK_HOME \ - --android_abi=x86_64 \ - --android_api=30 \ - --build_java \ - --test - displayName: CPU EP, Test on Android Emulator - - - template: templates/use-android-emulator.yml - parameters: - stop: true - - - template: templates/clean-agent-build-directory-step.yml - - - job: Test_NNAPI_EP - pool: - # We need macOS-12 to run the Android emulator for now. 
- # https://github.com/actions/runner-images/issues/7671 - vmImage: 'macOS-12' - timeoutInMinutes: 90 - workspace: - clean: all - condition: and(succeeded(), notIn(variables['Build.Reason'], 'IndividualCI', 'BatchedCI')) - steps: - - script: | - set -ex - system_profiler SPSoftwareDataType SPHardwareDataType - displayName: 'Mac Agent Info' - - - task: DownloadPipelineArtifact@2 - inputs: - ${{ if eq(parameters.specificArtifact, true) }}: - source: 'specific' - project: 'onnxruntime' - pipeline: $(Build.DefinitionName) - runVersion: 'specific' - runId: ${{ parameters.runId }} - ${{ if ne(parameters.specificArtifact, true) }}: - source: 'current' - artifact: 'NNAPIBuildOutput' - path: $(Build.SourcesDirectory) - - - task: UsePythonVersion@0 - displayName: Use Python $(pythonVersion) - inputs: - versionSpec: $(pythonVersion) - - - task: JavaToolInstaller@0 - displayName: Use jdk 11 - inputs: - versionSpec: '11' - jdkArchitectureOption: 'x64' - jdkSourceOption: 'PreInstalled' - - - template: "templates/use-android-ndk.yml" - - - template: templates/use-android-emulator.yml - parameters: - create: true - start: true - - - script: | - python3 tools/ci_build/build.py \ - --enable_lto \ - --android \ - --build_dir build_nnapi \ - --android_sdk_path $ANDROID_HOME \ - --android_ndk_path $ANDROID_NDK_HOME \ - --android_abi=x86_64 \ - --android_api=29 \ - --build_java \ - --use_nnapi \ - --test - displayName: NNAPI EP, Test, CodeCoverage on Android Emulator - - # used by Build Minimal ORT - - script: brew install coreutils ninja - displayName: Install coreutils and ninja - - - script: /bin/bash tools/ci_build/github/linux/ort_minimal/nnapi_minimal_build_minimal_ort_and_run_tests.sh $(pwd) - # Build Minimal ORT with NNAPI and reduced Ops, run unit tests on Android Emulator - displayName: Build Minimal ORT with NNAPI and run tests - - - template: templates/use-android-emulator.yml - parameters: - stop: true - - - template: templates/clean-agent-build-directory-step.yml - - stage: MASTER_BUILD_STAGE # The below jobs only run on master build. # because coverage report is hard to support in cross machines. @@ -368,20 +182,12 @@ stages: condition: in(variables['Build.Reason'], 'IndividualCI', 'BatchedCI') jobs: - job: NNAPI_EP_MASTER - pool: - # We need macOS-12 to run the Android emulator for now. 
- # https://github.com/actions/runner-images/issues/7671 - vmImage: 'macOS-12' + pool: onnxruntime-Ubuntu2204-AMD-CPU timeoutInMinutes: 180 workspace: clean: all condition: in(variables['Build.Reason'], 'IndividualCI', 'BatchedCI') steps: - - script: | - set -ex - system_profiler SPSoftwareDataType SPHardwareDataType - displayName: 'Mac Agent Info' - - task: UsePythonVersion@0 displayName: Use Python $(pythonVersion) inputs: @@ -394,11 +200,7 @@ stages: jdkArchitectureOption: 'x64' jdkSourceOption: 'PreInstalled' - - template: "templates/use-android-ndk.yml" - - # used by Build Minimal ORT - - script: brew install coreutils ninja - displayName: Install coreutils and ninja + - template: templates/use-android-ndk.yml - template: templates/use-android-emulator.yml parameters: @@ -429,50 +231,25 @@ stages: --build_dir build_nnapi \ --android_sdk_path $ANDROID_HOME displayName: Retrieve runtime code coverage files from the emulator and analyze + - script: cat '$(Build.SourcesDirectory)/build_nnapi/Debug/coverage_rpt.txt' displayName: Print coverage report - - task: PublishPipelineArtifact@0 - displayName: 'Publish code coverage report' - inputs: - artifactName: "coverage_rpt.txt" - targetPath: '$(Build.SourcesDirectory)/build_nnapi/Debug/coverage_rpt.txt' - publishLocation: 'pipeline' - - script: /bin/bash tools/ci_build/github/linux/ort_minimal/nnapi_minimal_build_minimal_ort_and_run_tests.sh $(pwd) # Build Minimal ORT with NNAPI and reduced Ops, run unit tests on Android Emulator displayName: Build Minimal ORT with NNAPI and run tests - - template: templates/use-android-emulator.yml - parameters: - stop: true - - - template: templates/clean-agent-build-directory-step.yml - - - job: Update_Dashboard - workspace: - clean: all - variables: - - name: skipComponentGovernanceDetection - value: true - pool: 'onnxruntime-Ubuntu2204-AMD-CPU' - condition: and(succeeded(), in(variables['Build.Reason'], 'IndividualCI', 'BatchedCI')) - dependsOn: - - NNAPI_EP_MASTER - steps: - - task: DownloadPipelineArtifact@0 - displayName: 'Download code coverage report' - inputs: - artifactName: 'coverage_rpt.txt' - targetPath: '$(Build.BinariesDirectory)' - - task: AzureCLI@2 displayName: 'Post Android Code Coverage To DashBoard' inputs: azureSubscription: AIInfraBuild scriptType: bash scriptPath: $(Build.SourcesDirectory)/tools/ci_build/github/linux/upload_code_coverage_data.sh - arguments: '"$(Build.BinariesDirectory)/coverage_rpt.txt" "https://dev.azure.com/onnxruntime/onnxruntime/_build/results?buildId=$(Build.BuildId)" arm android nnapi' + arguments: '"$(Build.SourcesDirectory)/build_nnapi/Debug/coverage_rpt.txt" "https://dev.azure.com/onnxruntime/onnxruntime/_build/results?buildId=$(Build.BuildId)" arm android nnapi' workingDirectory: '$(Build.BinariesDirectory)' + - template: templates/use-android-emulator.yml + parameters: + stop: true + - template: templates/clean-agent-build-directory-step.yml diff --git a/tools/ci_build/github/azure-pipelines/bigmodels-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/bigmodels-ci-pipeline.yml index 41b3c47ba0396..a66828ee5e188 100644 --- a/tools/ci_build/github/azure-pipelines/bigmodels-ci-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/bigmodels-ci-pipeline.yml @@ -43,7 +43,7 @@ variables: - name: docker_base_image value: onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda11_x64_almalinux8_gcc11:20240531.1 - name: linux_trt_version - value: 10.0.1.6-1.cuda11.8 + value: 10.2.0.19-1.cuda11.8 - name: Repository value: 
'onnxruntimecuda11manylinuxbuild' diff --git a/tools/ci_build/github/azure-pipelines/c-api-noopenmp-packaging-pipelines.yml b/tools/ci_build/github/azure-pipelines/c-api-noopenmp-packaging-pipelines.yml index 8b386dde7d3a7..700326fe9173c 100644 --- a/tools/ci_build/github/azure-pipelines/c-api-noopenmp-packaging-pipelines.yml +++ b/tools/ci_build/github/azure-pipelines/c-api-noopenmp-packaging-pipelines.yml @@ -83,7 +83,7 @@ variables: value: 11.8 - name: win_trt_home - value: $(Agent.TempDirectory)\TensorRT-10.0.1.6.Windows10.x86_64.cuda-11.8 + value: $(Agent.TempDirectory)\TensorRT-10.2.0.19.Windows10.x86_64.cuda-11.8 - name: win_cuda_home value: $(Agent.TempDirectory)\v11.8 diff --git a/tools/ci_build/github/azure-pipelines/cuda-packaging-pipeline.yml b/tools/ci_build/github/azure-pipelines/cuda-packaging-pipeline.yml index daf95af438d2b..9fd13b513e5fd 100644 --- a/tools/ci_build/github/azure-pipelines/cuda-packaging-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/cuda-packaging-pipeline.yml @@ -68,9 +68,9 @@ variables: value: nvidia/cuda:12.2.2-cudnn8-devel-ubi8 - name: win_trt_home ${{ if eq(parameters.CudaVersion, '11.8') }}: - value: $(Agent.TempDirectory)\TensorRT-10.0.1.6.Windows10.x86_64.cuda-11.8 + value: $(Agent.TempDirectory)\TensorRT-10.2.0.19.Windows10.x86_64.cuda-11.8 ${{ if eq(parameters.CudaVersion, '12.2') }}: - value: $(Agent.TempDirectory)\TensorRT-10.0.1.6.Windows10.x86_64.cuda-12.4 + value: $(Agent.TempDirectory)\TensorRT-10.2.0.19.Windows10.x86_64.cuda-12.5 - name: win_cuda_home ${{ if eq(parameters.CudaVersion, '11.8') }}: value: $(Agent.TempDirectory)\v11.8 diff --git a/tools/ci_build/github/azure-pipelines/linux-gpu-tensorrt-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/linux-gpu-tensorrt-ci-pipeline.yml index 5f63339fb0d00..3f9707ff50519 100644 --- a/tools/ci_build/github/azure-pipelines/linux-gpu-tensorrt-ci-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/linux-gpu-tensorrt-ci-pipeline.yml @@ -43,9 +43,9 @@ variables: value: onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda12_x64_ubi8_gcc12:20240610.1 - name: linux_trt_version ${{ if eq(parameters.CudaVersion, '11.8') }}: - value: 10.0.1.6-1.cuda11.8 + value: 10.2.0.19-1.cuda11.8 ${{ if eq(parameters.CudaVersion, '12.2') }}: - value: 10.0.1.6-1.cuda12.4 + value: 10.2.0.19-1.cuda12.5 jobs: - job: Linux_Build diff --git a/tools/ci_build/github/azure-pipelines/linux-migraphx-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/linux-migraphx-ci-pipeline.yml index f36cd9cfbfca1..6bf6324252fb9 100644 --- a/tools/ci_build/github/azure-pipelines/linux-migraphx-ci-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/linux-migraphx-ci-pipeline.yml @@ -36,7 +36,7 @@ variables: - name: render value: 109 - name: RocmVersion - value: 6.0 + value: 6.1 - name: RocmVersionPatchSuffix value: ".3" diff --git a/tools/ci_build/github/azure-pipelines/nuget/templates/test_linux.yml b/tools/ci_build/github/azure-pipelines/nuget/templates/test_linux.yml index b9a5383836447..56e9c73a10a82 100644 --- a/tools/ci_build/github/azure-pipelines/nuget/templates/test_linux.yml +++ b/tools/ci_build/github/azure-pipelines/nuget/templates/test_linux.yml @@ -61,7 +61,7 @@ stages: ${{ if eq(parameters.CudaVersion, '12.2') }}: DockerBuildArgs: " --build-arg BASEIMAGE=nvidia/cuda:12.2.2-devel-ubuntu20.04 - --build-arg TRT_VERSION=10.0.1.6-1+cuda12.4 + --build-arg TRT_VERSION=10.2.0.19-1+cuda12.5 --build-arg BUILD_UID=$( id -u ) " ${{ else }}: diff --git 
a/tools/ci_build/github/azure-pipelines/orttraining-pai-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/orttraining-pai-ci-pipeline.yml index 001062452644e..0e1afdcc5b8ca 100644 --- a/tools/ci_build/github/azure-pipelines/orttraining-pai-ci-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/orttraining-pai-ci-pipeline.yml @@ -25,7 +25,7 @@ variables: - name: render value: 109 - name: RocmVersion - value: 6.0 + value: 6.1 - name: RocmVersionPatchSuffix value: ".3" - name: BuildConfig diff --git a/tools/ci_build/github/azure-pipelines/post-merge-jobs.yml b/tools/ci_build/github/azure-pipelines/post-merge-jobs.yml index d6a3fa3147a47..593d45361324e 100644 --- a/tools/ci_build/github/azure-pipelines/post-merge-jobs.yml +++ b/tools/ci_build/github/azure-pipelines/post-merge-jobs.yml @@ -226,7 +226,7 @@ stages: BuildConfig: 'RelWithDebInfo' EnvSetupScript: setup_env_trt.bat buildArch: x64 - additionalBuildFlags: --enable_pybind --build_java --build_nodejs --use_cuda --cuda_home="$(Agent.TempDirectory)\v11.8" --enable_cuda_profiling --use_tensorrt --tensorrt_home="$(Agent.TempDirectory)\TensorRT-10.0.1.6.Windows10.x86_64.cuda-11.8" --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=86 + additionalBuildFlags: --enable_pybind --build_java --build_nodejs --use_cuda --cuda_home="$(Agent.TempDirectory)\v11.8" --enable_cuda_profiling --use_tensorrt --tensorrt_home="$(Agent.TempDirectory)\TensorRT-10.2.0.19.Windows10.x86_64.cuda-11.8" --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=86 msbuildPlatform: x64 isX86: false job_name_suffix: x64_RelWithDebInfo @@ -446,14 +446,15 @@ stages: python tools/ci_build/github/apple/build_apple_framework.py \ --build_dir "$(Build.BinariesDirectory)/ios_framework" \ --build_dynamic_framework \ - tools/ci_build/github/apple/default_full_apple_framework_build_settings.json + tools/ci_build/github/apple/test_ios_framework_build_settings.json displayName: "Build iOS dynamic framework" - script: | python tools/ci_build/github/apple/test_apple_packages.py \ --framework_info_file "$(Build.BinariesDirectory)/ios_framework/xcframework_info.json" \ --c_framework_dir "$(Build.BinariesDirectory)/ios_framework/framework_out" \ - --variant Full + --variant Full \ + --skip_macos_test displayName: "Test pod with iOS framework" - stage: IosMinimalTrainingBuild diff --git a/tools/ci_build/github/azure-pipelines/py-package-test-pipeline.yml b/tools/ci_build/github/azure-pipelines/py-package-test-pipeline.yml index 63e70fa8e6488..d57a7585f3cff 100644 --- a/tools/ci_build/github/azure-pipelines/py-package-test-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/py-package-test-pipeline.yml @@ -55,7 +55,7 @@ stages: python_wheel_suffix: '_gpu' timeout: 480 docker_base_image: onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda11_x64_almalinux8_gcc11:20240531.1 - trt_version: '10.0.1.6-1.cuda11.8' + trt_version: '10.2.0.19-1.cuda11.8' cuda_version: '11.8' diff --git a/tools/ci_build/github/azure-pipelines/stages/jobs/py-linux-cuda-package-test-job.yml b/tools/ci_build/github/azure-pipelines/stages/jobs/py-linux-cuda-package-test-job.yml index b6943f9e1b77b..7dfafeb67acf8 100644 --- a/tools/ci_build/github/azure-pipelines/stages/jobs/py-linux-cuda-package-test-job.yml +++ b/tools/ci_build/github/azure-pipelines/stages/jobs/py-linux-cuda-package-test-job.yml @@ -49,9 +49,9 @@ jobs: value: onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda12_x64_ubi8_gcc12:20240610.1 - name: linux_trt_version ${{ if eq(parameters.CudaVersion, '11.8') }}: - 
value: 10.0.1.6-1.cuda11.8 + value: 10.2.0.19-1.cuda11.8 ${{ if eq(parameters.CudaVersion, '12.2') }}: - value: 10.0.1.6-1.cuda12.4 + value: 10.2.0.19-1.cuda12.5 pool: ${{ parameters.machine_pool }} steps: - checkout: self diff --git a/tools/ci_build/github/azure-pipelines/stages/nuget-linux-cuda-packaging-stage.yml b/tools/ci_build/github/azure-pipelines/stages/nuget-linux-cuda-packaging-stage.yml index cca53e36ebab9..2ca5129ac6e5d 100644 --- a/tools/ci_build/github/azure-pipelines/stages/nuget-linux-cuda-packaging-stage.yml +++ b/tools/ci_build/github/azure-pipelines/stages/nuget-linux-cuda-packaging-stage.yml @@ -80,9 +80,9 @@ stages: - name: linux_trt_version ${{ if eq(parameters.CudaVersion, '11.8') }}: - value: 10.0.1.6-1.cuda11.8 + value: 10.2.0.19-1.cuda11.8 ${{ if eq(parameters.CudaVersion, '12.2') }}: - value: 10.0.1.6-1.cuda12.4 + value: 10.2.0.19-1.cuda12.5 steps: - checkout: self clean: true @@ -149,9 +149,9 @@ stages: value: '12' - name: linux_trt_version ${{ if eq(parameters.CudaVersion, '11.8') }}: - value: 10.0.1.6-1.cuda11.8 + value: 10.2.0.19-1.cuda11.8 ${{ if eq(parameters.CudaVersion, '12.2') }}: - value: 10.0.1.6-1.cuda12.4 + value: 10.2.0.19-1.cuda12.5 steps: - checkout: self # due to checkout multiple repos, the root directory is $(Build.SourcesDirectory)/onnxruntime submodules: false diff --git a/tools/ci_build/github/azure-pipelines/stages/py-cuda-packaging-stage.yml b/tools/ci_build/github/azure-pipelines/stages/py-cuda-packaging-stage.yml index 01f0337be7714..dcd681bd4b915 100644 --- a/tools/ci_build/github/azure-pipelines/stages/py-cuda-packaging-stage.yml +++ b/tools/ci_build/github/azure-pipelines/stages/py-cuda-packaging-stage.yml @@ -65,9 +65,9 @@ stages: SpecificArtifact: ${{ parameters.SpecificArtifact }} BuildId: ${{ parameters.BuildId }} ${{ if eq(parameters.cuda_version, '11.8') }}: - EP_BUILD_FLAGS: --enable_lto --use_tensorrt --tensorrt_home=$(Agent.TempDirectory)\TensorRT-10.0.1.6.Windows10.x86_64.cuda-11.8 --cuda_home=$(Agent.TempDirectory)\v11.8 --cmake_extra_defines "CMAKE_CUDA_ARCHITECTURES=52;60;61;70;75;80" + EP_BUILD_FLAGS: --enable_lto --use_tensorrt --tensorrt_home=$(Agent.TempDirectory)\TensorRT-10.2.0.19.Windows10.x86_64.cuda-11.8 --cuda_home=$(Agent.TempDirectory)\v11.8 --cmake_extra_defines "CMAKE_CUDA_ARCHITECTURES=52;60;61;70;75;80" ${{ if eq(parameters.cuda_version, '12.2') }}: - EP_BUILD_FLAGS: --enable_lto --use_tensorrt --tensorrt_home=$(Agent.TempDirectory)\TensorRT-10.0.1.6.Windows10.x86_64.cuda-12.4 --cuda_home=$(Agent.TempDirectory)\v12.2 --cmake_extra_defines "CMAKE_CUDA_ARCHITECTURES=52;60;61;70;75;80" + EP_BUILD_FLAGS: --enable_lto --use_tensorrt --tensorrt_home=$(Agent.TempDirectory)\TensorRT-10.2.0.19.Windows10.x86_64.cuda-12.5 --cuda_home=$(Agent.TempDirectory)\v12.2 --cmake_extra_defines "CMAKE_CUDA_ARCHITECTURES=52;60;61;70;75;80" - ${{ if eq(parameters.enable_linux_gpu, true) }}: - template: ../templates/py-linux-gpu.yml @@ -79,7 +79,7 @@ stages: cuda_version: ${{ parameters.cuda_version }} ${{ if eq(parameters.cuda_version, '11.8') }}: docker_base_image: onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda11_x64_almalinux8_gcc11:20240531.1 - trt_version: 10.0.1.6-1.cuda11.8 + trt_version: 10.2.0.19-1.cuda11.8 ${{ if eq(parameters.cuda_version, '12.2') }}: docker_base_image: onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda12_x64_ubi8_gcc12:20240610.1 - trt_version: 10.0.1.6-1.cuda12.4 + trt_version: 10.2.0.19-1.cuda12.5 diff --git 
a/tools/ci_build/github/azure-pipelines/templates/jobs/download_win_gpu_library.yml b/tools/ci_build/github/azure-pipelines/templates/jobs/download_win_gpu_library.yml index 0dd9ffd5282e7..de29a3de9fded 100644 --- a/tools/ci_build/github/azure-pipelines/templates/jobs/download_win_gpu_library.yml +++ b/tools/ci_build/github/azure-pipelines/templates/jobs/download_win_gpu_library.yml @@ -13,10 +13,10 @@ parameters: - 12.2 - name: TrtVersion type: string - default: '10.0.1.6' + default: '10.2.0.19' values: - 8.6.1.6 - - 10.0.1.6 + - 10.2.0.19 steps: - ${{ if eq(parameters.DownloadCUDA, true) }}: @@ -42,9 +42,9 @@ steps: - powershell: | Write-Host "##vso[task.setvariable variable=trtCudaVersion;]12.0" displayName: Set trtCudaVersion - - ${{ if and(eq(parameters.CudaVersion, '12.2'), eq(parameters.TrtVersion, '10.0.1.6')) }}: + - ${{ if and(eq(parameters.CudaVersion, '12.2'), eq(parameters.TrtVersion, '10.2.0.19')) }}: - powershell: | - Write-Host "##vso[task.setvariable variable=trtCudaVersion;]12.4" + Write-Host "##vso[task.setvariable variable=trtCudaVersion;]12.5" displayName: Set trtCudaVersion - script: | diff --git a/tools/ci_build/github/azure-pipelines/templates/jobs/set-winenv.yml b/tools/ci_build/github/azure-pipelines/templates/jobs/set-winenv.yml index 6c82958fc0b78..63d521f1e7d9a 100644 --- a/tools/ci_build/github/azure-pipelines/templates/jobs/set-winenv.yml +++ b/tools/ci_build/github/azure-pipelines/templates/jobs/set-winenv.yml @@ -24,17 +24,11 @@ steps: displayName: 'Download Secondary CUDA SDK v${{ parameters.SecondaryCUDAVersion }}' - ${{ if eq(parameters.DownloadTRT, 'true') }}: - powershell: | - azcopy.exe cp --recursive "https://lotusscus.blob.core.windows.net/models/local/TensorRT-8.6.1.6.Windows10.x86_64.cuda-11.8" $(Agent.TempDirectory) - displayName: 'Download TensorRT-8.6.1.6.Windows10.x86_64.cuda-11.8' + azcopy.exe cp --recursive "https://lotusscus.blob.core.windows.net/models/local/TensorRT-10.2.0.19.Windows10.x86_64.cuda-11.8" $(Agent.TempDirectory) + displayName: 'Download TensorRT-10.2.0.19.Windows10.x86_64.cuda-11.8' - powershell: | - azcopy.exe cp --recursive "https://lotusscus.blob.core.windows.net/models/local/TensorRT-8.6.1.6.Windows10.x86_64.cuda-12.0" $(Agent.TempDirectory) - displayName: 'Download TensorRT-8.6.1.6.Windows10.x86_64.cuda-12.0' - - powershell: | - azcopy.exe cp --recursive "https://lotusscus.blob.core.windows.net/models/local/TensorRT-10.0.1.6.Windows10.x86_64.cuda-11.8" $(Agent.TempDirectory) - displayName: 'Download TensorRT-10.0.1.6.Windows10.x86_64.cuda-11.8' - - powershell: | - azcopy.exe cp --recursive "https://lotusscus.blob.core.windows.net/models/local/TensorRT-10.0.1.6.Windows10.x86_64.cuda-12.4" $(Agent.TempDirectory) - displayName: 'Download TensorRT-10.0.1.6.Windows10.x86_64.cuda-12.4' + azcopy.exe cp --recursive "https://lotusscus.blob.core.windows.net/models/local/TensorRT-10.2.0.19.Windows10.x86_64.cuda-12.5" $(Agent.TempDirectory) + displayName: 'Download TensorRT-10.2.0.19.Windows10.x86_64.cuda-12.5' - task: BatchScript@1 displayName: 'setup env' diff --git a/tools/ci_build/github/azure-pipelines/templates/py-linux-gpu.yml b/tools/ci_build/github/azure-pipelines/templates/py-linux-gpu.yml index 97f95797be1f1..6c66cceb33d5c 100644 --- a/tools/ci_build/github/azure-pipelines/templates/py-linux-gpu.yml +++ b/tools/ci_build/github/azure-pipelines/templates/py-linux-gpu.yml @@ -22,10 +22,10 @@ parameters: - name: trt_version type: string - default: '10.0.1.6-1.cuda11.8' + default: '10.2.0.19-1.cuda11.8' values: - - 
10.0.1.6-1.cuda11.8 - - 10.0.1.6-1.cuda12.4 + - 10.2.0.19-1.cuda11.8 + - 10.2.0.19-1.cuda12.5 - name: cuda_version type: string default: '11.8' diff --git a/tools/ci_build/github/azure-pipelines/templates/py-packaging-linux-test-cuda.yml b/tools/ci_build/github/azure-pipelines/templates/py-packaging-linux-test-cuda.yml index 3081624225b12..8eca22c8c123f 100644 --- a/tools/ci_build/github/azure-pipelines/templates/py-packaging-linux-test-cuda.yml +++ b/tools/ci_build/github/azure-pipelines/templates/py-packaging-linux-test-cuda.yml @@ -18,10 +18,10 @@ parameters: - name: trt_version type: string - default: '10.0.1.6-1.cuda11.8' + default: '10.2.0.19-1.cuda11.8' values: - - 10.0.1.6-1.cuda11.8 - - 10.0.1.6-1.cuda12.4 + - 10.2.0.19-1.cuda11.8 + - 10.2.0.19-1.cuda12.5 - name: cuda_version type: string default: '11.8' diff --git a/tools/ci_build/github/azure-pipelines/templates/py-packaging-selectable-stage.yml b/tools/ci_build/github/azure-pipelines/templates/py-packaging-selectable-stage.yml index 3f1c4ef0f8d61..47980955b8798 100644 --- a/tools/ci_build/github/azure-pipelines/templates/py-packaging-selectable-stage.yml +++ b/tools/ci_build/github/azure-pipelines/templates/py-packaging-selectable-stage.yml @@ -381,7 +381,7 @@ stages: variables: CUDA_VERSION: '11.8' buildArch: x64 - EpBuildFlags: --use_tensorrt --tensorrt_home="$(Agent.TempDirectory)\TensorRT-10.0.1.6.Windows10.x86_64.cuda-11.8" --cuda_version=$(CUDA_VERSION) --cuda_home="C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v$(CUDA_VERSION)" --cmake_extra_defines "CMAKE_CUDA_ARCHITECTURES=37;50;52;60;61;70;75;80" + EpBuildFlags: --use_tensorrt --tensorrt_home="$(Agent.TempDirectory)\TensorRT-10.2.0.19.Windows10.x86_64.cuda-11.8" --cuda_version=$(CUDA_VERSION) --cuda_home="C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v$(CUDA_VERSION)" --cmake_extra_defines "CMAKE_CUDA_ARCHITECTURES=37;50;52;60;61;70;75;80" EnvSetupScript: setup_env_gpu.bat EP_NAME: gpu VSGenerator: 'Visual Studio 17 2022' diff --git a/tools/ci_build/github/azure-pipelines/templates/py-packaging-stage.yml b/tools/ci_build/github/azure-pipelines/templates/py-packaging-stage.yml index 9e14789f3b234..27f85dc5c1648 100644 --- a/tools/ci_build/github/azure-pipelines/templates/py-packaging-stage.yml +++ b/tools/ci_build/github/azure-pipelines/templates/py-packaging-stage.yml @@ -288,7 +288,7 @@ stages: parameters: MACHINE_POOL: 'onnxruntime-Win2022-GPU-A10' PYTHON_VERSION: '3.8' - EP_BUILD_FLAGS: --use_tensorrt --tensorrt_home="$(Agent.TempDirectory)\TensorRT-10.0.1.6.Windows10.x86_64.cuda-11.8" --cuda_home="$(Agent.TempDirectory)\v11.8" --cmake_extra_defines "CMAKE_CUDA_ARCHITECTURES=52;60;61;70;75;80" + EP_BUILD_FLAGS: --use_tensorrt --tensorrt_home="$(Agent.TempDirectory)\TensorRT-10.2.0.19.Windows10.x86_64.cuda-11.8" --cuda_home="$(Agent.TempDirectory)\v11.8" --cmake_extra_defines "CMAKE_CUDA_ARCHITECTURES=52;60;61;70;75;80" ENV_SETUP_SCRIPT: setup_env_gpu.bat EP_NAME: gpu publish_symbols: ${{ parameters.publish_symbols }} @@ -298,7 +298,7 @@ stages: parameters: MACHINE_POOL: 'onnxruntime-Win2022-GPU-A10' PYTHON_VERSION: '3.9' - EP_BUILD_FLAGS: --use_tensorrt --tensorrt_home="$(Agent.TempDirectory)\TensorRT-10.0.1.6.Windows10.x86_64.cuda-11.8" --cuda_home="$(Agent.TempDirectory)\v11.8" --cmake_extra_defines "CMAKE_CUDA_ARCHITECTURES=52;60;61;70;75;80" + EP_BUILD_FLAGS: --use_tensorrt --tensorrt_home="$(Agent.TempDirectory)\TensorRT-10.2.0.19.Windows10.x86_64.cuda-11.8" --cuda_home="$(Agent.TempDirectory)\v11.8" --cmake_extra_defines 
"CMAKE_CUDA_ARCHITECTURES=52;60;61;70;75;80" ENV_SETUP_SCRIPT: setup_env_gpu.bat EP_NAME: gpu publish_symbols: ${{ parameters.publish_symbols }} @@ -308,7 +308,7 @@ stages: parameters: MACHINE_POOL: 'onnxruntime-Win2022-GPU-A10' PYTHON_VERSION: '3.10' - EP_BUILD_FLAGS: --use_tensorrt --tensorrt_home="$(Agent.TempDirectory)\TensorRT-10.0.1.6.Windows10.x86_64.cuda-11.8" --cuda_home="$(Agent.TempDirectory)\v11.8" --cmake_extra_defines "CMAKE_CUDA_ARCHITECTURES=52;60;61;70;75;80" + EP_BUILD_FLAGS: --use_tensorrt --tensorrt_home="$(Agent.TempDirectory)\TensorRT-10.2.0.19.Windows10.x86_64.cuda-11.8" --cuda_home="$(Agent.TempDirectory)\v11.8" --cmake_extra_defines "CMAKE_CUDA_ARCHITECTURES=52;60;61;70;75;80" ENV_SETUP_SCRIPT: setup_env_gpu.bat EP_NAME: gpu publish_symbols: ${{ parameters.publish_symbols }} @@ -318,7 +318,7 @@ stages: parameters: MACHINE_POOL: 'onnxruntime-Win2022-GPU-A10' PYTHON_VERSION: '3.11' - EP_BUILD_FLAGS: --use_tensorrt --tensorrt_home="$(Agent.TempDirectory)\TensorRT-10.0.1.6.Windows10.x86_64.cuda-11.8" --cuda_home="$(Agent.TempDirectory)\v11.8" --cmake_extra_defines "CMAKE_CUDA_ARCHITECTURES=52;60;61;70;75;80" + EP_BUILD_FLAGS: --use_tensorrt --tensorrt_home="$(Agent.TempDirectory)\TensorRT-10.2.0.19.Windows10.x86_64.cuda-11.8" --cuda_home="$(Agent.TempDirectory)\v11.8" --cmake_extra_defines "CMAKE_CUDA_ARCHITECTURES=52;60;61;70;75;80" ENV_SETUP_SCRIPT: setup_env_gpu.bat EP_NAME: gpu publish_symbols: ${{ parameters.publish_symbols }} @@ -328,7 +328,7 @@ stages: parameters: MACHINE_POOL: 'onnxruntime-Win2022-GPU-A10' PYTHON_VERSION: '3.12' - EP_BUILD_FLAGS: --use_tensorrt --tensorrt_home="$(Agent.TempDirectory)\TensorRT-10.0.1.6.Windows10.x86_64.cuda-11.8" --cuda_home="$(Agent.TempDirectory)\v11.8" --cmake_extra_defines "CMAKE_CUDA_ARCHITECTURES=52;60;61;70;75;80" + EP_BUILD_FLAGS: --use_tensorrt --tensorrt_home="$(Agent.TempDirectory)\TensorRT-10.2.0.19.Windows10.x86_64.cuda-11.8" --cuda_home="$(Agent.TempDirectory)\v11.8" --cmake_extra_defines "CMAKE_CUDA_ARCHITECTURES=52;60;61;70;75;80" ENV_SETUP_SCRIPT: setup_env_gpu.bat EP_NAME: gpu publish_symbols: ${{ parameters.publish_symbols }} @@ -498,7 +498,7 @@ stages: docker_base_image: onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda11_x64_almalinux8_gcc11:20240531.1 extra_build_arg: ${{ parameters.build_py_parameters }} cmake_build_type: ${{ parameters.cmake_build_type }} - trt_version: '10.0.1.6-1.cuda11.8' + trt_version: '10.2.0.19-1.cuda11.8' cuda_version: '11.8' - ${{ if eq(parameters.enable_windows_arm64_qnn, true) }}: diff --git a/tools/ci_build/github/azure-pipelines/templates/use-android-emulator.yml b/tools/ci_build/github/azure-pipelines/templates/use-android-emulator.yml index b31882c8da18f..4251a8401f8f0 100644 --- a/tools/ci_build/github/azure-pipelines/templates/use-android-emulator.yml +++ b/tools/ci_build/github/azure-pipelines/templates/use-android-emulator.yml @@ -15,6 +15,25 @@ parameters: steps: - ${{ if eq(parameters.create, true) }}: + - script: | + if [[ ":$PATH:" == *":${ANDROID_SDK_ROOT}/emulator:"* ]]; then + echo "${ANDROID_SDK_ROOT}/emulator is in PATH" + else + ${ANDROID_SDK_ROOT}/cmdline-tools/latest/bin/sdkmanager --install "emulator" + echo "##vso[task.prependpath]${ANDROID_SDK_ROOT}/emulator" + fi + displayName: Check if emulator are installed and add to PATH + + - script: | + if [[ ":$PATH:" == *":${ANDROID_SDK_ROOT}/platform-tools:"* ]]; then + echo "${ANDROID_SDK_ROOT}/platform-tools is in PATH" + else + ${ANDROID_SDK_ROOT}/cmdline-tools/latest/bin/sdkmanager 
--install "platform-tools" + echo "##vso[task.prependpath]${ANDROID_SDK_ROOT}/platform-tools" + fi + ls -R ${ANDROID_SDK_ROOT}/platform-tools + displayName: Check if platform tools are installed and add to PATH + - script: | set -e -x python3 tools/python/run_android_emulator.py \ diff --git a/tools/ci_build/github/azure-pipelines/win-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/win-ci-pipeline.yml index 39e68f5631f01..7d64f78c695fa 100644 --- a/tools/ci_build/github/azure-pipelines/win-ci-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/win-ci-pipeline.yml @@ -137,6 +137,25 @@ stages: WITH_CACHE: false MachinePool: 'onnxruntime-Win-CPU-2022' +# Build only. Does not run any tests. +- stage: x64_release_vitisai + dependsOn: [] + jobs: + - template: templates/jobs/win-ci-vs-2022-job.yml + parameters: + BuildConfig: 'RelWithDebInfo' + buildArch: x64 + additionalBuildFlags: --build_wheel --use_vitisai + msbuildPlatform: x64 + isX86: false + job_name_suffix: x64_release + RunOnnxRuntimeTests: false + isTraining: false + ORT_EP_NAME: VITISAI + GenerateDocumentation: false + WITH_CACHE: false + MachinePool: 'onnxruntime-Win-CPU-2022' + - stage: x64_release_winml dependsOn: [] jobs: diff --git a/tools/ci_build/github/azure-pipelines/win-gpu-tensorrt-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/win-gpu-tensorrt-ci-pipeline.yml index 1af00da01241a..70c0c7d4a04e7 100644 --- a/tools/ci_build/github/azure-pipelines/win-gpu-tensorrt-ci-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/win-gpu-tensorrt-ci-pipeline.yml @@ -55,7 +55,7 @@ jobs: WithCache: True Today: $(TODAY) AdditionalKey: "gpu-tensorrt | RelWithDebInfo" - BuildPyArguments: '--config RelWithDebInfo --parallel --use_binskim_compliant_compile_flags --build_dir $(Build.BinariesDirectory) --skip_submodule_sync --build_shared_lib --update --cmake_generator "Visual Studio 17 2022" --build_wheel --enable_onnx_tests --use_tensorrt --tensorrt_home="$(Agent.TempDirectory)\TensorRT-10.0.1.6.Windows10.x86_64.cuda-11.8" --cuda_home="$(Agent.TempDirectory)\v11.8" --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=86' + BuildPyArguments: '--config RelWithDebInfo --parallel --use_binskim_compliant_compile_flags --build_dir $(Build.BinariesDirectory) --skip_submodule_sync --build_shared_lib --update --cmake_generator "Visual Studio 17 2022" --build_wheel --enable_onnx_tests --use_tensorrt --tensorrt_home="$(Agent.TempDirectory)\TensorRT-10.2.0.19.Windows10.x86_64.cuda-11.8" --cuda_home="$(Agent.TempDirectory)\v11.8" --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=86' MsbuildArguments: $(MsbuildArguments) BuildArch: 'x64' Platform: 'x64' @@ -75,7 +75,7 @@ jobs: del wheel_filename_file python.exe -m pip install -q --upgrade %WHEEL_FILENAME% set PATH=$(Build.BinariesDirectory)\RelWithDebInfo\RelWithDebInfo;%PATH% - python $(Build.SourcesDirectory)\tools\ci_build\build.py --config RelWithDebInfo --use_binskim_compliant_compile_flags --build_dir $(Build.BinariesDirectory) --skip_submodule_sync --build_shared_lib --test --cmake_generator "Visual Studio 17 2022" --build_wheel --enable_onnx_tests --use_tensorrt --tensorrt_home="$(Agent.TempDirectory)\TensorRT-10.0.1.6.Windows10.x86_64.cuda-11.8" --cuda_home="$(Agent.TempDirectory)\v11.8" --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=75 + python $(Build.SourcesDirectory)\tools\ci_build\build.py --config RelWithDebInfo --use_binskim_compliant_compile_flags --build_dir $(Build.BinariesDirectory) --skip_submodule_sync --build_shared_lib --test --cmake_generator "Visual Studio 17 2022" 
--build_wheel --enable_onnx_tests --use_tensorrt --tensorrt_home="$(Agent.TempDirectory)\TensorRT-10.2.0.19.Windows10.x86_64.cuda-11.8" --cuda_home="$(Agent.TempDirectory)\v11.8" --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=75 workingDirectory: '$(Build.BinariesDirectory)\RelWithDebInfo\RelWithDebInfo' displayName: 'Run tests' diff --git a/tools/ci_build/github/linux/docker/Dockerfile.package_ubi8_cuda_tensorrt10_0 b/tools/ci_build/github/linux/docker/Dockerfile.package_ubi8_cuda_tensorrt10_0 index 86c178aae519b..2d3dc05285e3c 100644 --- a/tools/ci_build/github/linux/docker/Dockerfile.package_ubi8_cuda_tensorrt10_0 +++ b/tools/ci_build/github/linux/docker/Dockerfile.package_ubi8_cuda_tensorrt10_0 @@ -6,7 +6,7 @@ # Build base image with required system packages ARG BASEIMAGE=nvidia/cuda:11.8.0-cudnn8-devel-ubi8 -ARG TRT_VERSION=10.0.1.6-1.cuda11.8 +ARG TRT_VERSION=10.2.0.19-1.cuda11.8 FROM $BASEIMAGE AS base ARG TRT_VERSION ENV PATH /opt/python/cp38-cp38/bin:/usr/local/nvidia/bin:/usr/local/cuda/bin:/usr/src/tensorrt/bin:${PATH} diff --git a/tools/ci_build/github/linux/docker/Dockerfile.package_ubi8_cuda_tensorrt10_0_torch b/tools/ci_build/github/linux/docker/Dockerfile.package_ubi8_cuda_tensorrt10_0_torch index 4542d3a3f2e4c..a50788e98ffe0 100644 --- a/tools/ci_build/github/linux/docker/Dockerfile.package_ubi8_cuda_tensorrt10_0_torch +++ b/tools/ci_build/github/linux/docker/Dockerfile.package_ubi8_cuda_tensorrt10_0_torch @@ -6,7 +6,7 @@ # Build base image with required system packages ARG BASEIMAGE=nvidia/cuda:11.8.0-cudnn8-devel-ubi8 -ARG TRT_VERSION=10.0.1.6-1.cuda11.8 +ARG TRT_VERSION=10.2.0.19-1.cuda11.8 FROM $BASEIMAGE AS base ARG TRT_VERSION ENV PATH /opt/python/cp38-cp38/bin:/usr/local/nvidia/bin:/usr/local/cuda/bin:/usr/src/tensorrt/bin:${PATH} diff --git a/tools/ci_build/github/linux/docker/Dockerfile.package_ubuntu_2004_gpu b/tools/ci_build/github/linux/docker/Dockerfile.package_ubuntu_2004_gpu index 5ef56fd885ca7..1aca3e305452d 100644 --- a/tools/ci_build/github/linux/docker/Dockerfile.package_ubuntu_2004_gpu +++ b/tools/ci_build/github/linux/docker/Dockerfile.package_ubuntu_2004_gpu @@ -6,7 +6,7 @@ # Build base image with required system packages ARG BASEIMAGE=nvidia/cuda:11.8.0-cudnn8-devel-ubuntu20.04 -ARG TRT_VERSION=10.0.1.6-1+cuda11.8 +ARG TRT_VERSION=10.2.0.19-1+cuda11.8 ARG LD_LIBRARY_PATH_ARG=/usr/local/lib64:/usr/local/cuda/lib64 FROM $BASEIMAGE AS base ARG TRT_VERSION diff --git a/tools/ci_build/github/linux/docker/Dockerfile.package_ubuntu_2004_gpu_ffmpeg b/tools/ci_build/github/linux/docker/Dockerfile.package_ubuntu_2004_gpu_ffmpeg index 194a22850030c..5697120a48b2b 100644 --- a/tools/ci_build/github/linux/docker/Dockerfile.package_ubuntu_2004_gpu_ffmpeg +++ b/tools/ci_build/github/linux/docker/Dockerfile.package_ubuntu_2004_gpu_ffmpeg @@ -6,7 +6,7 @@ # Build base image with required system packages ARG BASEIMAGE=nvidia/cuda:11.8.0-cudnn8-devel-ubuntu20.04 -ARG TRT_VERSION=10.0.1.6-1+cuda11.8 +ARG TRT_VERSION=10.2.0.19-1+cuda11.8 ARG LD_LIBRARY_PATH_ARG=/usr/local/lib64:/usr/local/cuda/lib64 FROM $BASEIMAGE AS base ARG TRT_VERSION diff --git a/tools/ci_build/github/linux/docker/Dockerfile.ubuntu_cuda11_6_tensorrt8_4 b/tools/ci_build/github/linux/docker/Dockerfile.ubuntu_cuda11_6_tensorrt8_4 deleted file mode 100644 index 8b32425afce1c..0000000000000 --- a/tools/ci_build/github/linux/docker/Dockerfile.ubuntu_cuda11_6_tensorrt8_4 +++ /dev/null @@ -1,63 +0,0 @@ -# -------------------------------------------------------------- -# Copyright (c) Microsoft Corporation. 
All rights reserved. -# Licensed under the MIT License. -# -------------------------------------------------------------- -# Dockerfile to run ONNXRuntime with TensorRT integration - -FROM nvidia/cuda:11.6.1-cudnn8-devel-ubuntu20.04 - - -# ONNX Runtime Variables -ARG ONNXRUNTIME_REPO=https://github.com/Microsoft/onnxruntime -ARG ONNXRUNTIME_BRANCH=main -ARG CMAKE_CUDA_ARCHITECTURES=37;50;52;60;61;70;75;80 - -ENV PATH /usr/local/nvidia/bin:/usr/local/cuda/bin:/usr/src/tensorrt/bin:/code/cmake-3.27.3-linux-x86_64/bin:/opt/miniconda/bin:${PATH} - -ENV DEBIAN_FRONTEND=noninteractive - -RUN apt-get update &&\ - apt-get install -y sudo git bash unattended-upgrades wget -RUN unattended-upgrade - -# Install python3 -RUN apt-get install -y --no-install-recommends \ - python3 \ - python3-pip \ - python3-dev \ - python3-wheel &&\ - cd /usr/local/bin &&\ - ln -s /usr/bin/python3 python &&\ - ln -s /usr/bin/pip3 pip; - -RUN pip install --upgrade pip -RUN pip install setuptools>=68.2.2 - -# Install TensorRT -RUN v="8.4.1-1+cuda11.6" &&\ - apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/7fa2af80.pub &&\ - apt-get update &&\ - sudo apt-get install -y libnvinfer8=${v} libnvonnxparsers8=${v} libnvparsers8=${v} libnvinfer-plugin8=${v} \ - libnvinfer-dev=${v} libnvonnxparsers-dev=${v} libnvparsers-dev=${v} libnvinfer-plugin-dev=${v} \ - python3-libnvinfer=${v} libnvinfer-samples=${v} - -# Compile trtexec -RUN cd /usr/src/tensorrt/samples/trtexec && make - -# Install Valgrind -RUN apt-get install -y valgrind - -ARG BUILD_USER=onnxruntimedev -ARG BUILD_UID=1000 -RUN adduser --gecos 'onnxruntime Build User' --disabled-password $BUILD_USER --uid $BUILD_UID -USER $BUILD_USER -WORKDIR /code -ENV CUDA_MODULE_LOADING "LAZY" - -# Prepare onnxruntime repository & build onnxruntime with TensorRT -RUN git clone --single-branch --branch ${ONNXRUNTIME_BRANCH} --recursive ${ONNXRUNTIME_REPO} onnxruntime &&\ - /bin/sh onnxruntime/dockerfiles/scripts/install_common_deps.sh &&\ - cd onnxruntime &&\ - /bin/sh build.sh --parallel --build_shared_lib --cuda_home /usr/local/cuda --cudnn_home /usr/lib/x86_64-linux-gnu/ --use_tensorrt --tensorrt_home /usr/lib/x86_64-linux-gnu/ --config Release --build_wheel --skip_tests --skip_submodule_sync --cmake_extra_defines '"CMAKE_CUDA_ARCHITECTURES='${CMAKE_CUDA_ARCHITECTURES}'"' &&\ - pip install /code/onnxruntime/build/Linux/Release/dist/*.whl &&\ - cd .. diff --git a/tools/ci_build/github/linux/docker/Dockerfile.ubuntu_cuda11_8_tensorrt8_5 b/tools/ci_build/github/linux/docker/Dockerfile.ubuntu_cuda11_8_tensorrt8_5 deleted file mode 100644 index cfc7023ef8e61..0000000000000 --- a/tools/ci_build/github/linux/docker/Dockerfile.ubuntu_cuda11_8_tensorrt8_5 +++ /dev/null @@ -1,92 +0,0 @@ -# -------------------------------------------------------------- -# Copyright (c) Microsoft Corporation. All rights reserved. -# Licensed under the MIT License. 
-# -------------------------------------------------------------- -# Dockerfile to run ONNXRuntime with TensorRT integration - -# Build base image with required system packages -FROM nvidia/cuda:11.8.0-cudnn8-devel-ubuntu20.04 AS base - -# The local directory into which to build and install CMAKE -ARG ONNXRUNTIME_LOCAL_CODE_DIR=/code - -ENV PATH /usr/local/nvidia/bin:/usr/local/cuda/bin:/usr/src/tensorrt/bin:${ONNXRUNTIME_LOCAL_CODE_DIR}/cmake-3.27.3-linux-x86_64/bin:/opt/miniconda/bin:${PATH} -ENV DEBIAN_FRONTEND=noninteractive - -RUN apt-get update &&\ - apt-get install -y sudo git bash unattended-upgrades wget -RUN unattended-upgrade - -# Install python3 -RUN apt-get install -y --no-install-recommends \ - python3 \ - python3-pip \ - python3-dev \ - python3-wheel &&\ - cd /usr/local/bin &&\ - ln -s /usr/bin/python3 python &&\ - ln -s /usr/bin/pip3 pip; - -RUN pip install --upgrade pip -RUN pip install setuptools>=68.2.2 - -# Install TensorRT -RUN v="8.5.1-1+cuda11.8" &&\ - apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/7fa2af80.pub &&\ - apt-get update &&\ - sudo apt-get install -y libnvinfer8=${v} libnvonnxparsers8=${v} libnvparsers8=${v} libnvinfer-plugin8=${v} \ - libnvinfer-dev=${v} libnvonnxparsers-dev=${v} libnvparsers-dev=${v} libnvinfer-plugin-dev=${v} \ - python3-libnvinfer=${v} libnvinfer-samples=${v} - -# Compile trtexec -RUN cd /usr/src/tensorrt/samples/trtexec && make - -# Install Valgrind -RUN apt-get install -y valgrind - -# Build final image from base. Builds ORT. -FROM base as final -ARG BUILD_USER=onnxruntimedev -ARG BUILD_UID=1000 -RUN adduser --gecos 'onnxruntime Build User' --disabled-password $BUILD_USER --uid $BUILD_UID -USER $BUILD_USER - -# ONNX Runtime arguments - -# URL to the github repo from which to clone ORT. -ARG ONNXRUNTIME_REPO=https://github.com/Microsoft/onnxruntime - -# The local directory into which to clone ORT. -ARG ONNXRUNTIME_LOCAL_CODE_DIR=/code - -# The git branch of ORT to checkout and build. -ARG ONNXRUNTIME_BRANCH=main - -# Optional. The specific commit to pull and build from. If not set, the latest commit is used. -ARG ONNXRUNTIME_COMMIT_ID - -# The supported CUDA architecture -ARG CMAKE_CUDA_ARCHITECTURES=37;50;52;60;61;70;75;80 - -WORKDIR ${ONNXRUNTIME_LOCAL_CODE_DIR} - -# Clone ORT repository with branch -RUN git clone --single-branch --branch ${ONNXRUNTIME_BRANCH} --recursive ${ONNXRUNTIME_REPO} onnxruntime &&\ - /bin/sh onnxruntime/dockerfiles/scripts/install_common_deps.sh - -WORKDIR ${ONNXRUNTIME_LOCAL_CODE_DIR}/onnxruntime - -# Reset to a specific commit if specified by build args. 
-RUN if [ -z "$ONNXRUNTIME_COMMIT_ID" ] ; then echo "Building branch ${ONNXRUNTIME_BRANCH}" ;\ - else echo "Building branch ${ONNXRUNTIME_BRANCH} @ commit ${ONNXRUNTIME_COMMIT_ID}" &&\ - git reset --hard ${ONNXRUNTIME_COMMIT_ID} && git submodule update --recursive ; fi - -# Build ORT -ENV CUDA_MODULE_LOADING "LAZY" -RUN /bin/sh build.sh --parallel --build_shared_lib --cuda_home /usr/local/cuda --cudnn_home /usr/lib/x86_64-linux-gnu/ --use_tensorrt --tensorrt_home /usr/lib/x86_64-linux-gnu/ --config Release --build_wheel --skip_tests --skip_submodule_sync --cmake_extra_defines '"CMAKE_CUDA_ARCHITECTURES='${CMAKE_CUDA_ARCHITECTURES}'"' - -# Switch to root to continue following steps of CI -USER root - -# Intall ORT wheel -RUN pip install ${ONNXRUNTIME_LOCAL_CODE_DIR}/onnxruntime/build/Linux/Release/dist/*.whl \ No newline at end of file diff --git a/tools/ci_build/github/linux/docker/Dockerfile.ubuntu_cuda11_8_tensorrt10_0 b/tools/ci_build/github/linux/docker/Dockerfile.ubuntu_cuda11_tensorrt10 similarity index 99% rename from tools/ci_build/github/linux/docker/Dockerfile.ubuntu_cuda11_8_tensorrt10_0 rename to tools/ci_build/github/linux/docker/Dockerfile.ubuntu_cuda11_tensorrt10 index cd168e1911d95..0bd56a1a5873f 100644 --- a/tools/ci_build/github/linux/docker/Dockerfile.ubuntu_cuda11_8_tensorrt10_0 +++ b/tools/ci_build/github/linux/docker/Dockerfile.ubuntu_cuda11_tensorrt10 @@ -31,7 +31,7 @@ RUN pip install --upgrade pip RUN pip install psutil setuptools>=68.2.2 # Install TensorRT -RUN version="10.0.1.6-1+cuda11.8" &&\ +RUN version="10.2.0.19-1+cuda11.8" &&\ apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/7fa2af80.pub &&\ apt-get update &&\ apt-get install -y \ diff --git a/tools/ci_build/github/linux/docker/Dockerfile.ubuntu_cuda12_4_tensorrt10_0 b/tools/ci_build/github/linux/docker/Dockerfile.ubuntu_cuda12_tensorrt10 similarity index 83% rename from tools/ci_build/github/linux/docker/Dockerfile.ubuntu_cuda12_4_tensorrt10_0 rename to tools/ci_build/github/linux/docker/Dockerfile.ubuntu_cuda12_tensorrt10 index 3e48415118c63..7f66943dd8745 100644 --- a/tools/ci_build/github/linux/docker/Dockerfile.ubuntu_cuda12_4_tensorrt10_0 +++ b/tools/ci_build/github/linux/docker/Dockerfile.ubuntu_cuda12_tensorrt10 @@ -5,7 +5,7 @@ # Dockerfile to run ONNXRuntime with TensorRT integration # Build base image with required system packages -FROM nvidia/cuda:12.4.1-devel-ubuntu20.04 AS base +FROM nvidia/cuda:12.5.1-cudnn-devel-ubuntu20.04 AS base # The local directory into which to build and install CMAKE ARG ONNXRUNTIME_LOCAL_CODE_DIR=/code @@ -30,15 +30,27 @@ RUN apt-get install -y --no-install-recommends \ RUN pip install --upgrade pip RUN pip install setuptools>=68.2.2 psutil -# Install cuDNN v9 -RUN apt-get -y install cudnn9-cuda-12 - # Install TensorRT -RUN version="10.0.1.6-1+cuda12.4" &&\ +RUN version="10.2.0.19-1+cuda12.5" &&\ apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/7fa2af80.pub &&\ apt-get update &&\ apt-get install -y \ - tensorrt=${version} + libnvinfer-dev=${version} \ + libnvinfer-dispatch-dev=${version} \ + libnvinfer-dispatch10=${version} \ + libnvinfer-headers-dev=${version} \ + libnvinfer-headers-plugin-dev=${version} \ + libnvinfer-lean-dev=${version} \ + libnvinfer-lean10=${version} \ + libnvinfer-plugin-dev=${version} \ + libnvinfer-plugin10=${version} \ + libnvinfer-vc-plugin-dev=${version} \ + libnvinfer-vc-plugin10=${version} \ + libnvinfer10=${version} \ + 
libnvonnxparsers-dev=${version} \ + libnvonnxparsers10=${version} \ + tensorrt-dev=${version} \ + libnvinfer-bin=${version} # Compile trtexec if not installed RUN if [ ! -d /usr/src/tensorrt/bin ] || [ ! -f /usr/src/tensorrt/bin/trtexec ]; then \ diff --git a/tools/ci_build/github/linux/docker/Dockerfile.ubuntu_tensorrt_bin b/tools/ci_build/github/linux/docker/Dockerfile.ubuntu_tensorrt_bin index a26bf88fbbdf6..0281c1c8fef25 100644 --- a/tools/ci_build/github/linux/docker/Dockerfile.ubuntu_tensorrt_bin +++ b/tools/ci_build/github/linux/docker/Dockerfile.ubuntu_tensorrt_bin @@ -5,7 +5,7 @@ # Dockerfile to run ONNXRuntime with TensorRT installed from provided binaries # Build base image with required system packages -FROM nvidia/cuda:12.3.1-devel-ubuntu20.04 AS base +FROM nvidia/cuda:12.5.1-cudnn-devel-ubuntu20.04 AS base # The local directory into which to build and install CMAKE ARG ONNXRUNTIME_LOCAL_CODE_DIR=/code @@ -30,9 +30,6 @@ RUN apt-get install -y --no-install-recommends \ RUN pip install --upgrade pip RUN pip install setuptools>=68.2.2 -# Install cuDNN v9 -RUN apt-get -y install cudnn9-cuda-12 - # Install TensorRT # Must provide version numbers used to build the name of the tar file containing TensorRT binaries. # See: https://docs.nvidia.com/deeplearning/tensorrt/install-guide/index.html#installing-tar diff --git a/tools/ci_build/github/linux/docker/inference/x86_64/python/cuda/Dockerfile b/tools/ci_build/github/linux/docker/inference/x86_64/python/cuda/Dockerfile index 3a7f410d3859e..a0020a9827290 100644 --- a/tools/ci_build/github/linux/docker/inference/x86_64/python/cuda/Dockerfile +++ b/tools/ci_build/github/linux/docker/inference/x86_64/python/cuda/Dockerfile @@ -5,7 +5,7 @@ ARG BASEIMAGE=nvidia/cuda:11.8.0-cudnn8-devel-ubi8 FROM $BASEIMAGE -ARG TRT_VERSION=10.0.1.6-1.cuda11.8 +ARG TRT_VERSION=10.2.0.19-1.cuda11.8 #Install TensorRT only if TRT_VERSION is not empty RUN if [ -n "${TRT_VERSION}" ]; then \ diff --git a/tools/ci_build/github/pai/rocm-ci-pipeline-env.Dockerfile b/tools/ci_build/github/pai/rocm-ci-pipeline-env.Dockerfile index b94826ae0e4bc..bf21a65314985 100644 --- a/tools/ci_build/github/pai/rocm-ci-pipeline-env.Dockerfile +++ b/tools/ci_build/github/pai/rocm-ci-pipeline-env.Dockerfile @@ -1,7 +1,7 @@ # Refer to https://github.com/RadeonOpenCompute/ROCm-docker/blob/master/dev/Dockerfile-ubuntu-22.04-complete FROM ubuntu:22.04 -ARG ROCM_VERSION=6.0 +ARG ROCM_VERSION=6.1 ARG AMDGPU_VERSION=${ROCM_VERSION} ARG APT_PREF='Package: *\nPin: release o=repo.radeon.com\nPin-Priority: 600' @@ -77,11 +77,7 @@ RUN ln -sf /usr/lib/x86_64-linux-gnu/libstdc++.so.6 ${CONDA_ENVIRONMENT_PATH}/bi RUN export MAJOR=$(cut -d '.' -f 1 <<< "$ROCM_VERSION") && \ export MINOR=$(cut -d '.' -f 2 <<< "$ROCM_VERSION") && \ export PATCH=$(cut -d '.' 
-f 3 <<< "$ROCM_VERSION") && \ - if (( MAJOR >= 6 )); then \ - pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/rocm${MAJOR}.${MINOR} ; \ - else \ - pip install torch==2.0.1 torchvision==0.15.2 -f https://repo.radeon.com/rocm/manylinux/rocm-rel-${MAJOR}.${MINOR}/ ; \ - fi && \ + pip install torch==2.1.2 torchvision==0.16.1 -f https://repo.radeon.com/rocm/manylinux/rocm-rel-${MAJOR}.${MINOR}/ && \ pip install torch-ort --no-dependencies ##### Install Cupy to decrease CPU utilization diff --git a/tools/ci_build/github/windows/post_to_dashboard/requirements.txt b/tools/ci_build/github/windows/post_to_dashboard/requirements.txt index b8c00a610b781..6ece3c1f92c4e 100644 --- a/tools/ci_build/github/windows/post_to_dashboard/requirements.txt +++ b/tools/ci_build/github/windows/post_to_dashboard/requirements.txt @@ -1,2 +1,2 @@ -azure-kusto-data[pandas]==3.0.1 -azure-kusto-ingest[pandas]==3.0.1 +azure-kusto-data[pandas]==4.5.1 +azure-kusto-ingest[pandas]==4.5.1 diff --git a/tools/ci_build/github/windows/setup_env_gpu.bat b/tools/ci_build/github/windows/setup_env_gpu.bat index b753cdae16b90..6c59866ea925a 100644 --- a/tools/ci_build/github/windows/setup_env_gpu.bat +++ b/tools/ci_build/github/windows/setup_env_gpu.bat @@ -6,10 +6,10 @@ if exist PATH=%AGENT_TEMPDIRECTORY%\v11.8\ ( ) else ( set PATH=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.8\bin;C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.8\extras\CUPTI\lib64;%PATH% ) -set PATH=%AGENT_TEMPDIRECTORY%\TensorRT-10.0.1.6.Windows10.x86_64.cuda-11.8\lib;%PATH% +set PATH=%AGENT_TEMPDIRECTORY%\TensorRT-10.2.0.19.Windows10.x86_64.cuda-11.8\lib;%PATH% @REM The default version is still cuda v11.8, because set cuda v12.2 after it -set PATH=%PATH%;%AGENT_TEMPDIRECTORY%\TensorRT-10.0.1.6.Windows10.x86_64.cuda-12.4\lib +set PATH=%PATH%;%AGENT_TEMPDIRECTORY%\TensorRT-10.2.0.19.Windows10.x86_64.cuda-12.5\lib if exist PATH=%AGENT_TEMPDIRECTORY%\v12.2\ ( set PATH=%PATH%;%AGENT_TEMPDIRECTORY%\v12.2\bin;%AGENT_TEMPDIRECTORY%\v12.2\extras\CUPTI\lib64 ) else ( diff --git a/tools/ci_build/github/windows/setup_env_trt.bat b/tools/ci_build/github/windows/setup_env_trt.bat index 4e43b5999a315..249bb98815897 100644 --- a/tools/ci_build/github/windows/setup_env_trt.bat +++ b/tools/ci_build/github/windows/setup_env_trt.bat @@ -6,6 +6,6 @@ if exist PATH=%AGENT_TEMPDIRECTORY%\v11.8\ ( ) else ( set PATH=%PATH%;C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.8\bin;C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.8\extras\CUPTI\lib64 ) -set PATH=%AGENT_TEMPDIRECTORY%\TensorRT-10.0.1.6.Windows10.x86_64.cuda-11.8\lib;%PATH% +set PATH=%AGENT_TEMPDIRECTORY%\TensorRT-10.2.0.19.Windows10.x86_64.cuda-11.8\lib;%PATH% set GRADLE_OPTS=-Dorg.gradle.daemon=false set CUDA_MODULE_LOADING=LAZY \ No newline at end of file