diff --git a/cmake/CMakeLists.txt b/cmake/CMakeLists.txt
index 5db0466497ad1..5555fa692eae8 100644
--- a/cmake/CMakeLists.txt
+++ b/cmake/CMakeLists.txt
@@ -1040,7 +1040,7 @@ function(onnxruntime_set_compile_flags target_name)
       # Enable warning
       target_compile_options(${target_name} PRIVATE "$<$<COMPILE_LANGUAGE:CUDA>:SHELL:--compiler-options -Wall>" "$<$<NOT:$<COMPILE_LANGUAGE:CUDA>>:-Wall>")
       target_compile_options(${target_name} PRIVATE "$<$<NOT:$<COMPILE_LANGUAGE:CUDA>>:-Wextra>")
-      if (CMAKE_CXX_COMPILER_ID STREQUAL "Clang" OR CMAKE_CXX_COMPILER_ID STREQUAL "AppleClang")
+      if (CMAKE_CXX_COMPILER_ID STREQUAL "Clang" OR CMAKE_CXX_COMPILER_ID STREQUAL "AppleClang" OR CMAKE_CXX_COMPILER_ID STREQUAL "IBMClang")
         #external/protobuf/src/google/protobuf/arena.h:445:18: error: unused parameter 'p'
         target_compile_options(${target_name} PRIVATE "-Wno-unused-parameter")
       endif()
diff --git a/cmake/external/onnxruntime_external_deps.cmake b/cmake/external/onnxruntime_external_deps.cmake
index 775576a771529..14e6ed515fd6e 100644
--- a/cmake/external/onnxruntime_external_deps.cmake
+++ b/cmake/external/onnxruntime_external_deps.cmake
@@ -46,6 +46,9 @@ if (onnxruntime_BUILD_UNIT_TESTS)
   if (CMAKE_SYSTEM_NAME STREQUAL "Emscripten")
     set(gtest_disable_pthreads ON)
   endif()
+  if (${CMAKE_SYSTEM_NAME} MATCHES "AIX")
+    set(gtest_disable_pthreads ON CACHE BOOL "gtest_disable_pthreads" FORCE)
+  endif()
   set(INSTALL_GTEST OFF CACHE BOOL "" FORCE)
   if (IOS OR ANDROID)
     # on mobile platforms the absl flags class dumps the flag names (assumably for binary size), which breaks passing
diff --git a/cmake/onnxruntime.cmake b/cmake/onnxruntime.cmake
index aebf9b53f8f05..21ae0947f3788 100644
--- a/cmake/onnxruntime.cmake
+++ b/cmake/onnxruntime.cmake
@@ -57,6 +57,7 @@ foreach(f ${ONNXRUNTIME_PROVIDER_NAMES})
   list(APPEND SYMBOL_FILES "${ONNXRUNTIME_ROOT}/core/providers/${f}/symbols.txt")
 endforeach()
 
+if(NOT ${CMAKE_SYSTEM_NAME} MATCHES "AIX")
 add_custom_command(OUTPUT ${SYMBOL_FILE} ${CMAKE_CURRENT_BINARY_DIR}/generated_source.c
   COMMAND ${Python_EXECUTABLE} "${REPO_ROOT}/tools/ci_build/gen_def.py"
     --version_file "${ONNXRUNTIME_ROOT}/../VERSION_NUMBER" --src_root "${ONNXRUNTIME_ROOT}"
@@ -66,6 +67,7 @@ add_custom_command(OUTPUT ${SYMBOL_FILE} ${CMAKE_CURRENT_BINARY_DIR}/generated_s
   WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
 
 add_custom_target(onnxruntime_generate_def ALL DEPENDS ${SYMBOL_FILE} ${CMAKE_CURRENT_BINARY_DIR}/generated_source.c)
+endif()
 if(WIN32)
   onnxruntime_add_shared_library(onnxruntime
     ${SYMBOL_FILE}
@@ -98,13 +100,21 @@ elseif(onnxruntime_BUILD_APPLE_FRAMEWORK)
     # Note: The PUBLIC_HEADER and VERSION properties for the 'onnxruntime' target will be set later in this file.
   )
 else()
-  onnxruntime_add_shared_library(onnxruntime ${CMAKE_CURRENT_BINARY_DIR}/generated_source.c)
+  if(${CMAKE_SYSTEM_NAME} MATCHES "AIX")
+    onnxruntime_add_shared_library(onnxruntime ${ONNXRUNTIME_ROOT}/core/session/onnxruntime_c_api.cc)
+  else()
+    onnxruntime_add_shared_library(onnxruntime ${CMAKE_CURRENT_BINARY_DIR}/generated_source.c )
+  endif()
   if (onnxruntime_USE_CUDA)
     set_property(TARGET onnxruntime APPEND_STRING PROPERTY LINK_FLAGS " -Xlinker -rpath=\\$ORIGIN")
   endif()
 endif()
 
-add_dependencies(onnxruntime onnxruntime_generate_def ${onnxruntime_EXTERNAL_DEPENDENCIES})
+if(${CMAKE_SYSTEM_NAME} MATCHES "AIX")
+  add_dependencies(onnxruntime ${onnxruntime_EXTERNAL_DEPENDENCIES})
+else()
+  add_dependencies(onnxruntime onnxruntime_generate_def ${onnxruntime_EXTERNAL_DEPENDENCIES})
+endif()
 target_include_directories(onnxruntime PRIVATE ${ONNXRUNTIME_ROOT} PUBLIC "$<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}/onnxruntime>")
 
 
@@ -113,7 +123,7 @@ target_compile_definitions(onnxruntime PRIVATE FILE_NAME=\"onnxruntime.dll\")
 if(UNIX)
   if (APPLE)
     set(ONNXRUNTIME_SO_LINK_FLAG " -Xlinker -dead_strip")
-  else()
+  elseif(NOT ${CMAKE_SYSTEM_NAME} MATCHES "AIX")
     set(ONNXRUNTIME_SO_LINK_FLAG " -Xlinker --version-script=${SYMBOL_FILE} -Xlinker --no-undefined -Xlinker --gc-sections -z noexecstack")
   endif()
 else()
@@ -132,7 +142,7 @@ if (NOT WIN32)
     else()
         set_target_properties(onnxruntime PROPERTIES INSTALL_RPATH "@loader_path")
     endif()
-  elseif (NOT CMAKE_SYSTEM_NAME STREQUAL "Emscripten")
+  elseif (NOT CMAKE_SYSTEM_NAME STREQUAL "Emscripten" AND NOT ${CMAKE_SYSTEM_NAME} MATCHES "AIX")
     set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} -Wl,-rpath='$ORIGIN'")
   endif()
 endif()
@@ -200,6 +210,10 @@ set(onnxruntime_INTERNAL_LIBRARIES
   onnxruntime_flatbuffers
 )
 
+if (${CMAKE_SYSTEM_NAME} MATCHES "AIX")
+  list(APPEND onnxruntime_INTERNAL_LIBRARIES  iconv)
+endif()
+
 if (onnxruntime_USE_EXTENSIONS)
   list(APPEND onnxruntime_INTERNAL_LIBRARIES
     onnxruntime_extensions
@@ -216,15 +230,22 @@ target_link_libraries(onnxruntime PRIVATE
 )
 
 set_property(TARGET onnxruntime APPEND_STRING PROPERTY LINK_FLAGS ${ONNXRUNTIME_SO_LINK_FLAG} ${onnxruntime_DELAYLOAD_FLAGS})
-
 #See: https://cmake.org/cmake/help/latest/prop_tgt/SOVERSION.html
 if(NOT APPLE AND NOT WIN32)
-  set_target_properties(onnxruntime PROPERTIES
-    PUBLIC_HEADER "${ONNXRUNTIME_PUBLIC_HEADERS}"
-    LINK_DEPENDS ${SYMBOL_FILE}
-    VERSION ${ORT_VERSION}
-    SOVERSION 1
-    FOLDER "ONNXRuntime")
+  if(${CMAKE_SYSTEM_NAME} MATCHES "AIX")
+    set_target_properties(onnxruntime PROPERTIES
+      PUBLIC_HEADER "${ONNXRUNTIME_PUBLIC_HEADERS}"
+      VERSION ${ORT_VERSION}
+      SOVERSION 1
+      FOLDER "ONNXRuntime")
+  else()
+    set_target_properties(onnxruntime PROPERTIES
+      PUBLIC_HEADER "${ONNXRUNTIME_PUBLIC_HEADERS}"
+      LINK_DEPENDS ${SYMBOL_FILE}
+      VERSION ${ORT_VERSION}
+      SOVERSION 1
+      FOLDER "ONNXRuntime")
+  endif()
 else()
   # Omit the SOVERSION setting in Windows/macOS/iOS/.. build
   set_target_properties(onnxruntime PROPERTIES
diff --git a/cmake/onnxruntime_framework.cmake b/cmake/onnxruntime_framework.cmake
index c9bf2ac5c3dc6..43d16abd8fbae 100644
--- a/cmake/onnxruntime_framework.cmake
+++ b/cmake/onnxruntime_framework.cmake
@@ -108,7 +108,7 @@ add_dependencies(onnxruntime_framework ${onnxruntime_EXTERNAL_DEPENDENCIES})
 # For the shared onnxruntime library, this is set in onnxruntime.cmake through CMAKE_SHARED_LINKER_FLAGS
 # But our test files don't use the shared library so this must be set for them.
 # For Win32 it generates an absolute path for shared providers based on the location of the executable/onnxruntime.dll
-if (UNIX AND NOT APPLE AND NOT onnxruntime_MINIMAL_BUILD AND NOT CMAKE_SYSTEM_NAME STREQUAL "Emscripten")
+if (UNIX AND NOT APPLE AND NOT onnxruntime_MINIMAL_BUILD AND NOT CMAKE_SYSTEM_NAME STREQUAL "Emscripten" AND NOT ${CMAKE_SYSTEM_NAME} MATCHES "AIX")
   set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -Wl,-rpath='$ORIGIN'")
 endif()
 
diff --git a/cmake/onnxruntime_mlas.cmake b/cmake/onnxruntime_mlas.cmake
index df6553e383620..66f4aea606ef5 100644
--- a/cmake/onnxruntime_mlas.cmake
+++ b/cmake/onnxruntime_mlas.cmake
@@ -427,12 +427,24 @@ else()
           )
           if(COMPILES_P10)
             check_cxx_source_compiles("
+              #ifdef _AIX
+              #define POWER_10       0x40000
+              #define POWER_10_ANDUP (POWER_10)
+              #include <sys/systemcfg.h>
+              #define __power_10_andup() (_system_configuration.implementation & POWER_10_ANDUP)
+              int main() {
+                bool HasP10 = (__power_10_andup() && __power_mma_version() == MMA_V31);
+                return 0;
+              }
+              #else
               #include <sys/auxv.h>
               int main() {
                 unsigned long hwcap2 = getauxval(AT_HWCAP2);
                 bool HasP10 = ((hwcap2 & PPC_FEATURE2_MMA) && (hwcap2 & PPC_FEATURE2_ARCH_3_1));
                 return 0;
-              }"
+              }
+              }
+              #endif"
               HAS_P10_RUNTIME
             )
             if (HAS_P10_RUNTIME)
diff --git a/cmake/onnxruntime_providers_cpu.cmake b/cmake/onnxruntime_providers_cpu.cmake
index d9fba6f564037..d2afe19f36691 100644
--- a/cmake/onnxruntime_providers_cpu.cmake
+++ b/cmake/onnxruntime_providers_cpu.cmake
@@ -247,7 +247,9 @@ if (NOT onnxruntime_MINIMAL_BUILD AND NOT onnxruntime_EXTENDED_MINIMAL_BUILD
   if(APPLE)
   set_property(TARGET onnxruntime_providers_shared APPEND_STRING PROPERTY LINK_FLAGS "-Xlinker -exported_symbols_list ${ONNXRUNTIME_ROOT}/core/providers/shared/exported_symbols.lst")
   elseif(UNIX)
-  set_property(TARGET onnxruntime_providers_shared APPEND_STRING PROPERTY LINK_FLAGS "-Xlinker --version-script=${ONNXRUNTIME_ROOT}/core/providers/shared/version_script.lds -Xlinker --gc-sections")
+    if(NOT ${CMAKE_SYSTEM_NAME} MATCHES "AIX")
+      set_property(TARGET onnxruntime_providers_shared APPEND_STRING PROPERTY LINK_FLAGS "-Xlinker --version-script=${ONNXRUNTIME_ROOT}/core/providers/shared/version_script.lds -Xlinker --gc-sections")
+    endif()
   elseif(WIN32)
   set_property(TARGET onnxruntime_providers_shared APPEND_STRING PROPERTY LINK_FLAGS "-DEF:${ONNXRUNTIME_ROOT}/core/providers/shared/symbols.def")
   set(ONNXRUNTIME_PROVIDERS_SHARED onnxruntime_providers_shared)
diff --git a/cmake/onnxruntime_unittests.cmake b/cmake/onnxruntime_unittests.cmake
index 711a9f77f9094..0159c35d1941b 100644
--- a/cmake/onnxruntime_unittests.cmake
+++ b/cmake/onnxruntime_unittests.cmake
@@ -1225,6 +1225,9 @@ if (NOT onnxruntime_ENABLE_TRAINING_TORCH_INTEROP)
       if (CMAKE_SYSTEM_NAME STREQUAL "Android")
         list(APPEND onnxruntime_perf_test_libs ${android_shared_libs})
       endif()
+      if (${CMAKE_SYSTEM_NAME} MATCHES "AIX")
+        list(APPEND onnxruntime_perf_test_libs onnxruntime_graph onnxruntime_session onnxruntime_providers onnxruntime_framework onnxruntime_util onnxruntime_mlas onnxruntime_optimizer onnxruntime_flatbuffers iconv re2 gtest absl_failure_signal_handler absl_examine_stack absl_flags_parse  absl_flags_usage absl_flags_usage_internal)
+    endif()
       target_link_libraries(onnxruntime_perf_test PRIVATE ${onnxruntime_perf_test_libs} Threads::Threads)
       if(WIN32)
         target_link_libraries(onnxruntime_perf_test PRIVATE debug dbghelp advapi32)
@@ -1275,6 +1278,10 @@ if (NOT onnxruntime_ENABLE_TRAINING_TORCH_INTEROP)
       list(APPEND onnxruntime_shared_lib_test_LIBS ${android_shared_libs})
     endif()
 
+    if (${CMAKE_SYSTEM_NAME} MATCHES "AIX")
+      list(APPEND onnxruntime_shared_lib_test_LIBS onnxruntime_graph onnxruntime_session onnxruntime_providers onnxruntime_framework onnxruntime_util onnxruntime_mlas onnxruntime_optimizer onnxruntime_flatbuffers iconv re2)
+    endif()
+
     AddTest(DYN
             TARGET onnxruntime_shared_lib_test
             SOURCES ${onnxruntime_shared_lib_test_SRC} ${onnxruntime_unittest_main_src}
@@ -1510,7 +1517,7 @@ if (NOT CMAKE_SYSTEM_NAME STREQUAL "Emscripten")
   if(UNIX)
     if (APPLE)
       set(ONNXRUNTIME_CUSTOM_OP_LIB_LINK_FLAG "-Xlinker -dead_strip")
-    else()
+    elseif(NOT ${CMAKE_SYSTEM_NAME} MATCHES "AIX")
       set(ONNXRUNTIME_CUSTOM_OP_LIB_LINK_FLAG "-Xlinker --version-script=${TEST_SRC_DIR}/testdata/custom_op_library/custom_op_library.lds -Xlinker --no-undefined -Xlinker --gc-sections -z noexecstack")
     endif()
   else()
@@ -1574,6 +1581,9 @@ if (NOT CMAKE_SYSTEM_NAME STREQUAL "Emscripten")
     if (onnxruntime_USE_TENSORRT)
       list(APPEND onnxruntime_customopregistration_test_LIBS ${TENSORRT_LIBRARY_INFER})
     endif()
+    if (${CMAKE_SYSTEM_NAME} MATCHES "AIX")
+      list(APPEND onnxruntime_customopregistration_test_LIBS onnxruntime_graph onnxruntime_session onnxruntime_providers onnxruntime_framework onnxruntime_util onnxruntime_mlas onnxruntime_optimizer onnxruntime_flatbuffers iconv re2 libprotobuf-lite onnx_proto nsync_cpp)
+    endif()
     AddTest(DYN
             TARGET onnxruntime_customopregistration_test
             SOURCES ${onnxruntime_customopregistration_test_SRC} ${onnxruntime_unittest_main_src}
@@ -1608,7 +1618,7 @@ if (NOT CMAKE_SYSTEM_NAME STREQUAL "Emscripten" AND (NOT onnxruntime_MINIMAL_BUI
   if(UNIX)
     if (APPLE)
       set(ONNXRUNTIME_CUSTOM_OP_INVALID_LIB_LINK_FLAG "-Xlinker -dead_strip")
-    else()
+    elseif (NOT ${CMAKE_SYSTEM_NAME} MATCHES "AIX")
       string(CONCAT ONNXRUNTIME_CUSTOM_OP_INVALID_LIB_LINK_FLAG
              "-Xlinker --version-script=${TEST_SRC_DIR}/testdata/custom_op_invalid_library/custom_op_library.lds "
              "-Xlinker --no-undefined -Xlinker --gc-sections -z noexecstack")
@@ -1639,7 +1649,7 @@ if (NOT CMAKE_SYSTEM_NAME STREQUAL "Emscripten" AND (NOT onnxruntime_MINIMAL_BUI
   if(UNIX)
     if (APPLE)
       set(ONNXRUNTIME_CUSTOM_OP_GET_CONST_INPUT_TEST_LIB_LINK_FLAG "-Xlinker -dead_strip")
-    else()
+    elseif(NOT ${CMAKE_SYSTEM_NAME} MATCHES "AIX")
       string(CONCAT ONNXRUNTIME_CUSTOM_OP_GET_CONST_INPUT_TEST_LIB_LINK_FLAG
              "-Xlinker --version-script=${TEST_SRC_DIR}/testdata/custom_op_get_const_input_test_library/custom_op_lib.lds "
              "-Xlinker --no-undefined -Xlinker --gc-sections -z noexecstack")
@@ -1671,7 +1681,7 @@ if (NOT CMAKE_SYSTEM_NAME STREQUAL "Emscripten" AND (NOT onnxruntime_MINIMAL_BUI
   if(UNIX)
     if (APPLE)
       set(ONNXRUNTIME_CUSTOM_OP_lOCAL_FUNCTION_TEST_LIB_LINK_FLAG "-Xlinker -dead_strip")
-    else()
+    elseif(NOT ${CMAKE_SYSTEM_NAME} MATCHES "AIX")
       string(CONCAT ONNXRUNTIME_CUSTOM_OP_lOCAL_FUNCTION_TEST_LIB_LINK_FLAG
              "-Xlinker --version-script=${TEST_SRC_DIR}/testdata/custom_op_local_function/custom_op_local_function.lds "
              "-Xlinker --no-undefined -Xlinker --gc-sections -z noexecstack")
@@ -1690,6 +1700,9 @@ if (onnxruntime_BUILD_SHARED_LIB AND NOT CMAKE_SYSTEM_NAME STREQUAL "Emscripten"
        ${ONNXRUNTIME_LOGGING_APIS_TEST_SRC_DIR}/test_logging_apis.cc)
 
   set(onnxruntime_logging_apis_test_LIBS onnxruntime_common onnxruntime_test_utils)
+  if (${CMAKE_SYSTEM_NAME} MATCHES "AIX")
+    list(APPEND onnxruntime_logging_apis_test_LIBS onnxruntime_session onnxruntime_util onnxruntime_framework onnxruntime_common onnxruntime_graph  onnxruntime_providers onnxruntime_mlas onnxruntime_optimizer onnxruntime_flatbuffers iconv re2 libprotobuf-lite onnx_proto nsync_cpp)
+     endif()
 
   if(NOT WIN32)
     list(APPEND onnxruntime_logging_apis_test_LIBS nsync::nsync_cpp ${CMAKE_DL_LIBS})
@@ -1753,7 +1766,9 @@ if (NOT onnxruntime_MINIMAL_BUILD AND NOT onnxruntime_EXTENDED_MINIMAL_BUILD
   if(APPLE)
     set_property(TARGET test_execution_provider APPEND_STRING PROPERTY LINK_FLAGS "-Xlinker -exported_symbols_list ${REPO_ROOT}/onnxruntime/test/testdata/custom_execution_provider_library/exported_symbols.lst")
   elseif(UNIX)
-    set_property(TARGET test_execution_provider APPEND_STRING PROPERTY LINK_FLAGS "-Xlinker --version-script=${REPO_ROOT}/onnxruntime/test/testdata/custom_execution_provider_library/version_script.lds -Xlinker --gc-sections -Xlinker -rpath=\\$ORIGIN")
+    if (NOT ${CMAKE_SYSTEM_NAME} MATCHES "AIX")
+      set_property(TARGET test_execution_provider APPEND_STRING PROPERTY LINK_FLAGS "-Xlinker --version-script=${REPO_ROOT}/onnxruntime/test/testdata/custom_execution_provider_library/version_script.lds -Xlinker --gc-sections -Xlinker -rpath=\\$ORIGIN")
+     endif()
   elseif(WIN32)
     set_property(TARGET test_execution_provider APPEND_STRING PROPERTY LINK_FLAGS "-DEF:${REPO_ROOT}/onnxruntime/test/testdata/custom_execution_provider_library/symbols.def")
   else()
diff --git a/cmake/patches/flatbuffers/flatbuffers.patch b/cmake/patches/flatbuffers/flatbuffers.patch
index fbe8db37ecb0e..9fb58e301bba8 100644
--- a/cmake/patches/flatbuffers/flatbuffers.patch
+++ b/cmake/patches/flatbuffers/flatbuffers.patch
@@ -10,3 +10,21 @@ index 3987eac9..5e5462f1 100644
 +  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${FLATBUFFERS_CXX_FLAGS} -Wno-error=stringop-overflow")
  endif()
  message(STATUS "CMAKE_CXX_FLAGS: ${CMAKE_CXX_FLAGS}")
+diff --git a/include/flatbuffers/flatbuffers.h b/include/flatbuffers/flatbuffers.h
+index bc828a31..3d3effe8 100644
+--- a/include/flatbuffers/flatbuffers.h
++++ b/include/flatbuffers/flatbuffers.h
+@@ -213,7 +213,12 @@ inline const char * const *ElementaryTypeNames() {
+ // We're explicitly defining the signedness since the signedness of integer
+ // bitfields is otherwise implementation-defined and causes warnings on older
+ // GCC compilers.
+-struct TypeCode {
++
++struct
++#if defined(_AIX) && defined(__clang__)
++__attribute__((packed))
++#endif
++TypeCode {
+   // ElementaryType
+   unsigned short base_type : 4;
+   // Either vector (in table) or array (in struct)
diff --git a/onnxruntime/contrib_ops/cpu/murmur_hash3.cc b/onnxruntime/contrib_ops/cpu/murmur_hash3.cc
index ec504d215920f..000c590f32616 100644
--- a/onnxruntime/contrib_ops/cpu/murmur_hash3.cc
+++ b/onnxruntime/contrib_ops/cpu/murmur_hash3.cc
@@ -8,6 +8,8 @@
 /* Modifications Copyright (c) Microsoft. */
 
 #include "contrib_ops/cpu/murmur_hash3.h"
+#include <memory>
+#include <utility>
 
 // Platform-specific functions and macros
 
@@ -60,11 +62,31 @@ inline uint64_t rotl64(uint64_t x, int8_t r) {
 // handle aligned reads, do the conversion here
 
 FORCE_INLINE uint32_t getblock(const uint32_t* p, int i) {
-  return p[i];
+  if constexpr (onnxruntime::endian::native == onnxruntime::endian::little) {
+    return p[i];
+  } else {
+    const uint8_t* c = (const uint8_t*)&p[i];
+    return (uint32_t)c[0] |
+           (uint32_t)c[1] << 8 |
+           (uint32_t)c[2] << 16 |
+           (uint32_t)c[3] << 24;
+  }
 }
 
 FORCE_INLINE uint64_t getblock(const uint64_t* p, int i) {
-  return p[i];
+  if constexpr (onnxruntime::endian::native == onnxruntime::endian::little) {
+    return p[i];
+  } else {
+    const uint8_t* c = (const uint8_t*)&p[i];
+    return (uint64_t)c[0] |
+           (uint64_t)c[1] << 8 |
+           (uint64_t)c[2] << 16 |
+           (uint64_t)c[3] << 24 |
+           (uint64_t)c[4] << 32 |
+           (uint64_t)c[5] << 40 |
+           (uint64_t)c[6] << 48 |
+           (uint64_t)c[7] << 56;
+  }
 }
 
 //-----------------------------------------------------------------------------
@@ -204,13 +226,35 @@ Status MurmurHash3::Compute(OpKernelContext* ctx) const {
     int input_num_bytes = static_cast<int>(input_element_bytes);
     ORT_ENFORCE(input_num_bytes % 4 == 0);
     const auto input_end = input + input_count * input_num_bytes;
-    while (input != input_end) {
-      MurmurHash3_x86_32(input,
-                         input_num_bytes,
-                         seed_,
-                         output);
-      input += input_num_bytes;
-      ++output;
+
+    if constexpr (onnxruntime::endian::native == onnxruntime::endian::little) {
+      while (input != input_end) {
+        MurmurHash3_x86_32(input,
+                           input_num_bytes,
+                           seed_,
+                           output);
+        input += input_num_bytes;
+        ++output;
+      }
+    } else {
+      // Big endian platform require byte swapping.
+      auto raw_data = std::make_unique<char[]>(input_num_bytes);
+      char* raw_data_ptr = raw_data.get();
+      while (input != input_end) {
+        memcpy(raw_data_ptr, input, input_num_bytes);
+        char* start_byte = raw_data_ptr;
+        char* end_byte = start_byte + input_num_bytes - 1;
+        for (size_t count = 0; count < static_cast<size_t>(input_num_bytes / 2); ++count) {
+          std::swap(*start_byte++, *end_byte--);
+        }
+
+        MurmurHash3_x86_32(raw_data_ptr,
+                           input_num_bytes,
+                           seed_,
+                           output);
+        input += input_num_bytes;
+        ++output;
+      }
     }
   }
   return Status::OK();
diff --git a/onnxruntime/contrib_ops/cpu/quantization/matmul_nbits_impl.cc b/onnxruntime/contrib_ops/cpu/quantization/matmul_nbits_impl.cc
index 7e343d85f4048..b28f3758f89b5 100644
--- a/onnxruntime/contrib_ops/cpu/quantization/matmul_nbits_impl.cc
+++ b/onnxruntime/contrib_ops/cpu/quantization/matmul_nbits_impl.cc
@@ -40,6 +40,13 @@ void Dequantize4BitsKernelReOrder(
   }
   T* output_i = output + out_y * out_cols + out_x;
   uint32_t quant_value = *(reinterpret_cast<const uint32_t*>(quant_data + element_offset / 2));
+  if constexpr (onnxruntime::endian::native == onnxruntime::endian::big) {
+    const uint8_t* c = (const uint8_t*)(&quant_value);
+    quant_value = (uint32_t)c[0] |
+                  (uint32_t)c[1] << 8 |
+                  (uint32_t)c[2] << 16 |
+                  (uint32_t)c[3] << 24;
+  }
   const int remain_x = std::min(8, out_cols - out_x);
   const int32_t* reorder_idx_with_off = reorder_idx + kb_idx * block_size + ((threadIdx_x * 8) & (block_size - 1));
   for (int i = 0; i < remain_x; i++) {
diff --git a/onnxruntime/core/framework/tensorprotoutils.cc b/onnxruntime/core/framework/tensorprotoutils.cc
index e8086877a9159..4ecd61962d797 100644
--- a/onnxruntime/core/framework/tensorprotoutils.cc
+++ b/onnxruntime/core/framework/tensorprotoutils.cc
@@ -6,6 +6,7 @@
 #include <memory>
 #include <algorithm>
 #include <limits>
+#include <string>
 #include <filesystem>
 #if defined(__wasm__)
 #include <emscripten.h>
@@ -260,7 +261,89 @@ Status TensorProtoToOrtValueImpl(const Env& env, const std::filesystem::path& mo
 
 namespace utils {
 
+void SetRawDataInTensorProto(ONNX_NAMESPACE::TensorProto& tensor_proto, std::string&& param) {
+  tensor_proto.set_raw_data(std::move(param));
+}
+
+void ConvertRawDataInTensorProto(TensorProto* tensor) {
+  size_t element_size = 1;
+  char* bytes = NULL;
+  size_t num_elements = 0;
+  switch (tensor->data_type()) {
+    case TensorProto_DataType_FLOAT:
+      bytes = reinterpret_cast<char*>(tensor->mutable_float_data()->mutable_data());
+      num_elements = tensor->float_data_size();
+      element_size = sizeof(float);
+      break;
+
+    case TensorProto_DataType_INT32:
+      bytes = reinterpret_cast<char*>(tensor->mutable_int32_data()->mutable_data());
+      num_elements = tensor->int32_data_size();
+      element_size = sizeof(int32_t);
+      break;
+
+    case TensorProto_DataType_UINT32:
+      bytes = reinterpret_cast<char*>(tensor->mutable_int32_data()->mutable_data());
+      num_elements = tensor->int32_data_size();
+      element_size = sizeof(uint32_t);
+      break;
+
+    case TensorProto_DataType_UINT8:
+    case TensorProto_DataType_INT8:
+      bytes = reinterpret_cast<char*>(tensor->mutable_int32_data()->mutable_data());
+      num_elements = tensor->int32_data_size();
+      element_size = sizeof(uint8_t);
+      break;
+
+    case TensorProto_DataType_UINT16:
+    case TensorProto_DataType_INT16:
+    case TensorProto_DataType_FLOAT16:
+    case TensorProto_DataType_BFLOAT16:
+      bytes = reinterpret_cast<char*>(tensor->mutable_int32_data()->mutable_data());
+      num_elements = tensor->int32_data_size();
+      element_size = sizeof(uint16_t);
+      break;
+
+    case TensorProto_DataType_UINT64:
+      bytes = reinterpret_cast<char*>(tensor->mutable_uint64_data()->mutable_data());
+      num_elements = tensor->uint64_data_size();
+      element_size = sizeof(uint64_t);
+      break;
+
+    case TensorProto_DataType_DOUBLE:
+      bytes = reinterpret_cast<char*>(tensor->mutable_double_data()->mutable_data());
+      num_elements = tensor->double_data_size();
+      element_size = sizeof(double);
+      break;
+
+    case TensorProto_DataType_INT64:
+      bytes = reinterpret_cast<char*>(tensor->mutable_int64_data()->mutable_data());
+      num_elements = tensor->int64_data_size();
+      element_size = sizeof(int64_t);
+      break;
+
+    case TensorProto_DataType_COMPLEX64:
+      bytes = reinterpret_cast<char*>(tensor->mutable_float_data()->mutable_data());
+      num_elements = tensor->float_data_size();
+      element_size = sizeof(float);
+      break;
+  }
+  if (tensor->has_raw_data()) {
+    num_elements = (tensor->raw_data().size()) / element_size;
+    bytes = const_cast<char*>(tensor->mutable_raw_data()->c_str());
+  }
+  for (size_t i = 0; i < num_elements; ++i) {
+    char* start_byte = bytes + i * element_size;
+    char* end_byte = start_byte + element_size - 1;
+    for (size_t count = 0; count < element_size / 2; ++count) {
+      std::swap(*start_byte++, *end_byte--);
+    }
+  }
+  return;
+}
+
 #if !defined(ORT_MINIMAL_BUILD)
+
 static Status UnpackTensorWithExternalDataImpl(const ONNX_NAMESPACE::TensorProto& tensor,
                                                const std::filesystem::path& tensor_proto_dir,
                                                size_t expected_num_elements, size_t element_size,
@@ -1159,11 +1242,6 @@ ONNXTensorElementDataType GetTensorElementType(const ONNX_NAMESPACE::TensorProto
 }
 
 ONNX_NAMESPACE::TensorProto TensorToTensorProto(const Tensor& tensor, const std::string& tensor_proto_name) {
-  // Given we are using the raw_data field in the protobuf, this will work only for little-endian format.
-  if constexpr (endian::native != endian::little) {
-    ORT_THROW("Big endian not supported");
-  }
-
   // Set name, dimensions, type, and data of the TensorProto.
   ONNX_NAMESPACE::TensorProto tensor_proto;
 
@@ -1182,7 +1260,7 @@ ONNX_NAMESPACE::TensorProto TensorToTensorProto(const Tensor& tensor, const std:
       *mutable_string_data->Add() = *f;
     }
   } else {
-    tensor_proto.set_raw_data(tensor.DataRaw(), tensor.SizeInBytes());
+    utils::SetRawDataInTensorProto(tensor_proto, tensor.DataRaw(), tensor.SizeInBytes());
   }
 
   return tensor_proto;
@@ -1464,8 +1542,7 @@ common::Status SparseTensorProtoToDenseTensorProto(const ONNX_NAMESPACE::SparseT
 
       ORT_RETURN_IF_ERROR(status);
     }
-    dense.set_raw_data(std::move(dense_data_storage));
-
+    utils::SetRawDataInTensorProto(dense, std::move(dense_data_storage));
   } else {
     // No request for std::string
     status = ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "Unsupported sparse tensor data type of ",
@@ -1510,7 +1587,17 @@ static void SetIndices(gsl::span<int64_t> gathered_indices, std::string& raw_ind
     } else {
       auto* dst = ind_dest + dest_index;
       T v = static_cast<T>(src_index);
-      memcpy(dst, &v, sizeof(T));
+      if constexpr (endian::native != endian::little) {
+        auto src = gsl::make_span<const unsigned char>(static_cast<const unsigned char*>(
+                                                           reinterpret_cast<const unsigned char*>(&v)),
+                                                       sizeof(T));
+        auto dest = gsl::make_span<unsigned char>(static_cast<unsigned char*>(
+                                                      reinterpret_cast<unsigned char*>(dst)),
+                                                  sizeof(T));
+        onnxruntime::utils::SwapByteOrderCopy(sizeof(T), src, dest);
+      } else {
+        memcpy(dst, &v, sizeof(T));
+      }
     }
     ++dest_index;
   }
@@ -1561,7 +1648,7 @@ static void SparsifyGeneric(const void* dense_raw_data, size_t n_dense_elements,
     }
   } else {
     indices.set_data_type(ONNX_NAMESPACE::TensorProto_DataType_INT8);
-    indices.set_raw_data(std::string());
+    utils::SetRawDataInTensorProto(indices, std::string());
   }
   nnz = gathered_indices.size();
 }
diff --git a/onnxruntime/core/framework/tensorprotoutils.h b/onnxruntime/core/framework/tensorprotoutils.h
index a66caf1ace33b..aabfc0487f3e0 100644
--- a/onnxruntime/core/framework/tensorprotoutils.h
+++ b/onnxruntime/core/framework/tensorprotoutils.h
@@ -5,6 +5,7 @@
 
 #include <vector>
 #include <type_traits>
+#include <string>
 #include <filesystem>
 
 #ifndef SHARED_PROVIDER
@@ -19,6 +20,46 @@
 #include "core/graph/onnx_protobuf.h"
 #include "core/platform/env.h"
 
+namespace onnxruntime {
+namespace utils {
+/**
+ * This function is used to convert the endianess of Tensor data.
+ * Mostly, will be used in big endian system to support the model file
+ * generated on little endian system.
+ * @param initializer       given initializer tensor
+ * @returns                 None
+ */
+void ConvertRawDataInTensorProto(ONNX_NAMESPACE::TensorProto* initializer);
+
+/**
+ * Wrapper function for set_raw_data.
+ * First calls the set_raw_data and then calls ConvertRawDataInTensorProto
+ * under big endian system.
+ * @param tensor_proto given initializer tensor
+ * @param raw_data     source raw_data pointer
+ * @param raw_data_len  length of raw_data
+ * @returns                 None
+ */
+template <typename T1, typename T2>
+void SetRawDataInTensorProto(ONNX_NAMESPACE::TensorProto& tensor_proto, T1* raw_data, T2 raw_data_len) {
+  using namespace ONNX_NAMESPACE;
+  tensor_proto.set_raw_data(raw_data, raw_data_len);
+  if constexpr (endian::native != endian::little) {
+    utils::ConvertRawDataInTensorProto((ONNX_NAMESPACE::TensorProto*)&tensor_proto);
+  }
+}
+
+/**
+ * Overload Wrapper function for set_raw_data handling string object.
+ * Forward the string object to set_raw_data.
+ * @param tensor_proto given initializer tensor
+ * @param param   string object reference
+ * @returns                 None
+ */
+void SetRawDataInTensorProto(ONNX_NAMESPACE::TensorProto& tensor_proto, std::string&& param);
+}  // namespace utils
+}  // namespace onnxruntime
+
 namespace ONNX_NAMESPACE {
 class TensorProto;
 class TensorShapeProto;
diff --git a/onnxruntime/core/graph/graph.cc b/onnxruntime/core/graph/graph.cc
index f73a50db7aaa4..442a0db933d65 100644
--- a/onnxruntime/core/graph/graph.cc
+++ b/onnxruntime/core/graph/graph.cc
@@ -1199,6 +1199,15 @@ Graph::Graph(const Model& owning_model,
 
     const gsl::not_null<TensorProto*> tensor{graph_proto_->add_initializer()};
     auto status = utils::ConstantNodeProtoToTensorProto(node, model_path, *tensor);
+    if constexpr (endian::native != endian::little) {
+      const AttributeProto& attrib = node.attribute(0);
+      if (attrib.type() == AttributeProto_AttributeType_SPARSE_TENSOR) {
+        const TensorProto& sparse_values = node.attribute(0).sparse_tensor().values();
+        if ((!(sparse_values.has_raw_data())) && tensor->has_raw_data()) {
+          onnxruntime::utils::ConvertRawDataInTensorProto(tensor);
+        }
+      }
+    }
     ORT_ENFORCE(status.IsOK(), status.ToString());
     // Ensure initializers are also graph inputs.
     if (ir_version_ < 4) {
@@ -3716,6 +3725,12 @@ SaveInputsOutputsToOrtFormat(flatbuffers::FlatBufferBuilder& builder, const std:
 
 common::Status Graph::SaveToOrtFormat(flatbuffers::FlatBufferBuilder& builder,
                                       flatbuffers::Offset<fbs::Graph>& fbs_graph) const {
+  if constexpr (endian::native != endian::little) {
+    auto& tens = GetAllInitializedTensors();
+    for (auto& [name, tensor_p] : tens) {
+      utils::ConvertRawDataInTensorProto(const_cast<TensorProto*>(tensor_p));
+    }
+  }
   auto inputs = SaveInputsOutputsToOrtFormat(builder, graph_inputs_including_initializers_);
   auto outputs = SaveInputsOutputsToOrtFormat(builder, graph_outputs_);
 
diff --git a/onnxruntime/core/mlas/lib/platform.cpp b/onnxruntime/core/mlas/lib/platform.cpp
index 72eb35c894094..859b7c2f560a4 100644
--- a/onnxruntime/core/mlas/lib/platform.cpp
+++ b/onnxruntime/core/mlas/lib/platform.cpp
@@ -20,8 +20,15 @@ Module Name:
 #include <thread>
 #include <mutex>
 
-#if defined(MLAS_TARGET_POWER) && defined(__linux__)
+#if defined(MLAS_TARGET_POWER) 
+#if defined(__linux__)
 #include <sys/auxv.h>
+#elif defined(_AIX)
+#define POWER_10       0x40000
+#define POWER_10_ANDUP (POWER_10)
+#include <sys/systemcfg.h>
+#define __power_10_andup() (_system_configuration.implementation & POWER_10_ANDUP)
+#endif
 #endif
 
 #if defined(MLAS_TARGET_ARM64)
@@ -554,6 +561,9 @@ Return Value:
     unsigned long hwcap2 = getauxval(AT_HWCAP2);
 
     bool HasP9Instructions = hwcap2 & PPC_FEATURE2_ARCH_3_00;
+#elif defined(_AIX)
+    bool HasP9Instructions = __power_9_andup();
+#endif // __linux__
     if (HasP9Instructions) {
         this->QuantizeLinearS8Kernel = MlasQuantizeLinearS8KernelVSX;
         this->QuantizeLinearU8Kernel = MlasQuantizeLinearU8KernelVSX;
@@ -562,7 +572,11 @@ Return Value:
 #if defined(POWER10)
 #if (defined(__GNUC__) && ((__GNUC__ > 10) || (__GNUC__== 10 && __GNUC_MINOR__ >= 2))) || \
     (defined(__clang__) && (__clang_major__ >= 12))
+#if defined(__linux__)
     bool HasP10Instructions = ((hwcap2 & PPC_FEATURE2_MMA) && (hwcap2 & PPC_FEATURE2_ARCH_3_1));
+#elif defined(_AIX)
+    bool HasP10Instructions = (__power_10_andup() && __power_mma_version() == MMA_V31);
+#endif // __linux__
     if (HasP10Instructions) {
         this->GemmFloatKernel = MlasSgemmKernelPOWER10;
         this->GemmDoubleKernel = MlasDgemmKernelPOWER10;
@@ -571,7 +585,6 @@ Return Value:
 #endif
 #endif
 
-#endif // __linux__
 #endif // MLAS_TARGET_POWER
 
 #if defined(MLAS_TARGET_LARCH64)
@@ -676,7 +689,6 @@ MlasPlatformU8S8Overflow(
 }
 
 #endif
-
 thread_local size_t ThreadedBufSize = 0;
 #ifdef _MSC_VER
 thread_local std::unique_ptr<uint8_t, decltype(&_aligned_free)> ThreadedBufHolder(nullptr, &_aligned_free);
diff --git a/onnxruntime/core/mlas/lib/power/qgemm_kernel_power10.cpp b/onnxruntime/core/mlas/lib/power/qgemm_kernel_power10.cpp
index a67be1dbfa710..0f3bc1d579711 100644
--- a/onnxruntime/core/mlas/lib/power/qgemm_kernel_power10.cpp
+++ b/onnxruntime/core/mlas/lib/power/qgemm_kernel_power10.cpp
@@ -874,10 +874,18 @@ MlasQgemmStoreVectorMMA
 {
     size_t RowCount;
     __vector signed int vsum0, vsum1, vsum2, vsum3;
+#if defined(_AIX) && defined(__clang__)
+    __vector signed int columnsum = *reinterpret_cast<const __vector int *>(&ColumnSumBuffer[pos]);
+#else
     __vector signed int columnsum = *reinterpret_cast<const __vector int32_t *>(&ColumnSumBuffer[pos]);
+#endif
     C += VectorCount;
     if (ZeroPointB != nullptr) {
+#if defined(_AIX) && defined(__clang__)
+        __vector signed int zeropoint = *reinterpret_cast<const __vector int *>(&ZeroPointB[pos]);
+#else
         __vector signed int zeropoint = *reinterpret_cast<const __vector int32_t *>(&ZeroPointB[pos]);
+#endif
         if (ZeroMode) {
             for (RowCount = 0; RowCount + 4 <= row; RowCount += 4, C += ldc*4) {
                 vsum0 = vec_splats(RowSumBuffer[RowCount + 0]) * zeropoint + columnsum;
diff --git a/onnxruntime/core/mlas/lib/qgemm.h b/onnxruntime/core/mlas/lib/qgemm.h
index 75c17a6b5a177..127aea9029b65 100644
--- a/onnxruntime/core/mlas/lib/qgemm.h
+++ b/onnxruntime/core/mlas/lib/qgemm.h
@@ -894,7 +894,7 @@ MlasGemmQuantGetDispatch(
     if (!AIsSigned) {
         GemmQuantDispatch = &MlasGemmU8X8DispatchWasmSimd;
     }
-#elif defined(MLAS_TARGET_POWER) && defined(__linux__)  && defined(POWER10) && \
+#elif defined(MLAS_TARGET_POWER) && (defined(__linux__)  || defined(_AIX)) && defined(POWER10) && \
     ((defined(__GNUC__) && ((__GNUC__ > 10) || (__GNUC__== 10 && __GNUC_MINOR__ >= 2))) || \
     (defined(__clang__) && (__clang_major__ >= 12)))
     if (GetMlasPlatform().GemmU8X8Dispatch == &MlasGemm8X8DispatchPOWER10) {
diff --git a/onnxruntime/core/mlas/lib/qlmul.cpp b/onnxruntime/core/mlas/lib/qlmul.cpp
index 38818e1190d21..4a6d57db0d211 100644
--- a/onnxruntime/core/mlas/lib/qlmul.cpp
+++ b/onnxruntime/core/mlas/lib/qlmul.cpp
@@ -325,12 +325,20 @@ MlasQLinearMulKernel(
     }
 
     while (N >= 4) {
-        __vector int32_t IntegerAVector {InputA[0], InputA[1], InputA[2], InputA[3]};
+#if defined(_AIX) && defined(__clang__)
+        __vector int IntegerAVector {InputA[0], InputA[1], InputA[2], InputA[3]};
+#else
+        __vector int32_t  IntegerAVector {InputA[0], InputA[1], InputA[2], InputA[3]};
+#endif
         auto IntegerVector = vec_sub(IntegerAVector, ZeroPointAVector);
         auto ValueAVector = vec_mul(ScaleAVector, vec_ctf(IntegerVector, 0));
 
         if (!IsScalarB) {
-            __vector int32_t IntegerBVector {InputB[0], InputB[1], InputB[2], InputB[3]};
+#if defined(_AIX) && defined(__clang__)
+            __vector int  IntegerBVector {InputB[0], InputB[1], InputB[2], InputB[3]};
+#else
+            __vector int32_t  IntegerBVector {InputB[0], InputB[1], InputB[2], InputB[3]};
+#endif
             IntegerVector = vec_sub(IntegerBVector, ZeroPointBVector);
             ValueBVector = vec_mul(ScaleBVector, vec_ctf(IntegerVector, 0));
         }
diff --git a/onnxruntime/core/mlas/lib/sqnbitgemm_kernel_neon_int8.cpp b/onnxruntime/core/mlas/lib/sqnbitgemm_kernel_neon_int8.cpp
index db3b9ee656592..ec5cdbc75220a 100644
--- a/onnxruntime/core/mlas/lib/sqnbitgemm_kernel_neon_int8.cpp
+++ b/onnxruntime/core/mlas/lib/sqnbitgemm_kernel_neon_int8.cpp
@@ -155,7 +155,7 @@ namespace
 
 template <bool HasZeroPoint>
 MLAS_FORCEINLINE void
-SQ4BitGemm_CompInt8_Compute2x2_BlkLen16(
+SQ4BitGemm_CompInt8_Compute4x2_BlkLen16(
     const std::byte* QuantARowPtr,
     const std::byte* QuantBDataColPtr,
     const float* QuantBScaleColPtr,
@@ -177,11 +177,13 @@ SQ4BitGemm_CompInt8_Compute2x2_BlkLen16(
     const float* QuantBScalePtr = QuantBScaleColPtr;
     const std::byte* QuantBZeroPointPtr = QuantBZeroPointColPtr;
 
-    float32x4_t acc00{}, acc01{}, acc10{}, acc11{};
+    float32x4_t acc00{}, acc01{}, acc10{}, acc11{}, acc20{}, acc21{}, acc30{}, acc31{};
 
     for (size_t k_blk_idx = 0; k_blk_idx < BlockCountK; ++k_blk_idx) {
         const std::byte* QuantABlkRow0 = QuantAPtr;
         const std::byte* QuantABlkRow1 = QuantAPtr + StrideQuantA;
+        const std::byte* QuantABlkRow2 = QuantAPtr + StrideQuantA * 2;
+        const std::byte* QuantABlkRow3 = QuantAPtr + StrideQuantA * 3;
 
         const float QuantBScaleCol0 = *QuantBScalePtr;
         const float QuantBScaleCol1 = *(QuantBScalePtr + StrideQuantBScale);
@@ -191,6 +193,10 @@ SQ4BitGemm_CompInt8_Compute2x2_BlkLen16(
         const float scale01 = Q8BlkScale(QuantABlkRow0) * QuantBScaleCol1;
         const float scale10 = Q8BlkScale(QuantABlkRow1) * QuantBScaleCol0;
         const float scale11 = Q8BlkScale(QuantABlkRow1) * QuantBScaleCol1;
+        const float scale20 = Q8BlkScale(QuantABlkRow2) * QuantBScaleCol0;
+        const float scale21 = Q8BlkScale(QuantABlkRow2) * QuantBScaleCol1;
+        const float scale30 = Q8BlkScale(QuantABlkRow3) * QuantBScaleCol0;
+        const float scale31 = Q8BlkScale(QuantABlkRow3) * QuantBScaleCol1;
 
         // load B zero point
         int8_t bzp_col0;
@@ -212,13 +218,11 @@ SQ4BitGemm_CompInt8_Compute2x2_BlkLen16(
 
         const int8_t* QuantADataPtrRow0 = Q8BlkData(QuantABlkRow0);
         const int8_t* QuantADataPtrRow1 = Q8BlkData(QuantABlkRow1);
+        const int8_t* QuantADataPtrRow2 = Q8BlkData(QuantABlkRow2);
+        const int8_t* QuantADataPtrRow3 = Q8BlkData(QuantABlkRow3);
 
         // TODO handling only 16 elements per accumulator at a time here, probably can do better
         {
-            // load A
-            const int8x16_t av_row0 = vld1q_s8(QuantADataPtrRow0 + 0);
-            const int8x16_t av_row1 = vld1q_s8(QuantADataPtrRow1 + 0);
-
             // load B
             const uint8x8_t bv_packed_col0 = vld1_u8(reinterpret_cast<const uint8_t*>(QuantBDataPtr));
             const uint8x8_t bv_packed_col1 = vld1_u8(reinterpret_cast<const uint8_t*>(QuantBDataPtr) + StrideQuantBData);
@@ -242,24 +246,55 @@ SQ4BitGemm_CompInt8_Compute2x2_BlkLen16(
             bv_col0 = vsubq_s8(bv_col0, vdupq_n_s8(bzp_col0));
             bv_col1 = vsubq_s8(bv_col1, vdupq_n_s8(bzp_col1));
 
-            // quantized dot product
-            int32x4_t dot00{}, dot01{}, dot10{}, dot11{};
-            dot00 = vdotq_s32(dot00, av_row0, bv_col0);
-            dot01 = vdotq_s32(dot01, av_row0, bv_col1);
-            dot10 = vdotq_s32(dot10, av_row1, bv_col0);
-            dot11 = vdotq_s32(dot11, av_row1, bv_col1);
-
-            // convert to float
-            const float32x4_t dot_f32_00 = vcvtq_f32_s32(dot00);
-            const float32x4_t dot_f32_01 = vcvtq_f32_s32(dot01);
-            const float32x4_t dot_f32_10 = vcvtq_f32_s32(dot10);
-            const float32x4_t dot_f32_11 = vcvtq_f32_s32(dot11);
+            // rows 0 and 1 of A
+            {
+                // load A
+                const int8x16_t av_row0 = vld1q_s8(QuantADataPtrRow0 + 0);
+                const int8x16_t av_row1 = vld1q_s8(QuantADataPtrRow1 + 0);
+
+                // quantized dot product
+                const int32x4_t dot00 = vdotq_s32(int32x4_t{}, av_row0, bv_col0);
+                const int32x4_t dot01 = vdotq_s32(int32x4_t{}, av_row0, bv_col1);
+                const int32x4_t dot10 = vdotq_s32(int32x4_t{}, av_row1, bv_col0);
+                const int32x4_t dot11 = vdotq_s32(int32x4_t{}, av_row1, bv_col1);
+
+                // convert to float
+                const float32x4_t dot_f32_00 = vcvtq_f32_s32(dot00);
+                const float32x4_t dot_f32_01 = vcvtq_f32_s32(dot01);
+                const float32x4_t dot_f32_10 = vcvtq_f32_s32(dot10);
+                const float32x4_t dot_f32_11 = vcvtq_f32_s32(dot11);
+
+                // multiply by scale and update accumulator
+                acc00 = vfmaq_f32(acc00, dot_f32_00, vdupq_n_f32(scale00));
+                acc01 = vfmaq_f32(acc01, dot_f32_01, vdupq_n_f32(scale01));
+                acc10 = vfmaq_f32(acc10, dot_f32_10, vdupq_n_f32(scale10));
+                acc11 = vfmaq_f32(acc11, dot_f32_11, vdupq_n_f32(scale11));
+            }
 
-            // multiply by scale and update accumulator
-            acc00 = vfmaq_f32(acc00, dot_f32_00, vdupq_n_f32(scale00));
-            acc01 = vfmaq_f32(acc01, dot_f32_01, vdupq_n_f32(scale01));
-            acc10 = vfmaq_f32(acc10, dot_f32_10, vdupq_n_f32(scale10));
-            acc11 = vfmaq_f32(acc11, dot_f32_11, vdupq_n_f32(scale11));
+            // rows 2 and 3 of A
+            {
+                // load A
+                const int8x16_t av_row2 = vld1q_s8(QuantADataPtrRow2 + 0);
+                const int8x16_t av_row3 = vld1q_s8(QuantADataPtrRow3 + 0);
+
+                // quantized dot product
+                const int32x4_t dot20 = vdotq_s32(int32x4_t{}, av_row2, bv_col0);
+                const int32x4_t dot21 = vdotq_s32(int32x4_t{}, av_row2, bv_col1);
+                const int32x4_t dot30 = vdotq_s32(int32x4_t{}, av_row3, bv_col0);
+                const int32x4_t dot31 = vdotq_s32(int32x4_t{}, av_row3, bv_col1);
+
+                // convert to float
+                const float32x4_t dot_f32_20 = vcvtq_f32_s32(dot20);
+                const float32x4_t dot_f32_21 = vcvtq_f32_s32(dot21);
+                const float32x4_t dot_f32_30 = vcvtq_f32_s32(dot30);
+                const float32x4_t dot_f32_31 = vcvtq_f32_s32(dot31);
+
+                // multiply by scale and update accumulator
+                acc20 = vfmaq_f32(acc20, dot_f32_20, vdupq_n_f32(scale20));
+                acc21 = vfmaq_f32(acc21, dot_f32_21, vdupq_n_f32(scale21));
+                acc30 = vfmaq_f32(acc30, dot_f32_30, vdupq_n_f32(scale30));
+                acc31 = vfmaq_f32(acc31, dot_f32_31, vdupq_n_f32(scale31));
+            }
         }
 
         // increment block pointers
@@ -273,22 +308,30 @@ SQ4BitGemm_CompInt8_Compute2x2_BlkLen16(
         }
     }
 
-    SumPtr[0] = vaddvq_f32(acc00);
-    SumPtr[1] = vaddvq_f32(acc01);
-    SumPtr[ldc + 0] = vaddvq_f32(acc10);
-    SumPtr[ldc + 1] = vaddvq_f32(acc11);
+    SumPtr[ldc * 0 + 0] = vaddvq_f32(acc00);
+    SumPtr[ldc * 0 + 1] = vaddvq_f32(acc01);
+    SumPtr[ldc * 1 + 0] = vaddvq_f32(acc10);
+    SumPtr[ldc * 1 + 1] = vaddvq_f32(acc11);
+    SumPtr[ldc * 2 + 0] = vaddvq_f32(acc20);
+    SumPtr[ldc * 2 + 1] = vaddvq_f32(acc21);
+    SumPtr[ldc * 3 + 0] = vaddvq_f32(acc30);
+    SumPtr[ldc * 3 + 1] = vaddvq_f32(acc31);
 
     if (BiasPtr != nullptr) {
-        SumPtr[0] += BiasPtr[0];
-        SumPtr[1] += BiasPtr[1];
-        SumPtr[ldc + 0] += BiasPtr[0];
-        SumPtr[ldc + 1] += BiasPtr[1];
+        SumPtr[ldc * 0 + 0] += BiasPtr[0];
+        SumPtr[ldc * 0 + 1] += BiasPtr[1];
+        SumPtr[ldc * 1 + 0] += BiasPtr[0];
+        SumPtr[ldc * 1 + 1] += BiasPtr[1];
+        SumPtr[ldc * 2 + 0] += BiasPtr[0];
+        SumPtr[ldc * 2 + 1] += BiasPtr[1];
+        SumPtr[ldc * 3 + 0] += BiasPtr[0];
+        SumPtr[ldc * 3 + 1] += BiasPtr[1];
     }
 }
 
 template <bool HasZeroPoint>
 MLAS_FORCEINLINE void
-SQ4BitGemm_CompInt8_Compute2x2_BlkLenGreaterThan16(
+SQ4BitGemm_CompInt8_Compute4x2_BlkLenGreaterThan16(
     size_t BlkLen,
     const std::byte* QuantARowPtr,
     const std::byte* QuantBDataColPtr,
@@ -312,11 +355,13 @@ SQ4BitGemm_CompInt8_Compute2x2_BlkLenGreaterThan16(
     const float* QuantBScalePtr = QuantBScaleColPtr;
     const std::byte* QuantBZeroPointPtr = QuantBZeroPointColPtr;
 
-    float32x4_t acc00{}, acc01{}, acc10{}, acc11{};
+    float32x4_t acc00{}, acc01{}, acc10{}, acc11{}, acc20{}, acc21{}, acc30{}, acc31{};
 
     for (size_t k_blk_idx = 0; k_blk_idx < BlockCountK; ++k_blk_idx) {
         const std::byte* QuantABlkRow0 = QuantAPtr;
         const std::byte* QuantABlkRow1 = QuantAPtr + StrideQuantA;
+        const std::byte* QuantABlkRow2 = QuantAPtr + StrideQuantA * 2;
+        const std::byte* QuantABlkRow3 = QuantAPtr + StrideQuantA * 3;
 
         const float QuantBScaleCol0 = *QuantBScalePtr;
         const float QuantBScaleCol1 = *(QuantBScalePtr + StrideQuantBScale);
@@ -326,6 +371,10 @@ SQ4BitGemm_CompInt8_Compute2x2_BlkLenGreaterThan16(
         const float scale01 = Q8BlkScale(QuantABlkRow0) * QuantBScaleCol1;
         const float scale10 = Q8BlkScale(QuantABlkRow1) * QuantBScaleCol0;
         const float scale11 = Q8BlkScale(QuantABlkRow1) * QuantBScaleCol1;
+        const float scale20 = Q8BlkScale(QuantABlkRow2) * QuantBScaleCol0;
+        const float scale21 = Q8BlkScale(QuantABlkRow2) * QuantBScaleCol1;
+        const float scale30 = Q8BlkScale(QuantABlkRow3) * QuantBScaleCol0;
+        const float scale31 = Q8BlkScale(QuantABlkRow3) * QuantBScaleCol1;
 
         // load B zero point
         int8_t bzp_col0;
@@ -347,14 +396,10 @@ SQ4BitGemm_CompInt8_Compute2x2_BlkLenGreaterThan16(
 
         const int8_t* QuantADataPtrRow0 = Q8BlkData(QuantABlkRow0);
         const int8_t* QuantADataPtrRow1 = Q8BlkData(QuantABlkRow1);
+        const int8_t* QuantADataPtrRow2 = Q8BlkData(QuantABlkRow2);
+        const int8_t* QuantADataPtrRow3 = Q8BlkData(QuantABlkRow3);
 
         for (size_t sub_blk_idx = 0; sub_blk_idx < SubBlksPerBlk; ++sub_blk_idx) {
-            // load A
-            const int8x16_t av_row0_0 = vld1q_s8(QuantADataPtrRow0 + 0);
-            const int8x16_t av_row0_1 = vld1q_s8(QuantADataPtrRow0 + 16);
-            const int8x16_t av_row1_0 = vld1q_s8(QuantADataPtrRow1 + 0);
-            const int8x16_t av_row1_1 = vld1q_s8(QuantADataPtrRow1 + 16);
-
             // load B
             const uint8x16_t bv_packed_col0 = vld1q_u8(reinterpret_cast<const uint8_t*>(QuantBDataPtr));
             const uint8x16_t bv_packed_col1 = vld1q_u8(reinterpret_cast<const uint8_t*>(QuantBDataPtr) + StrideQuantBData);
@@ -372,28 +417,65 @@ SQ4BitGemm_CompInt8_Compute2x2_BlkLenGreaterThan16(
             bv_col1_0 = vsubq_s8(bv_col1_0, vdupq_n_s8(bzp_col1));
             bv_col1_1 = vsubq_s8(bv_col1_1, vdupq_n_s8(bzp_col1));
 
-            // quantized dot product
-            int32x4_t dot00{}, dot01{}, dot10{}, dot11{};
-            dot00 = vdotq_s32(vdotq_s32(dot00, av_row0_0, bv_col0_0), av_row0_1, bv_col0_1);
-            dot01 = vdotq_s32(vdotq_s32(dot01, av_row0_0, bv_col1_0), av_row0_1, bv_col1_1);
-            dot10 = vdotq_s32(vdotq_s32(dot10, av_row1_0, bv_col0_0), av_row1_1, bv_col0_1);
-            dot11 = vdotq_s32(vdotq_s32(dot11, av_row1_0, bv_col1_0), av_row1_1, bv_col1_1);
-
-            // convert to float
-            const float32x4_t dot_f32_00 = vcvtq_f32_s32(dot00);
-            const float32x4_t dot_f32_01 = vcvtq_f32_s32(dot01);
-            const float32x4_t dot_f32_10 = vcvtq_f32_s32(dot10);
-            const float32x4_t dot_f32_11 = vcvtq_f32_s32(dot11);
+            // rows 0 and 1 of A
+            {
+                // load A
+                const int8x16_t av_row0_0 = vld1q_s8(QuantADataPtrRow0 + 0);
+                const int8x16_t av_row0_1 = vld1q_s8(QuantADataPtrRow0 + 16);
+                const int8x16_t av_row1_0 = vld1q_s8(QuantADataPtrRow1 + 0);
+                const int8x16_t av_row1_1 = vld1q_s8(QuantADataPtrRow1 + 16);
+
+                // quantized dot product
+                const int32x4_t dot00 = vdotq_s32(vdotq_s32(int32x4_t{}, av_row0_0, bv_col0_0), av_row0_1, bv_col0_1);
+                const int32x4_t dot01 = vdotq_s32(vdotq_s32(int32x4_t{}, av_row0_0, bv_col1_0), av_row0_1, bv_col1_1);
+                const int32x4_t dot10 = vdotq_s32(vdotq_s32(int32x4_t{}, av_row1_0, bv_col0_0), av_row1_1, bv_col0_1);
+                const int32x4_t dot11 = vdotq_s32(vdotq_s32(int32x4_t{}, av_row1_0, bv_col1_0), av_row1_1, bv_col1_1);
+
+                // convert to float
+                const float32x4_t dot_f32_00 = vcvtq_f32_s32(dot00);
+                const float32x4_t dot_f32_01 = vcvtq_f32_s32(dot01);
+                const float32x4_t dot_f32_10 = vcvtq_f32_s32(dot10);
+                const float32x4_t dot_f32_11 = vcvtq_f32_s32(dot11);
+
+                // multiply by scale and update accumulator
+                acc00 = vfmaq_f32(acc00, dot_f32_00, vdupq_n_f32(scale00));
+                acc01 = vfmaq_f32(acc01, dot_f32_01, vdupq_n_f32(scale01));
+                acc10 = vfmaq_f32(acc10, dot_f32_10, vdupq_n_f32(scale10));
+                acc11 = vfmaq_f32(acc11, dot_f32_11, vdupq_n_f32(scale11));
+            }
 
-            // multiply by scale and update accumulator
-            acc00 = vfmaq_f32(acc00, dot_f32_00, vdupq_n_f32(scale00));
-            acc01 = vfmaq_f32(acc01, dot_f32_01, vdupq_n_f32(scale01));
-            acc10 = vfmaq_f32(acc10, dot_f32_10, vdupq_n_f32(scale10));
-            acc11 = vfmaq_f32(acc11, dot_f32_11, vdupq_n_f32(scale11));
+            // rows 2 and 3 of A
+            {
+                // load A
+                const int8x16_t av_row2_0 = vld1q_s8(QuantADataPtrRow2 + 0);
+                const int8x16_t av_row2_1 = vld1q_s8(QuantADataPtrRow2 + 16);
+                const int8x16_t av_row3_0 = vld1q_s8(QuantADataPtrRow3 + 0);
+                const int8x16_t av_row3_1 = vld1q_s8(QuantADataPtrRow3 + 16);
+
+                // quantized dot product
+                const int32x4_t dot20 = vdotq_s32(vdotq_s32(int32x4_t{}, av_row2_0, bv_col0_0), av_row2_1, bv_col0_1);
+                const int32x4_t dot21 = vdotq_s32(vdotq_s32(int32x4_t{}, av_row2_0, bv_col1_0), av_row2_1, bv_col1_1);
+                const int32x4_t dot30 = vdotq_s32(vdotq_s32(int32x4_t{}, av_row3_0, bv_col0_0), av_row3_1, bv_col0_1);
+                const int32x4_t dot31 = vdotq_s32(vdotq_s32(int32x4_t{}, av_row3_0, bv_col1_0), av_row3_1, bv_col1_1);
+
+                // convert to float
+                const float32x4_t dot_f32_20 = vcvtq_f32_s32(dot20);
+                const float32x4_t dot_f32_21 = vcvtq_f32_s32(dot21);
+                const float32x4_t dot_f32_30 = vcvtq_f32_s32(dot30);
+                const float32x4_t dot_f32_31 = vcvtq_f32_s32(dot31);
+
+                // multiply by scale and update accumulator
+                acc20 = vfmaq_f32(acc20, dot_f32_20, vdupq_n_f32(scale20));
+                acc21 = vfmaq_f32(acc21, dot_f32_21, vdupq_n_f32(scale21));
+                acc30 = vfmaq_f32(acc30, dot_f32_30, vdupq_n_f32(scale30));
+                acc31 = vfmaq_f32(acc31, dot_f32_31, vdupq_n_f32(scale31));
+            }
 
             // increment block data pointers to next sub-block
             QuantADataPtrRow0 += 32;
             QuantADataPtrRow1 += 32;
+            QuantADataPtrRow2 += 32;
+            QuantADataPtrRow3 += 32;
             QuantBDataPtr += 16;
         }
 
@@ -407,16 +489,24 @@ SQ4BitGemm_CompInt8_Compute2x2_BlkLenGreaterThan16(
         }
     }
 
-    SumPtr[0] = vaddvq_f32(acc00);
-    SumPtr[1] = vaddvq_f32(acc01);
-    SumPtr[ldc + 0] = vaddvq_f32(acc10);
-    SumPtr[ldc + 1] = vaddvq_f32(acc11);
+    SumPtr[ldc * 0 + 0] = vaddvq_f32(acc00);
+    SumPtr[ldc * 0 + 1] = vaddvq_f32(acc01);
+    SumPtr[ldc * 1 + 0] = vaddvq_f32(acc10);
+    SumPtr[ldc * 1 + 1] = vaddvq_f32(acc11);
+    SumPtr[ldc * 2 + 0] = vaddvq_f32(acc20);
+    SumPtr[ldc * 2 + 1] = vaddvq_f32(acc21);
+    SumPtr[ldc * 3 + 0] = vaddvq_f32(acc30);
+    SumPtr[ldc * 3 + 1] = vaddvq_f32(acc31);
 
     if (BiasPtr != nullptr) {
-        SumPtr[0] += BiasPtr[0];
-        SumPtr[1] += BiasPtr[1];
-        SumPtr[ldc + 0] += BiasPtr[0];
-        SumPtr[ldc + 1] += BiasPtr[1];
+        SumPtr[ldc * 0 + 0] += BiasPtr[0];
+        SumPtr[ldc * 0 + 1] += BiasPtr[1];
+        SumPtr[ldc * 1 + 0] += BiasPtr[0];
+        SumPtr[ldc * 1 + 1] += BiasPtr[1];
+        SumPtr[ldc * 2 + 0] += BiasPtr[0];
+        SumPtr[ldc * 2 + 1] += BiasPtr[1];
+        SumPtr[ldc * 3 + 0] += BiasPtr[0];
+        SumPtr[ldc * 3 + 1] += BiasPtr[1];
     }
 }
 
@@ -478,8 +568,8 @@ SQ4BitGemm_CompInt8_Compute1x1_BlkLen16(
         bv1 = vsubq_s8(bv1, bzp1);
 
         // quantized dot product
-        const int32x4_t dot0 = vdotq_s32(vdupq_n_s32(0), av0, bv0);
-        const int32x4_t dot1 = vdotq_s32(vdupq_n_s32(0), av1, bv1);
+        const int32x4_t dot0 = vdotq_s32(int32x4_t{}, av0, bv0);
+        const int32x4_t dot1 = vdotq_s32(int32x4_t{}, av1, bv1);
 
         // convert to float
         const float32x4_t dot_f32_0 = vcvtq_f32_s32(dot0);
@@ -527,7 +617,7 @@ SQ4BitGemm_CompInt8_Compute1x1_BlkLen16(
         bv0 = vsubq_s8(bv0, bzp0);
 
         // quantized dot product
-        const int32x4_t dot0 = vdotq_s32(vdupq_n_s32(0), av0, bv0);
+        const int32x4_t dot0 = vdotq_s32(int32x4_t{}, av0, bv0);
 
         // convert to float
         const float32x4_t dot_f32_0 = vcvtq_f32_s32(dot0);
@@ -604,9 +694,8 @@ SQ4BitGemm_CompInt8_Compute1x1_BlkLen32(
         bv_hi1 = vsubq_s8(bv_hi1, bzp1);
 
         // quantized dot product
-        int32x4_t dot0{}, dot1{};
-        dot0 = vdotq_s32(vdotq_s32(dot0, av_lo0, bv_lo0), av_hi0, bv_hi0);
-        dot1 = vdotq_s32(vdotq_s32(dot1, av_lo1, bv_lo1), av_hi1, bv_hi1);
+        const int32x4_t dot0 = vdotq_s32(vdotq_s32(int32x4_t{}, av_lo0, bv_lo0), av_hi0, bv_hi0);
+        const int32x4_t dot1 = vdotq_s32(vdotq_s32(int32x4_t{}, av_lo1, bv_lo1), av_hi1, bv_hi1);
 
         // convert to float
         const float32x4_t dot_f32_0 = vcvtq_f32_s32(dot0);
@@ -652,8 +741,7 @@ SQ4BitGemm_CompInt8_Compute1x1_BlkLen32(
         bv_hi0 = vsubq_s8(bv_hi0, bzp0);
 
         // quantized dot product
-        int32x4_t dot0{};
-        dot0 = vdotq_s32(vdotq_s32(dot0, av_lo0, bv_lo0), av_hi0, bv_hi0);
+        const int32x4_t dot0 = vdotq_s32(vdotq_s32(int32x4_t{}, av_lo0, bv_lo0), av_hi0, bv_hi0);
 
         // convert to float
         const float32x4_t dot_f32_0 = vcvtq_f32_s32(dot0);
@@ -736,9 +824,8 @@ SQ4BitGemm_CompInt8_Compute1x1_BlkLenGreaterThan32(
             bv3 = vsubq_s8(bv3, bzp);
 
             // quantized dot product
-            int32x4_t dot0{}, dot1{};
-            dot0 = vdotq_s32(vdotq_s32(dot0, av0, bv0), av1, bv1);
-            dot1 = vdotq_s32(vdotq_s32(dot1, av2, bv2), av3, bv3);
+            const int32x4_t dot0 = vdotq_s32(vdotq_s32(int32x4_t{}, av0, bv0), av1, bv1);
+            const int32x4_t dot1 = vdotq_s32(vdotq_s32(int32x4_t{}, av2, bv2), av3, bv3);
 
             // convert to float
             const float32x4_t dot_f32_0 = vcvtq_f32_s32(dot0);
@@ -834,7 +921,7 @@ SQ4BitGemmKernel_CompInt8_BlkLen16(
     float* SumRowPtr = C;
 
     size_t m_remaining = CountM;
-    while (m_remaining > 1) {
+    while (m_remaining > 3) {
         const std::byte* QuantBDataColPtr = QuantBData;
         const float* QuantBScaleColPtr = QuantBScale;
         const std::byte* QuantBZeroPointColPtr = QuantBZeroPoint;
@@ -845,8 +932,8 @@ SQ4BitGemmKernel_CompInt8_BlkLen16(
 
         size_t n_remaining = CountN;
         while (n_remaining > 1) {
-            // Compute 2x2 tiles of output
-            SQ4BitGemm_CompInt8_Compute2x2_BlkLen16<HasZeroPoint>(
+            // Compute 4x2 tiles of output
+            SQ4BitGemm_CompInt8_Compute4x2_BlkLen16<HasZeroPoint>(
                 QuantARowPtr,
                 QuantBDataColPtr,
                 QuantBScaleColPtr,
@@ -871,38 +958,30 @@ SQ4BitGemmKernel_CompInt8_BlkLen16(
         }
 
         if (n_remaining > 0) {
-            // Compute last 2x1 tile of output
-            SQ4BitGemm_CompInt8_Compute1x1_BlkLen16<HasZeroPoint>(
-                QuantARowPtr,
-                QuantBDataColPtr,
-                QuantBScaleColPtr,
-                QuantBZeroPointColPtr,
-                BiasPtr,
-                SumPtr,
-                BlockCountK
-            );
-
-            SQ4BitGemm_CompInt8_Compute1x1_BlkLen16<HasZeroPoint>(
-                QuantARowPtr + StrideQuantA,
-                QuantBDataColPtr,
-                QuantBScaleColPtr,
-                QuantBZeroPointColPtr,
-                BiasPtr,
-                SumPtr + ldc,
-                BlockCountK
-            );
+            // Compute last 4x1 tile of output
+            for (size_t i = 0; i < 4; ++i) {
+                SQ4BitGemm_CompInt8_Compute1x1_BlkLen16<HasZeroPoint>(
+                    QuantARowPtr + StrideQuantA * i,
+                    QuantBDataColPtr,
+                    QuantBScaleColPtr,
+                    QuantBZeroPointColPtr,
+                    BiasPtr,
+                    SumPtr + ldc * i,
+                    BlockCountK
+                );
+            }
         }
 
-        // Move to next 2 rows
-        AdvanceRowPtrs<2>(
+        // Move to next 4 rows
+        AdvanceRowPtrs<4>(
             StrideQuantA, ldc,
             QuantARowPtr, SumRowPtr
         );
 
-        m_remaining -= 2;
+        m_remaining -= 4;
     }
 
-    if (m_remaining > 0) {
+    while (m_remaining > 0) {
         const std::byte* QuantBDataColPtr = QuantBData;
         const float* QuantBScaleColPtr = QuantBScale;
         const std::byte* QuantBZeroPointColPtr = QuantBZeroPoint;
@@ -932,6 +1011,14 @@ SQ4BitGemmKernel_CompInt8_BlkLen16(
 
             n_remaining -= 1;
         }
+
+        // Move to next row
+        AdvanceRowPtrs<1>(
+            StrideQuantA, ldc,
+            QuantARowPtr, SumRowPtr
+        );
+
+        m_remaining -= 1;
     }
 }
 
@@ -964,7 +1051,7 @@ SQ4BitGemmKernel_CompInt8_BlkLen32(
     float* SumRowPtr = C;
 
     size_t m_remaining = CountM;
-    while (m_remaining > 1) {
+    while (m_remaining > 3) {
         const std::byte* QuantBDataColPtr = QuantBData;
         const float* QuantBScaleColPtr = QuantBScale;
         const std::byte* QuantBZeroPointColPtr = QuantBZeroPoint;
@@ -975,8 +1062,8 @@ SQ4BitGemmKernel_CompInt8_BlkLen32(
 
         size_t n_remaining = CountN;
         while (n_remaining > 1) {
-            // Compute 2x2 tiles of output
-            SQ4BitGemm_CompInt8_Compute2x2_BlkLenGreaterThan16<HasZeroPoint>(
+            // Compute 4x2 tiles of output
+            SQ4BitGemm_CompInt8_Compute4x2_BlkLenGreaterThan16<HasZeroPoint>(
                 BlkLen,
                 QuantARowPtr,
                 QuantBDataColPtr,
@@ -1002,38 +1089,30 @@ SQ4BitGemmKernel_CompInt8_BlkLen32(
         }
 
         if (n_remaining > 0) {
-            // Compute last 2x1 tile of output
-            SQ4BitGemm_CompInt8_Compute1x1_BlkLen32<HasZeroPoint>(
-                QuantARowPtr,
-                QuantBDataColPtr,
-                QuantBScaleColPtr,
-                QuantBZeroPointColPtr,
-                BiasPtr,
-                SumPtr,
-                BlockCountK
-            );
-
-            SQ4BitGemm_CompInt8_Compute1x1_BlkLen32<HasZeroPoint>(
-                QuantARowPtr + StrideQuantA,
-                QuantBDataColPtr,
-                QuantBScaleColPtr,
-                QuantBZeroPointColPtr,
-                BiasPtr,
-                SumPtr + ldc,
-                BlockCountK
-            );
+            // Compute last 4x1 tile of output
+            for (size_t i = 0; i < 4; ++i) {
+                SQ4BitGemm_CompInt8_Compute1x1_BlkLen32<HasZeroPoint>(
+                    QuantARowPtr + StrideQuantA * i,
+                    QuantBDataColPtr,
+                    QuantBScaleColPtr,
+                    QuantBZeroPointColPtr,
+                    BiasPtr,
+                    SumPtr + ldc * i,
+                    BlockCountK
+                );
+            }
         }
 
-        // Move to next 2 rows
-        AdvanceRowPtrs<2>(
+        // Move to next 4 rows
+        AdvanceRowPtrs<4>(
             StrideQuantA, ldc,
             QuantARowPtr, SumRowPtr
         );
 
-        m_remaining -= 2;
+        m_remaining -= 4;
     }
 
-    if (m_remaining > 0) {
+    while (m_remaining > 0) {
         const std::byte* QuantBDataColPtr = QuantBData;
         const float* QuantBScaleColPtr = QuantBScale;
         const std::byte* QuantBZeroPointColPtr = QuantBZeroPoint;
@@ -1063,6 +1142,14 @@ SQ4BitGemmKernel_CompInt8_BlkLen32(
 
             n_remaining -= 1;
         }
+
+        // Move to next row
+        AdvanceRowPtrs<1>(
+            StrideQuantA, ldc,
+            QuantARowPtr, SumRowPtr
+        );
+
+        m_remaining -= 1;
     }
 }
 
@@ -1095,7 +1182,7 @@ SQ4BitGemmKernel_CompInt8_BlkLenGreaterThan32(
     float* SumRowPtr = C;
 
     size_t m_remaining = CountM;
-    while (m_remaining > 1) {
+    while (m_remaining > 3) {
         const std::byte* QuantBDataColPtr = QuantBData;
         const float* QuantBScaleColPtr = QuantBScale;
         const std::byte* QuantBZeroPointColPtr = QuantBZeroPoint;
@@ -1106,8 +1193,8 @@ SQ4BitGemmKernel_CompInt8_BlkLenGreaterThan32(
 
         size_t n_remaining = CountN;
         while (n_remaining > 1) {
-            // Compute 2x2 tiles of output
-            SQ4BitGemm_CompInt8_Compute2x2_BlkLenGreaterThan16<HasZeroPoint>(
+            // Compute 4x2 tiles of output
+            SQ4BitGemm_CompInt8_Compute4x2_BlkLenGreaterThan16<HasZeroPoint>(
                 BlkLen,
                 QuantARowPtr,
                 QuantBDataColPtr,
@@ -1133,40 +1220,31 @@ SQ4BitGemmKernel_CompInt8_BlkLenGreaterThan32(
         }
 
         if (n_remaining > 0) {
-            // Compute last 2x1 tile of output
-            SQ4BitGemm_CompInt8_Compute1x1_BlkLenGreaterThan32<HasZeroPoint>(
-                BlkLen,
-                QuantARowPtr,
-                QuantBDataColPtr,
-                QuantBScaleColPtr,
-                QuantBZeroPointColPtr,
-                BiasPtr,
-                SumPtr,
-                BlockCountK
-            );
-
-            SQ4BitGemm_CompInt8_Compute1x1_BlkLenGreaterThan32<HasZeroPoint>(
-                BlkLen,
-                QuantARowPtr + StrideQuantA,
-                QuantBDataColPtr,
-                QuantBScaleColPtr,
-                QuantBZeroPointColPtr,
-                BiasPtr,
-                SumPtr + ldc,
-                BlockCountK
-            );
+            // Compute last 4x1 tile of output
+            for (size_t i = 0; i < 4; ++i) {
+                SQ4BitGemm_CompInt8_Compute1x1_BlkLenGreaterThan32<HasZeroPoint>(
+                    BlkLen,
+                    QuantARowPtr + StrideQuantA * i,
+                    QuantBDataColPtr,
+                    QuantBScaleColPtr,
+                    QuantBZeroPointColPtr,
+                    BiasPtr,
+                    SumPtr + ldc * i,
+                    BlockCountK
+                );
+            }
         }
 
-        // Move to next 2 rows
-        AdvanceRowPtrs<2>(
+        // Move to next 4 rows
+        AdvanceRowPtrs<4>(
             StrideQuantA, ldc,
             QuantARowPtr, SumRowPtr
         );
 
-        m_remaining -= 2;
+        m_remaining -= 4;
     }
 
-    if (m_remaining > 0) {
+    while (m_remaining > 0) {
         const std::byte* QuantBDataColPtr = QuantBData;
         const float* QuantBScaleColPtr = QuantBScale;
         const std::byte* QuantBZeroPointColPtr = QuantBZeroPoint;
@@ -1197,6 +1275,14 @@ SQ4BitGemmKernel_CompInt8_BlkLenGreaterThan32(
 
             n_remaining -= 1;
         }
+
+        // Move to next row
+        AdvanceRowPtrs<1>(
+            StrideQuantA, ldc,
+            QuantARowPtr, SumRowPtr
+        );
+
+        m_remaining -= 1;
     }
 }
 
diff --git a/onnxruntime/core/optimizer/attention_fusion.cc b/onnxruntime/core/optimizer/attention_fusion.cc
index b88f2d6a4637e..08066f030a381 100644
--- a/onnxruntime/core/optimizer/attention_fusion.cc
+++ b/onnxruntime/core/optimizer/attention_fusion.cc
@@ -126,7 +126,7 @@ static NodeArg& MergeQkvWeights(Graph& graph, int64_t hidden_size,
     } else {
       MergeWeights<float>(q_weight, k_weight, v_weight, result, hidden_size);
     }
-    initializer.set_raw_data(result.data(), gsl::narrow<size_t>(element_count) * sizeof(float));
+    utils::SetRawDataInTensorProto(initializer, result.data(), gsl::narrow<size_t>(element_count) * sizeof(float));
   } else {  // data_type == ONNX_NAMESPACE::TensorProto_DataType_FLOAT16
     const MLFloat16* q_weight = q_initializer.data<MLFloat16>();
     const MLFloat16* k_weight = k_initializer.data<MLFloat16>();
@@ -138,7 +138,7 @@ static NodeArg& MergeQkvWeights(Graph& graph, int64_t hidden_size,
     } else {
       MergeWeights<MLFloat16>(q_weight, k_weight, v_weight, result, hidden_size);
     }
-    initializer.set_raw_data(result.data(), gsl::narrow<size_t>(element_count) * sizeof(MLFloat16));
+    utils::SetRawDataInTensorProto(initializer, result.data(), gsl::narrow<size_t>(element_count) * sizeof(MLFloat16));
   }
 
   return graph_utils::AddInitializer(graph, initializer);
diff --git a/onnxruntime/core/optimizer/compute_optimizer/shared_utils.cc b/onnxruntime/core/optimizer/compute_optimizer/shared_utils.cc
index 913f3b6811183..86a7a4d6afbf8 100644
--- a/onnxruntime/core/optimizer/compute_optimizer/shared_utils.cc
+++ b/onnxruntime/core/optimizer/compute_optimizer/shared_utils.cc
@@ -188,7 +188,7 @@ NodeArg* CreateInitializerFromVector(Graph& graph,
               "The total count of dims does not match the size of values. ",
               "total_count: ", total_count, " values.size(): ", values.size());
 
-  const_tensor.set_raw_data(values.data(), values.size() * sizeof(int64_t));
+  utils::SetRawDataInTensorProto(const_tensor, values.data(), values.size() * sizeof(int64_t));
   return &graph_utils::AddInitializer(graph, const_tensor);
 }
 
diff --git a/onnxruntime/core/optimizer/constant_folding.cc b/onnxruntime/core/optimizer/constant_folding.cc
index 9df300d6f4f88..1466de51d0b99 100644
--- a/onnxruntime/core/optimizer/constant_folding.cc
+++ b/onnxruntime/core/optimizer/constant_folding.cc
@@ -82,8 +82,7 @@ static bool ConstantFoldShapeNode(Graph& graph, Node& node) {
     shape_constant.set_name(constant_arg_out->Name());
     shape_constant.set_data_type(ONNX_NAMESPACE::TensorProto_DataType_INT64);
     shape_constant.add_dims(clamped_slice_length);
-    shape_constant.set_raw_data(dim_values.data() + start,
-                                clamped_slice_length * sizeof(int64_t));
+    utils::SetRawDataInTensorProto(shape_constant, dim_values.data() + start, clamped_slice_length * sizeof(int64_t));
     ONNX_NAMESPACE::TensorShapeProto result_shape;
     result_shape.add_dim()->set_dim_value(clamped_slice_length);
     constant_arg_out->SetShape(result_shape);
diff --git a/onnxruntime/core/optimizer/embed_layer_norm_fusion.cc b/onnxruntime/core/optimizer/embed_layer_norm_fusion.cc
index 7b6f829b7a0a4..e8e395678436e 100644
--- a/onnxruntime/core/optimizer/embed_layer_norm_fusion.cc
+++ b/onnxruntime/core/optimizer/embed_layer_norm_fusion.cc
@@ -465,15 +465,13 @@ static NodeArg* ExtractEmbedding(Graph& graph,
     if (!CheckEmbeddingData(data, batch_size, element_count)) {
       return nullptr;
     }
-
-    initializer.set_raw_data(data, gsl::narrow<size_t>(element_count) * sizeof(float));
+    utils::SetRawDataInTensorProto(initializer, data, gsl::narrow<size_t>(element_count) * sizeof(float));
   } else {  // data_type == ONNX_NAMESPACE::TensorProto_DataType_FLOAT16
     const MLFloat16* data = old_initializer.data<MLFloat16>();
     if (!CheckEmbeddingData(data, batch_size, element_count)) {
       return nullptr;
     }
-
-    initializer.set_raw_data(data, gsl::narrow<size_t>(element_count) * sizeof(MLFloat16));
+    utils::SetRawDataInTensorProto(initializer, data, gsl::narrow<size_t>(element_count) * sizeof(MLFloat16));
   }
 
   NodeArg& node_arg = graph_utils::AddInitializer(graph, initializer);
diff --git a/onnxruntime/core/optimizer/nchwc_transformer.cc b/onnxruntime/core/optimizer/nchwc_transformer.cc
index 2b29473f876c3..46f306b92bed5 100644
--- a/onnxruntime/core/optimizer/nchwc_transformer.cc
+++ b/onnxruntime/core/optimizer/nchwc_transformer.cc
@@ -428,7 +428,8 @@ void NchwcTransformerImpl::TransformConv(Node& node) {
 
     nchwc_conv_W_tensor_proto.set_data_type(ONNX_NAMESPACE::TensorProto_DataType_FLOAT);
     nchwc_conv_W_tensor_proto.set_name(graph_.GenerateNodeArgName("reorder"));
-    nchwc_conv_W_tensor_proto.set_raw_data(reordered_filter.data(), reordered_filter.size() * sizeof(float));
+    utils::SetRawDataInTensorProto(nchwc_conv_W_tensor_proto, reordered_filter.data(),
+                                   reordered_filter.size() * sizeof(float));
 
     nchwc_conv_W_tensor_proto.add_dims(nchwc_output_channels);
     nchwc_conv_W_tensor_proto.add_dims(filter_input_channels);
@@ -458,7 +459,8 @@ void NchwcTransformerImpl::TransformConv(Node& node) {
 
       nchwc_conv_B_tensor_proto.set_data_type(ONNX_NAMESPACE::TensorProto_DataType_FLOAT);
       nchwc_conv_B_tensor_proto.set_name(graph_.GenerateNodeArgName("reorder"));
-      nchwc_conv_B_tensor_proto.set_raw_data(aligned_bias.data(), gsl::narrow<size_t>(nchwc_output_channels) * sizeof(float));
+      utils::SetRawDataInTensorProto(nchwc_conv_B_tensor_proto, aligned_bias.data(),
+                                     gsl::narrow<size_t>(nchwc_output_channels) * sizeof(float));
 
       nchwc_conv_B_tensor_proto.add_dims(nchwc_output_channels);
 
@@ -883,7 +885,8 @@ void NchwcTransformerImpl::TransformBatchNormalization(Node& node) {
   ONNX_NAMESPACE::TensorProto nchwc_conv_W_tensor_proto;
   nchwc_conv_W_tensor_proto.set_data_type(ONNX_NAMESPACE::TensorProto_DataType_FLOAT);
   nchwc_conv_W_tensor_proto.set_name(graph_.GenerateNodeArgName("bn_scale"));
-  nchwc_conv_W_tensor_proto.set_raw_data(padded_buffer.data(), gsl::narrow<size_t>(nchwc_channels) * sizeof(float));
+  utils::SetRawDataInTensorProto(nchwc_conv_W_tensor_proto, padded_buffer.data(),
+                                 gsl::narrow<size_t>(nchwc_channels) * sizeof(float));
   nchwc_conv_W_tensor_proto.add_dims(nchwc_channels);
   nchwc_conv_W_tensor_proto.add_dims(1);
   nchwc_conv_W_tensor_proto.add_dims(1);
@@ -896,7 +899,8 @@ void NchwcTransformerImpl::TransformBatchNormalization(Node& node) {
   ONNX_NAMESPACE::TensorProto nchwc_conv_B_tensor_proto;
   nchwc_conv_B_tensor_proto.set_data_type(ONNX_NAMESPACE::TensorProto_DataType_FLOAT);
   nchwc_conv_B_tensor_proto.set_name(graph_.GenerateNodeArgName("bn_B"));
-  nchwc_conv_B_tensor_proto.set_raw_data(padded_buffer.data(), gsl::narrow<size_t>(nchwc_channels) * sizeof(float));
+  utils::SetRawDataInTensorProto(nchwc_conv_B_tensor_proto, padded_buffer.data(),
+                                 gsl::narrow<size_t>(nchwc_channels) * sizeof(float));
   nchwc_conv_B_tensor_proto.add_dims(nchwc_channels);
 
   auto* nchwc_conv_B_arg = &graph_utils::AddInitializer(graph_, nchwc_conv_B_tensor_proto);
diff --git a/onnxruntime/core/optimizer/qdq_transformer/avx2_weight_s8_to_u8.cc b/onnxruntime/core/optimizer/qdq_transformer/avx2_weight_s8_to_u8.cc
index 6f0f38b1de56e..18e462c04dff3 100644
--- a/onnxruntime/core/optimizer/qdq_transformer/avx2_weight_s8_to_u8.cc
+++ b/onnxruntime/core/optimizer/qdq_transformer/avx2_weight_s8_to_u8.cc
@@ -129,7 +129,7 @@ static bool TryConvertDynamicQuantizeLSTM(Node& op_node, Graph& graph) {
   weights_proto_u8.set_data_type(ONNX_NAMESPACE::TensorProto_DataType_UINT8);
   weights_proto_u8.set_name(weight_tensor_proto->name() + "_s8_2_u8");
   weights_proto_u8.mutable_dims()->CopyFrom(weight_tensor_proto->dims());
-  weights_proto_u8.set_raw_data(w_temp.data<int8_t>(), static_cast<size_t>(w_temp.size()));
+  utils::SetRawDataInTensorProto(weights_proto_u8, w_temp.data<int8_t>(), static_cast<size_t>(w_temp.size()));
   input_defs[w_idx] = &graph_utils::AddInitializer(graph, weights_proto_u8);
 
   ONNX_NAMESPACE::TensorProto weight_zp_proto_u8;
@@ -140,7 +140,7 @@ static bool TryConvertDynamicQuantizeLSTM(Node& op_node, Graph& graph) {
   r_proto_u8.set_data_type(ONNX_NAMESPACE::TensorProto_DataType_UINT8);
   r_proto_u8.set_name(r_tensor_proto->name() + "_s8_2_u8");
   r_proto_u8.mutable_dims()->CopyFrom(r_tensor_proto->dims());
-  r_proto_u8.set_raw_data(r_temp.data<int8_t>(), static_cast<size_t>(r_temp.size()));
+  utils::SetRawDataInTensorProto(r_proto_u8, r_temp.data<int8_t>(), static_cast<size_t>(r_temp.size()));
   input_defs[r_idx] = &graph_utils::AddInitializer(graph, r_proto_u8);
 
   ONNX_NAMESPACE::TensorProto r_zp_proto_u8;
diff --git a/onnxruntime/core/optimizer/qdq_transformer/qdq_s8_to_u8.cc b/onnxruntime/core/optimizer/qdq_transformer/qdq_s8_to_u8.cc
index 199fbffc9f723..f2033dcbc1b03 100644
--- a/onnxruntime/core/optimizer/qdq_transformer/qdq_s8_to_u8.cc
+++ b/onnxruntime/core/optimizer/qdq_transformer/qdq_s8_to_u8.cc
@@ -60,7 +60,7 @@ static bool QDQ_S8_to_U8(Graph& graph, Node& q_node, Node& dq_node) {
   ONNX_NAMESPACE::TensorProto zp_tensor_proto_u8;
   zp_tensor_proto_u8.set_data_type(ONNX_NAMESPACE::TensorProto_DataType_UINT8);
   zp_tensor_proto_u8.set_name(graph.GenerateNodeArgName("qdq_s8_to_u8_zp_conversion"));
-  zp_tensor_proto_u8.set_raw_data(&q_zp_value, sizeof(uint8_t));
+  utils::SetRawDataInTensorProto(zp_tensor_proto_u8, &q_zp_value, sizeof(uint8_t));
   NodeArg* zp_u8_arg = &graph_utils::AddInitializer(graph, zp_tensor_proto_u8);
 
   auto q_output_node_arg_name = graph.GenerateNodeArgName("qdq_s8_to_u8_quant");
diff --git a/onnxruntime/core/optimizer/qdq_transformer/s8_to_u8.h b/onnxruntime/core/optimizer/qdq_transformer/s8_to_u8.h
index 6caa35ea61ed7..1c1341fe5a127 100644
--- a/onnxruntime/core/optimizer/qdq_transformer/s8_to_u8.h
+++ b/onnxruntime/core/optimizer/qdq_transformer/s8_to_u8.h
@@ -27,7 +27,7 @@ inline bool Int8TensorProto2Uint8(
   if (nullptr == src) {
     uint8_t zero_val = 128;
     dst.set_name(graph.GenerateNodeArgName("weight_zp_s8_2_u8"));
-    dst.set_raw_data(&zero_val, sizeof(uint8_t));
+    utils::SetRawDataInTensorProto(dst, &zero_val, sizeof(uint8_t));
     return true;
   }
 
@@ -58,7 +58,7 @@ inline bool Int8TensorProto2Uint8(
     p++;
   }
   if (force || should_convert) {
-    dst.set_raw_data(temp.data<int8_t>(), size_t(temp.size()));
+    utils::SetRawDataInTensorProto(dst, temp.data<int8_t>(), size_t(temp.size()));
     return true;
   }
   return false;
diff --git a/onnxruntime/core/optimizer/qdq_transformer/selectors_actions/qdq_actions.cc b/onnxruntime/core/optimizer/qdq_transformer/selectors_actions/qdq_actions.cc
index 3d2a81ce7f8cd..3497ea4c85523 100644
--- a/onnxruntime/core/optimizer/qdq_transformer/selectors_actions/qdq_actions.cc
+++ b/onnxruntime/core/optimizer/qdq_transformer/selectors_actions/qdq_actions.cc
@@ -5,6 +5,7 @@
 
 #include "core/optimizer/qdq_transformer/qdq_util.h"
 #include "core/graph/node_attr_utils.h"
+#include "core/framework/tensorprotoutils.h"
 namespace onnxruntime {
 namespace QDQ {
 
@@ -132,7 +133,7 @@ struct SetOptionalZeroPoint {
     ONNX_NAMESPACE::TensorProto tensor_proto;
     tensor_proto.set_name(name);
     tensor_proto.set_data_type(ONNX_NAMESPACE::TensorProto_DataType_INT8);
-    tensor_proto.set_raw_data(a.data(), sizeof(int8_t));
+    onnxruntime::utils::SetRawDataInTensorProto(tensor_proto, a.data(), sizeof(int8_t));
 
     return tensor_proto;
   };
@@ -145,8 +146,7 @@ struct SetOptionalZeroPoint {
     ONNX_NAMESPACE::TensorProto tensor_proto;
     tensor_proto.set_name(name);
     tensor_proto.set_data_type(ONNX_NAMESPACE::TensorProto_DataType_UINT8);
-    tensor_proto.set_raw_data(a.data(), sizeof(uint8_t));
-
+    onnxruntime::utils::SetRawDataInTensorProto(tensor_proto, a.data(), sizeof(uint8_t));
     return tensor_proto;
   };
   static ONNX_NAMESPACE::TensorProto GetOptionalZeroPointInt8() {
diff --git a/onnxruntime/core/optimizer/reshape_fusion.cc b/onnxruntime/core/optimizer/reshape_fusion.cc
index 7768a835d5042..7f94e18458be2 100644
--- a/onnxruntime/core/optimizer/reshape_fusion.cc
+++ b/onnxruntime/core/optimizer/reshape_fusion.cc
@@ -435,7 +435,7 @@ bool ReshapeFusion::Fuse_Subgraph(Node& reshape, Graph& graph, const logging::Lo
   shape_initializer_proto.set_name(shape_def->Name());
   shape_initializer_proto.add_dims(static_cast<int64_t>(shape_value.size()));
   shape_initializer_proto.set_data_type(ONNX_NAMESPACE::TensorProto_DataType_INT64);
-  shape_initializer_proto.set_raw_data(shape_value.data(), shape_value.size() * sizeof(int64_t));
+  utils::SetRawDataInTensorProto(shape_initializer_proto, shape_value.data(), shape_value.size() * sizeof(int64_t));
   auto& new_node_arg = graph_utils::AddInitializer(graph, shape_initializer_proto);
 
   // Safely remove concat parent nodes which have only one output
diff --git a/onnxruntime/core/optimizer/stft_decomposition.cc b/onnxruntime/core/optimizer/stft_decomposition.cc
index a54904ff15e1e..5c09e5225ab9c 100644
--- a/onnxruntime/core/optimizer/stft_decomposition.cc
+++ b/onnxruntime/core/optimizer/stft_decomposition.cc
@@ -45,7 +45,7 @@ NodeArg* AddInitializer(Graph& graph, const char* name, const int64_t (&shape)[T
     element_count *= shape[i];
     proto.add_dims(shape[i]);
   }
-  proto.set_raw_data(begin, element_count * sizeof(TDataType));
+  utils::SetRawDataInTensorProto(proto, begin, element_count * sizeof(TDataType));
   return &graph_utils::AddInitializer(graph, proto);
 }
 
diff --git a/onnxruntime/core/optimizer/transpose_optimization/ort_optimizer_api_impl.cc b/onnxruntime/core/optimizer/transpose_optimization/ort_optimizer_api_impl.cc
index 1f7e54cb807ea..f756d01413eae 100644
--- a/onnxruntime/core/optimizer/transpose_optimization/ort_optimizer_api_impl.cc
+++ b/onnxruntime/core/optimizer/transpose_optimization/ort_optimizer_api_impl.cc
@@ -766,10 +766,10 @@ std::string_view ApiGraph::AddInitializer(api::DataType dtype, const std::vector
   ONNX_NAMESPACE::TensorProto tensor_proto;
   tensor_proto.set_data_type(gsl::narrow_cast<int32_t>(dtype));
   tensor_proto.set_name(name);
-  tensor_proto.set_raw_data(data.data(), data.size());
   for (int64_t dim : shape) {
     tensor_proto.add_dims(dim);
   }
+  utils::SetRawDataInTensorProto(tensor_proto, data.data(), data.size());
 
   const auto& node_arg = graph_utils::AddInitializer(graph_, tensor_proto);
   return node_arg.Name();
diff --git a/onnxruntime/core/platform/path_lib.h b/onnxruntime/core/platform/path_lib.h
index a9d89f32e91d3..fca8990f14821 100644
--- a/onnxruntime/core/platform/path_lib.h
+++ b/onnxruntime/core/platform/path_lib.h
@@ -281,7 +281,7 @@ void LoopDir(const std::string& dir_name, T func) {
   ORT_TRY {
     struct dirent* dp;
     while ((dp = readdir(dir)) != nullptr) {
-      std::basic_string<PATH_CHAR_TYPE> filename = ConcatPathComponent<PATH_CHAR_TYPE>(dir_name, dp->d_name);
+      std::basic_string<PATH_CHAR_TYPE> filename = ConcatPathComponent(dir_name, dp->d_name);
       if (stat(filename.c_str(), &stats) != 0) {
         continue;
       }
diff --git a/onnxruntime/core/platform/posix/env.cc b/onnxruntime/core/platform/posix/env.cc
index ec06320438977..04cf5ff6a3329 100644
--- a/onnxruntime/core/platform/posix/env.cc
+++ b/onnxruntime/core/platform/posix/env.cc
@@ -26,7 +26,9 @@ limitations under the License.
 #include <stdlib.h>
 #include <string.h>
 #include <sys/mman.h>
+#if !defined(_AIX)
 #include <sys/syscall.h>
+#endif
 #include <unistd.h>
 
 #include <iostream>
diff --git a/onnxruntime/core/providers/coreml/builders/impl/transpose_op_builder.cc b/onnxruntime/core/providers/coreml/builders/impl/transpose_op_builder.cc
index f6a61d55a3d63..831c4cf4d08ba 100644
--- a/onnxruntime/core/providers/coreml/builders/impl/transpose_op_builder.cc
+++ b/onnxruntime/core/providers/coreml/builders/impl/transpose_op_builder.cc
@@ -3,6 +3,7 @@
 
 #include "core/providers/coreml/builders/helper.h"
 #include "core/providers/coreml/builders/impl/base_op_builder.h"
+#include "core/providers/coreml/builders/impl/builder_utils.h"
 #include "core/providers/coreml/builders/model_builder.h"
 #include "core/providers/coreml/builders/op_builder_factory.h"
 #include "core/providers/coreml/shape_utils.h"
@@ -14,13 +15,13 @@ namespace coreml {
 class TransposeOpBuilder : public BaseOpBuilder {
   Status AddToModelBuilderImpl(ModelBuilder& model_builder, const Node& node,
                                const logging::Logger& logger) const override;
+
+  bool SupportsMLProgram() const override { return true; }
 };
 
 Status TransposeOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder,
                                                  const Node& node,
                                                  const logging::Logger& logger) const {
-  std::unique_ptr<COREML_SPEC::NeuralNetworkLayer> layer = model_builder.CreateNNLayer(node);
-
   NodeAttrHelper helper(node);
   std::vector<int64_t> perm = helper.Get("perm", std::vector<int64_t>());
   std::vector<int64_t> input_shape;
@@ -33,12 +34,27 @@ Status TransposeOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder,
     ORT_RETURN_IF_NOT(perm.size() == input_dims, "Perm and input should have same dimension");
   }
 
-  *layer->mutable_transpose()->mutable_axes() = {perm.cbegin(), perm.cend()};
+#if defined(COREML_ENABLE_MLPROGRAM)
+  if (model_builder.CreateMLProgram()) {
+    using namespace CoreML::Specification::MILSpec;
+
+    std::unique_ptr<Operation> op = model_builder.CreateOperation(node, "transpose");
+    AddOperationInput(*op, "x", node.InputDefs()[0]->Name());
+    AddOperationInput(*op, "perm", model_builder.AddConstant(op->type(), "perm", perm));
+    AddOperationOutput(*op, *node.OutputDefs()[0]);
+    model_builder.AddOperation(std::move(op));
 
-  *layer->mutable_input()->Add() = node.InputDefs()[0]->Name();
-  *layer->mutable_output()->Add() = node.OutputDefs()[0]->Name();
+  } else
+#endif  // defined(COREML_ENABLE_MLPROGRAM)
+  {
+    std::unique_ptr<COREML_SPEC::NeuralNetworkLayer> layer = model_builder.CreateNNLayer(node);
+    *layer->mutable_transpose()->mutable_axes() = {perm.cbegin(), perm.cend()};
 
-  model_builder.AddLayer(std::move(layer));
+    *layer->mutable_input()->Add() = node.InputDefs()[0]->Name();
+    *layer->mutable_output()->Add() = node.OutputDefs()[0]->Name();
+
+    model_builder.AddLayer(std::move(layer));
+  }
   return Status::OK();
 }
 
diff --git a/onnxruntime/contrib_ops/cuda/grid_sample.cc b/onnxruntime/core/providers/cuda/tensor/grid_sample.cc
similarity index 100%
rename from onnxruntime/contrib_ops/cuda/grid_sample.cc
rename to onnxruntime/core/providers/cuda/tensor/grid_sample.cc
diff --git a/onnxruntime/contrib_ops/cuda/grid_sample.h b/onnxruntime/core/providers/cuda/tensor/grid_sample.h
similarity index 100%
rename from onnxruntime/contrib_ops/cuda/grid_sample.h
rename to onnxruntime/core/providers/cuda/tensor/grid_sample.h
diff --git a/onnxruntime/contrib_ops/cuda/grid_sample_impl.cu b/onnxruntime/core/providers/cuda/tensor/grid_sample_impl.cu
similarity index 100%
rename from onnxruntime/contrib_ops/cuda/grid_sample_impl.cu
rename to onnxruntime/core/providers/cuda/tensor/grid_sample_impl.cu
diff --git a/onnxruntime/contrib_ops/cuda/grid_sample_impl.h b/onnxruntime/core/providers/cuda/tensor/grid_sample_impl.h
similarity index 100%
rename from onnxruntime/contrib_ops/cuda/grid_sample_impl.h
rename to onnxruntime/core/providers/cuda/tensor/grid_sample_impl.h
diff --git a/onnxruntime/core/providers/vitisai/imp/global_api.cc b/onnxruntime/core/providers/vitisai/imp/global_api.cc
index 1a3cc5979ff5a..8c1dce0d3dc1a 100644
--- a/onnxruntime/core/providers/vitisai/imp/global_api.cc
+++ b/onnxruntime/core/providers/vitisai/imp/global_api.cc
@@ -126,7 +126,7 @@ static std::string config_to_json_str(const onnxruntime::ProviderOptions& config
 
 vaip_core::DllSafe<std::vector<std::unique_ptr<vaip_core::ExecutionProvider>>> compile_onnx_model(
     const onnxruntime::GraphViewer& graph_viewer, const logging::Logger& logger, const ProviderOptions& options) {
-  auto model_path = PathToUTF8String(ToPathString(graph_viewer.ModelPath().string()));
+  auto model_path = graph_viewer.ModelPath().string();
   if (s_library_vitisaiep.compile_onnx_model_with_options) {
     return vaip_core::DllSafe(s_library_vitisaiep.compile_onnx_model_with_options(model_path, graph_viewer.GetGraph(), options));
   } else {
@@ -227,9 +227,9 @@ vaip_core::OrtApiForVaip* create_org_api_hook() {
     auto& logger = logging::LoggingManager::DefaultLogger();
     auto& model = const_cast<onnxruntime::Model&>(const_model);
     auto model_proto = model.ToProto();
-    auto file_path = ToPathString(model.MainGraph().ModelPath().string());
+    auto file_path = model.MainGraph().ModelPath();
     auto local_registries = IOnnxRuntimeOpSchemaRegistryList{model.MainGraph().GetSchemaRegistry()};
-    auto ret = Model::Create(std::move(*model_proto), file_path, &local_registries, logger);
+    auto ret = Model::Create(std::move(*model_proto), ToPathString(file_path), &local_registries, logger);
     auto status = ret->MainGraph().Resolve();
     vai_assert(status.IsOK(), status.ErrorMessage());
     return ret.release();
diff --git a/onnxruntime/core/providers/vitisai/imp/graph.cc b/onnxruntime/core/providers/vitisai/imp/graph.cc
index 40b396fda6135..3f46fbde8c714 100644
--- a/onnxruntime/core/providers/vitisai/imp/graph.cc
+++ b/onnxruntime/core/providers/vitisai/imp/graph.cc
@@ -107,12 +107,11 @@ void graph_save(const Graph& graph, const std::string& filename, const std::stri
   auto graph_proto_subgraph = graph.ToGraphProto();
   *model_proto->mutable_graph() = *graph_proto_subgraph;
   auto& logger = logging::LoggingManager::DefaultLogger();
-  auto filename_data_relative_path = std::filesystem::path();
   auto model = Model::Create(std::move(*model_proto), ToPathString(filename), nullptr, logger);
   if (initializer_size_threshold == std::numeric_limits<size_t>::max()) {
     model_proto = model->ToProto();
   } else {
-    model_proto = model->ToGraphProtoWithExternalInitializers(filename_dat, graph.ModelPath(), initializer_size_threshold);
+    model_proto = model->ToGraphProtoWithExternalInitializers(ToPathString(filename_dat), ToPathString(filename), initializer_size_threshold);
   }
   auto& metadata = model->MetaData();
   if (!metadata.empty()) {
@@ -124,7 +123,7 @@ void graph_save(const Graph& graph, const std::string& filename, const std::stri
       *prop->mutable_value() = m.second;
     }
   }
-  std::fstream output(filename, std::ios::out | std::ios::trunc | std::ios::binary);
+  std::fstream output(ToPathString(filename), std::ios::out | std::ios::trunc | std::ios::binary);
   bool result = model_proto->SerializeToOstream(output);
   output << std::flush;
   vai_assert(result, "model serialize to ostream error");
diff --git a/onnxruntime/core/session/inference_session.cc b/onnxruntime/core/session/inference_session.cc
index 3ef6490a56ded..f0eed91d70440 100644
--- a/onnxruntime/core/session/inference_session.cc
+++ b/onnxruntime/core/session/inference_session.cc
@@ -881,8 +881,6 @@ common::Status InferenceSession::RegisterGraphTransformer(
 }
 
 common::Status InferenceSession::SaveToOrtFormat(const std::filesystem::path& filepath) const {
-  ORT_RETURN_IF_NOT(FLATBUFFERS_LITTLEENDIAN, "ort format only supports little-endian machines");
-
   // Get the byte size of the ModelProto and round it to the next MB and use it as flatbuffers' init_size
   // TODO: Investigate whether we should set a max size, and clarify the cost of having a buffer smaller than
   // what the total flatbuffers serialized size will be.
@@ -1390,8 +1388,6 @@ Status InferenceSession::LoadOrtModel(const void* model_data, int model_data_len
 }
 
 Status InferenceSession::LoadOrtModelWithLoader(std::function<Status()> load_ort_format_model_bytes) {
-  static_assert(FLATBUFFERS_LITTLEENDIAN, "ORT format only supports little-endian machines");
-
   std::lock_guard<onnxruntime::OrtMutex> l(session_mutex_);
 
   if (is_model_loaded_) {  // already loaded
diff --git a/onnxruntime/python/tools/quantization/matmul_4bits_quantizer.py b/onnxruntime/python/tools/quantization/matmul_4bits_quantizer.py
index 11a830dc6d7f5..40a4a4d26dc1c 100644
--- a/onnxruntime/python/tools/quantization/matmul_4bits_quantizer.py
+++ b/onnxruntime/python/tools/quantization/matmul_4bits_quantizer.py
@@ -18,31 +18,36 @@
 from onnx.onnx_pb import GraphProto, ModelProto, NodeProto, TensorProto
 from packaging import version
 
-from onnxruntime.capi._pybind_state import quantize_matmul_4bits
+from onnxruntime.capi._pybind_state import quantize_matmul_4bits, quantize_qdq_matmul_4bits
 
 from .calibrate import CalibrationDataReader
 from .onnx_model import ONNXModel
-from .quant_utils import attribute_to_kwarg
+from .quant_utils import QuantFormat, attribute_to_kwarg
 
 logging.basicConfig(format="%(asctime)s %(name)s [%(levelname)s] - %(message)s", level=logging.INFO)
 logger = logging.getLogger(__name__)
 
 
 class WeightOnlyQuantConfig:
-    def __init__(self, algorithm):
+    def __init__(self, algorithm, quant_format):
         """This is the Base class for Weight Only Quant Configuration.
 
         Args:
             algorithm:
                 weight only quantize algorithm name.
+            quant_format: QuantFormat{QOperator, QDQ}.
+                QOperator format quantizes the model with quantized operators directly.
+                QDQ format quantize the model by inserting QuantizeLinear/DeQuantizeLinear on the tensor.
         """
         self.algorithm = algorithm
+        self.quant_format = quant_format
 
 
 class RTNWeightOnlyQuantConfig(WeightOnlyQuantConfig):
     def __init__(
         self,
         ratios=None,
+        quant_format=QuantFormat.QOperator,
     ):
         """
         This is a class for round-to-nearest (RTN) algorithm Weight Only Quant Configuration.
@@ -51,11 +56,18 @@ def __init__(
         Args:
             ratios:
                 percentile of clip. Defaults to {}.
+            quant_format (QuantFormat{QOperator, QDQ}, optional):
+                QOperator format quantizes the model with quantized operators directly.
+                QDQ format quantize the model by inserting QuantizeLinear/DeQuantizeLinear on the tensor.
+                Defaults to QuantFormat.QOperator.
         """
+        assert quant_format == QuantFormat.QOperator, "RTN only supports QOperator format"
+
         if ratios is None:
             ratios = {}
         super().__init__(
             algorithm="RTN",
+            quant_format=quant_format,
         )
         self.ratios = ratios
 
@@ -69,6 +81,7 @@ def __init__(
         actorder=False,
         mse=False,
         perchannel=True,
+        quant_format=QuantFormat.QOperator,
     ):
         """
         This is a class for GPTQ algorithm Weight Only Quant Configuration.
@@ -87,9 +100,16 @@ def __init__(
                 whether get scale and zero point with mse error.
             perchannel (bool, optional):
                 whether quantize weight per-channel.
+            quant_format (QuantFormat{QOperator, QDQ}, optional):
+                QOperator format quantizes the model with quantized operators directly.
+                QDQ format quantize the model by inserting QuantizeLinear/DeQuantizeLinear on the tensor.
+                Defaults to QuantFormat.QOperator.
         """
+        assert quant_format == QuantFormat.QOperator, "GPTQ only supports QOperator format"
+
         super().__init__(
             algorithm="GPTQ",
+            quant_format=quant_format,
         )
         self.calibration_data_reader = calibration_data_reader
         self.percdamp = percdamp
@@ -105,6 +125,7 @@ def __init__(
         block_size=128,
         bits=4,
         axis=1,
+        quant_format=QuantFormat.QOperator,
     ):
         """
         This is a class for HQQ algorithm Weight Only Quant Configuration.
@@ -112,14 +133,21 @@ def __init__(
 
         Args:
             block_size (int, optional):
-                channel number in one block to execute a GPTQ quantization iteration.
+                channel number in one block to execute a HQQ quantization iteration.
             bits (int, optional):
                 how many bits to represent weight.
             axis (int, optional):
                 0 or 1. which axis to quantize. https://arxiv.org/pdf/2309.15531.pdf
+            quant_format (QuantFormat{QOperator, QDQ}, optional):
+                QOperator format quantizes the model with quantized operators directly.
+                QDQ format quantize the model by inserting QuantizeLinear/DeQuantizeLinear on the tensor.
+                Defaults to QuantFormat.QOperator.
         """
+        assert quant_format == QuantFormat.QOperator, "HQQ only supports QOperator format"
+
         super().__init__(
             algorithm="HQQ",
+            quant_format=quant_format,
         )
         self.block_size = block_size
         self.bits = bits
@@ -132,8 +160,26 @@ def __init__(
         block_size: int = 128,
         is_symmetric: bool = False,
         accuracy_level: int | None = None,
+        quant_format=QuantFormat.QOperator,
     ):
-        super().__init__(algorithm="DEFAULT")
+        """
+        This is a class for weight only affine quantization configuration.
+
+        Args:
+            block_size (int, optional):
+                channel number in one block to execute an affine quantization iteration.
+            is_symmetric (bool, optional):
+                whether quantize weight symmetrically.
+            accuracy_level (int, optional):
+                Accuracy level of the 4-bit quantized MatMul computation.
+                Refer to the MatMulNBits contrib op's 'accuracy_level' attribute for details.
+                (https://github.com/microsoft/onnxruntime/blob/main/docs/ContribOperators.md#commicrosoftmatmulnbits)
+            quant_format (QuantFormat{QOperator, QDQ}, optional):
+                QOperator format quantizes the model with quantized operators directly.
+                QDQ format quantize the model by inserting QuantizeLinear/DeQuantizeLinear on the tensor.
+                Defaults to QuantFormat.QOperator.
+        """
+        super().__init__(algorithm="DEFAULT", quant_format=quant_format)
         self.block_size = block_size
         self.is_symmetric = is_symmetric
         self.bits = 4
@@ -287,23 +333,26 @@ def quantize_internal(
 
         return w_q, scale.to(tensor.dtype), zero.to(tensor.dtype)
 
-    def quantize(self, node: NodeProto, graph_stack: list[GraphProto]):
-        """If the node is MatMul with fp32 const weight, quantize the weight with int4, and return the new node"""
+    def quantize(self, node: NodeProto, graph_stack: list[GraphProto]) -> list[NodeProto]:
+        """
+        If the node is MatMul with fp32 const weight, quantize the weight with int4, and return the new node.
+        If QOperator format, return MatMulNbits. If QDQ format, return DeQuantizeLinear + MatMul.
+        """
         if node.op_type != "MatMul":
-            return node  # only care about MatMul for now
+            return [node]  # only care about MatMul for now
         import torch
 
         logger.info(f"start to quantize {node.name} ...")
-        inputB = node.input[1]  # noqa: N806
-        b_pb, bs_graph = get_initializer(inputB, graph_stack)
+        input_b = node.input[1]
+        b_pb, bs_graph = get_initializer(input_b, graph_stack)
         if b_pb is None:
             logger.info("MatMul doesn't have const weight. Skip to quantize")
-            return node  # only care about constant weight
+            return [node]  # only care about constant weight
 
         b_array = onnx.numpy_helper.to_array(b_pb)
         if len(b_array.shape) != 2:
             logger.info("MatMul weight is not 2D. Skip to quantize")
-            return node  # can only process 2-D matrix
+            return [node]  # can only process 2-D matrix
         b_array_torch = torch.from_numpy(b_array)
         if torch.cuda.is_available():
             b_array_torch = b_array_torch.cuda()
@@ -334,7 +383,7 @@ def quantize(self, node: NodeProto, graph_stack: list[GraphProto]):
         b_quant = onnx.numpy_helper.from_array(packed_torch.cpu().numpy())
         b_quant.name = b_pb.name + "_Q4"
         for input in bs_graph.input:
-            if input.name == inputB:
+            if input.name == input_b:
                 bs_graph.input.remove(input)
                 break
 
@@ -366,7 +415,7 @@ def quantize(self, node: NodeProto, graph_stack: list[GraphProto]):
 
         logger.info(f"complete quantization of {node.name} ...")
 
-        return matmul_q4_node
+        return [matmul_q4_node]
 
 
 def get_initializer(name, graph_path: list[GraphProto]) -> tuple[TensorProto, GraphProto]:
@@ -382,7 +431,7 @@ class DefaultWeightOnlyQuantizer:
     def __init__(self, config: DefaultWeightOnlyQuantConfig):
         self.config = config
 
-    def int4_block_quant(self, fp32weight: npt.ArrayLike) -> np.ndarray:
+    def int4_block_quant(self, fp32weight: npt.ArrayLike) -> tuple[np.ndarray, np.ndarray, np.ndarray]:
         """4b quantize fp32 weight to a blob"""
 
         if len(fp32weight.shape) != 2:
@@ -390,83 +439,136 @@ def int4_block_quant(self, fp32weight: npt.ArrayLike) -> np.ndarray:
         rows, cols = fp32weight.shape
 
         block_size = self.config.block_size
-        blob_size = block_size // 2
         k_blocks = (rows + block_size - 1) // block_size
-        padded_rows = k_blocks * block_size
-        pad_len = padded_rows - rows
-        if pad_len > 0:
-            fp32weight = np.pad(fp32weight, ((0, pad_len), (0, 0)), "constant")
 
-        # block wise quantization, each block comes from a single column
-        packed = np.zeros((cols, k_blocks, blob_size), dtype="uint8")
-        scales = np.zeros((cols * k_blocks), dtype=fp32weight.dtype)
-        zero_point = np.zeros(cols * ((k_blocks + 1) // 2), dtype="uint8")
-        quantize_matmul_4bits(packed, fp32weight, scales, zero_point, block_size, cols, rows, self.config.is_symmetric)
+        if self.config.quant_format == QuantFormat.QOperator:
+            blob_size = block_size // 2
+            padded_rows = k_blocks * block_size
+            pad_len = padded_rows - rows
+            if pad_len > 0:
+                fp32weight = np.pad(fp32weight, ((0, pad_len), (0, 0)), "constant")
+
+            # block wise quantization, each block comes from a single column
+            packed = np.zeros((cols, k_blocks, blob_size), dtype="uint8")
+            zero_point = np.zeros(cols * ((k_blocks + 1) // 2), dtype="uint8")
+            scales = np.zeros((cols * k_blocks), dtype=fp32weight.dtype)
+            quantize_matmul_4bits(
+                packed, fp32weight, scales, zero_point, block_size, cols, rows, self.config.is_symmetric
+            )
+        else:
+            packed = np.zeros((rows * cols + 1) // 2, dtype="uint8")
+            zero_point = np.zeros((cols * k_blocks + 1) // 2, dtype="uint8")
+            scales = np.zeros((k_blocks, cols), dtype=fp32weight.dtype)
+            quantize_qdq_matmul_4bits(
+                packed, fp32weight, scales, zero_point, block_size, cols, rows, self.config.is_symmetric
+            )
 
         return (packed, scales, zero_point)
 
-    def quantize(self, node: NodeProto, graph_stack: list[GraphProto]) -> NodeProto:
-        """If the node is MatMul with fp32 const weight, quantize the weight with int4, and return the new node"""
+    def quantize(self, node: NodeProto, graph_stack: list[GraphProto]) -> list[NodeProto]:
+        """
+        If the node is MatMul with fp32 const weight, quantize the weight with int4, and return the new node.
+        If QOperator format, return MatMulNbits. If QDQ format, return DeQuantizeLinear + MatMul.
+        """
 
         if node.op_type != "MatMul":
-            return node  # only care about MatMul for now
+            return [node]  # only care about MatMul for now
 
         logger.info(f"start to quantize {node.name} ...")
-        inputB = node.input[1]  # noqa: N806
-        B, Bs_graph = get_initializer(inputB, graph_stack)  # noqa: N806
-        if B is None:
+        qtype = TensorProto.INT4 if self.config.is_symmetric else TensorProto.UINT4
+        input_b = node.input[1]
+        b_tensor, b_graph = get_initializer(input_b, graph_stack)
+        if b_tensor is None:
             logger.info("MatMul doesn't have const weight. Skip to quantize")
-            return node  # only care about constant weight
+            return [node]  # only care about constant weight
 
-        B_array = onnx.numpy_helper.to_array(B)  # noqa: N806
-        if len(B_array.shape) != 2:
+        b_ndarray = onnx.numpy_helper.to_array(b_tensor)
+        if len(b_ndarray.shape) != 2:
             logger.info("MatMul weight is not 2D. Skip to quantize")
-            return node  # can only process 2-D matrix
-
-        packed, scales, zero_points = self.int4_block_quant(B_array)
-        B_quant = onnx.numpy_helper.from_array(packed)  # noqa: N806
-        B_quant.name = B.name + "_Q4"
-        for input in Bs_graph.input:
-            if input.name == inputB:
-                Bs_graph.input.remove(input)
-                break
+            return [node]  # can only process 2-D matrix
 
-        scales_tensor = onnx.numpy_helper.from_array(scales)
-        scales_tensor.name = B.name + "_scales"
-        Bs_graph.initializer.extend([B_quant, scales_tensor])
+        packed, scales, zero_points = self.int4_block_quant(b_ndarray)
 
-        input_names = [node.input[0], B_quant.name, scales_tensor.name]
-        if not self.config.is_symmetric:
-            zp_tensor = onnx.numpy_helper.from_array(zero_points)
-            zp_tensor.name = B.name + "_zero_points"
-            Bs_graph.initializer.extend([zp_tensor])
-            input_names.append(zp_tensor.name)
+        if self.config.quant_format == QuantFormat.QOperator:
+            b_quant = onnx.numpy_helper.from_array(packed, b_tensor.name + "_Q4")
+            scales_tensor = onnx.numpy_helper.from_array(scales, b_tensor.name + "_scales")
+        else:
+            b_quant = onnx.helper.make_tensor(b_tensor.name + "_DQ_Q4", qtype, b_ndarray.shape, packed.tobytes(), True)
+            scales_tensor = onnx.numpy_helper.from_array(scales, b_tensor.name + "_DQ_scales")
 
-        kwargs = {}
-        rows, cols = B_array.shape
-        kwargs["K"] = rows
-        kwargs["N"] = cols
-        kwargs["bits"] = 4
-        kwargs["block_size"] = self.config.block_size
-        if self.config.accuracy_level is not None:
-            kwargs["accuracy_level"] = self.config.accuracy_level
+        for input in b_graph.input:
+            if input.name == input_b:
+                b_graph.input.remove(input)
+                break
 
-        matmul_q4_node = onnx.helper.make_node(
-            "MatMulNBits",
-            inputs=input_names,
-            outputs=[node.output[0]],
-            name=node.name + "_Q4" if node.name else "",
-            domain="com.microsoft",
-            **kwargs,
-        )
+        b_graph.initializer.extend([b_quant, scales_tensor])
+
+        output_nodes = []
+
+        if self.config.quant_format == QuantFormat.QOperator:
+            input_names = [node.input[0], b_quant.name, scales_tensor.name]
+            if not self.config.is_symmetric:
+                zp_tensor = onnx.numpy_helper.from_array(zero_points, b_tensor.name + "_zero_points")
+                input_names.append(zp_tensor.name)
+                b_graph.initializer.extend([zp_tensor])
+            kwargs = {}
+            rows, cols = b_ndarray.shape
+            kwargs["K"] = rows
+            kwargs["N"] = cols
+            kwargs["bits"] = 4
+            kwargs["block_size"] = self.config.block_size
+            if self.config.accuracy_level is not None:
+                kwargs["accuracy_level"] = self.config.accuracy_level
+
+            matmul_q4_node = onnx.helper.make_node(
+                "MatMulNBits",
+                inputs=input_names,
+                outputs=[node.output[0]],
+                name=node.name + "_Q4" if node.name else "",
+                domain="com.microsoft",
+                **kwargs,
+            )
 
-        logger.info(f"complete quantization of {node.name} ...")
+            output_nodes.append(matmul_q4_node)
+        else:
+            dq_input_names = [b_quant.name, scales_tensor.name]
+            dq_output_names = [b_quant.name + "_output"]
+            matmul_input_names = [node.input[0], dq_output_names[0]]
+            matmul_output_names = [node.output[0]]
+            if not self.config.is_symmetric:
+                zp_tensor = onnx.helper.make_tensor(
+                    b_tensor.name + "_DQ_zero_points", qtype, scales.shape, zero_points.tobytes(), True
+                )
+                dq_input_names.append(zp_tensor.name)
+                b_graph.initializer.extend([zp_tensor])
+            dq_kwargs = {"axis": 0, "block_size": self.config.block_size}
+            dq_node = onnx.helper.make_node(
+                "DequantizeLinear",
+                inputs=dq_input_names,
+                outputs=dq_output_names,
+                name=node.name + "_DQ_Q4" if node.name else "",
+                **dq_kwargs,
+            )
+            matmul_node = onnx.helper.make_node(
+                "MatMul",
+                inputs=matmul_input_names,
+                outputs=matmul_output_names,
+                name=node.name + "_matmul_Q4" if node.name else "",
+            )
+            output_nodes.extend([dq_node, matmul_node])
 
-        return matmul_q4_node
+        logger.info(f"complete quantization of {node.name} ...")
+        return output_nodes
 
 
 class MatMul4BitsQuantizer:
-    """Perform 4b quantization of constant MatMul weights"""
+    """
+    Perform 4b quantization of constant MatMul weights.
+    If algo_config.quant_format is QOperator, the quantized weight is stored in a MatMulNBits node, which relaces the
+    MatMul node.
+    If algo_config.quant_format is QDQ, the quantized weight is stored in a DeQuantizeLinear node. The MatMul node is
+    replaced by the DequantizeLinear + MatMul nodes.
+    """
 
     def __init__(
         self,
@@ -475,7 +577,8 @@ def __init__(
         is_symmetric: bool = False,
         accuracy_level: int | None = None,
         nodes_to_exclude=None,
-        algo_config: WeightOnlyQuantConfig = None,
+        quant_format=QuantFormat.QOperator,
+        algo_config: WeightOnlyQuantConfig | None = None,
     ):
         if nodes_to_exclude is None:
             nodes_to_exclude = []
@@ -488,7 +591,10 @@ def __init__(
         self.node_quantizer = None
         if algo_config is None:
             algo_config = DefaultWeightOnlyQuantConfig(
-                block_size=block_size, is_symmetric=is_symmetric, accuracy_level=accuracy_level
+                block_size=block_size,
+                is_symmetric=is_symmetric,
+                accuracy_level=accuracy_level,
+                quant_format=quant_format,
             )
         self.algo_config = algo_config
         if algo_config.algorithm == "HQQ":
@@ -526,15 +632,15 @@ def _process_subgraph(self, graph_stack: list[GraphProto]):
                 node = onnx.helper.make_node(  # noqa: PLW2901
                     node.op_type, node.input, node.output, name=node.name, **kwargs
                 )
-            out_node = None
+            out_nodes = []
             if node.name in self.nodes_to_exclude:
                 logger.info(f"exclude to quantize {node.name} as specified by nodes_to_exclude...")
-                out_node = node
+                out_nodes = [node]
             elif self.algo_config is not None and self.algo_config.algorithm == "HQQ":
-                out_node = self.node_quantizer.quantize(node, graph_stack)
+                out_nodes = self.node_quantizer.quantize(node, graph_stack)
             else:
-                out_node = self.node_quantizer.quantize(node, graph_stack)
-            new_nodes.append(out_node)
+                out_nodes = self.node_quantizer.quantize(node, graph_stack)
+            new_nodes.extend(out_nodes)
 
         graph.ClearField("node")
         graph.node.extend(new_nodes)
@@ -688,6 +794,15 @@ def parse_args():
         default=[],
         help="Specify the nodes to be excluded from quantization with node names",
     )
+    parser.add_argument(
+        "--quant_format",
+        default="QOperator",
+        type=QuantFormat,
+        choices=list(QuantFormat),
+        help="QuantFormat {QOperator, QDQ}"
+        "QOperator format quantizes the model with quantized operators directly."
+        "QDQ format quantize the model by inserting DeQuantizeLinear before the MatMul.",
+    )
 
     return parser.parse_args()
 
@@ -699,6 +814,7 @@ def parse_args():
 
     input_model_path = args.input_model
     output_model_path = args.output_model
+    quant_format = args.quant_format
 
     if os.path.exists(output_model_path):
         logger.error(f"file {output_model_path} already exists")
@@ -713,7 +829,10 @@ def parse_args():
         quant_config = HQQWeightOnlyQuantConfig(block_size=args.block_size, bits=args.bits)
     elif args.quant_method == "default":
         quant_config = DefaultWeightOnlyQuantConfig(
-            block_size=args.block_size, is_symmetric=args.symmetric, accuracy_level=args.accuracy_level
+            block_size=args.block_size,
+            is_symmetric=args.symmetric,
+            accuracy_level=args.accuracy_level,
+            quant_format=quant_format,
         )
     elif args.quant_method == "rtn":
         quant_config = RTNWeightOnlyQuantConfig()
diff --git a/onnxruntime/python/tools/tensorrt/perf/build/build_image.py b/onnxruntime/python/tools/tensorrt/perf/build/build_image.py
index 9ee8f27df5c99..2f335009b59c6 100644
--- a/onnxruntime/python/tools/tensorrt/perf/build/build_image.py
+++ b/onnxruntime/python/tools/tensorrt/perf/build/build_image.py
@@ -15,12 +15,10 @@
 from typing import List, Optional
 
 TRT_DOCKER_FILES = {
-    "8.4.cuda_11_6_cudnn_8": "tools/ci_build/github/linux/docker/Dockerfile.ubuntu_cuda11_6_tensorrt8_4",
-    "8.5.cuda_11_8_cudnn_8": "tools/ci_build/github/linux/docker/Dockerfile.ubuntu_cuda11_8_tensorrt8_5",
     "8.6.cuda_11_8_cudnn_8": "tools/ci_build/github/linux/docker/Dockerfile.ubuntu_cuda11_8_tensorrt8_6",
     "8.6.cuda_12_3_cudnn_9": "tools/ci_build/github/linux/docker/Dockerfile.ubuntu_cuda12_3_tensorrt8_6",
-    "10.0.cuda_11_8_cudnn_8": "tools/ci_build/github/linux/docker/Dockerfile.ubuntu_cuda11_8_tensorrt10_0",
-    "10.0.cuda_12_4_cudnn_9": "tools/ci_build/github/linux/docker/Dockerfile.ubuntu_cuda12_4_tensorrt10_0",
+    "10.2.cuda_11_8_cudnn_8": "tools/ci_build/github/linux/docker/Dockerfile.ubuntu_cuda11_tensorrt10",
+    "10.2.cuda_12_5_cudnn_9": "tools/ci_build/github/linux/docker/Dockerfile.ubuntu_cuda12_tensorrt10",
     "BIN": "tools/ci_build/github/linux/docker/Dockerfile.ubuntu_tensorrt_bin",
 }
 
diff --git a/onnxruntime/python/tools/transformers/models/llama/README.md b/onnxruntime/python/tools/transformers/models/llama/README.md
index 6fba98c14e792..cd8a8756d681e 100644
--- a/onnxruntime/python/tools/transformers/models/llama/README.md
+++ b/onnxruntime/python/tools/transformers/models/llama/README.md
@@ -27,8 +27,6 @@ Please note the package versions needed for using LLaMA-2 in the `requirements.t
   - Note that `torch` with CUDA enabled is not installed automatically. This is because `torch` should be installed with the CUDA version used on your machine. Please visit [the PyTorch website](https://pytorch.org/get-started/locally/) to download the `torch` version that is used with the CUDA version installed on your machine and satisfies the requirement listed in the file.
 - `requirements-quant.txt`
   - For running the SmoothQuant algorithm using [Intel's Neural Compressor](https://github.com/intel/neural-compressor)
-- `requirements-70b-model.txt`
-  - For running the LLaMA-2 70B model on multiple GPUs
 - `requirements.txt`
   - Package versions needed in each of the above files
 
@@ -221,18 +219,6 @@ $ python3 -m models.llama.convert_to_onnx -m meta-llama/Llama-2-7b-hf --output l
 $ python3 -m onnxruntime.transformers.models.llama.convert_to_onnx -m meta-llama/Llama-2-7b-hf --output llama2-7b-int4-cpu --precision int4 --quantization_method blockwise --execution_provider cpu --use_gqa
 ```
 
-Export LLaMA-2 70B sharded model into 4 partitions
-```
-# From source:
-# 1. Install necessary packages from requirements-70b-model.txt
-$ pip install -r requirements-70b-model.txt
-
-# 2. Build ONNX Runtime from source with NCCL enabled. Here is a sample command:
-$ ./build.sh --config Release --use_cuda --cuda_home /usr/local/cuda-12.2 --cudnn_home /usr/local/cuda-12.2 --build_wheel --cuda_version=12.2 --parallel --skip_tests --enable_nccl --nccl_home /usr/local/cuda-12.2 --use_mpi --mpi_home=/usr/lib/x86_64-linux-gnu/
-
-# 3. Shard and export the LLaMA-2 70B model. With FP16, you will need at least 140GB of GPU memory to load the model. Therefore, you will need at least 4 40GB A100 GPUs or 2 80GB A100 GPUs to shard the PyTorch model and export each shard to ONNX. Here is an example command:
-$ CUDA_VISIBLE_DEVICES=0,1,2,3 bash convert_70b_model.sh 4 -m meta-llama/Llama-2-70b-hf --output llama2-70b-distributed --precision fp16 --execution_provider cuda --use_gqa
-```
 
 ## Parity Checking LLaMA-2
 
@@ -395,18 +381,6 @@ CUDA_VISIBLE_DEVICES=4 python3 -m models.llama.benchmark \
     --device cuda
 ```
 
-9. ONNX Runtime, FP16, convert_to_onnx, LLaMA-2 70B shard to 4 GPUs
-```
-CUDA_VISIBLE_DEVICES=4,5,6,7 bash benchmark_70b_model.sh 4 \
-    --benchmark-type ort-convert-to-onnx \
-    --ort-model-path ./llama2-70b-dis/rank_{}_Llama-2-70b-hf_decoder_merged_model_fp16.onnx \
-    --model-name meta-llama/Llama-2-70b-hf \
-    --cache-dir ./model_cache \
-    --precision fp16 \
-    --device cuda \
-    --warmup-runs 5 \
-    --num-runs 100
-```
 
 You can profile a variant by adding the `--profile` flag and providing one batch size and sequence length combination.
 
diff --git a/onnxruntime/python/tools/transformers/models/llama/benchmark_70b_model.sh b/onnxruntime/python/tools/transformers/models/llama/benchmark_70b_model.sh
deleted file mode 100644
index 38f1916456658..0000000000000
--- a/onnxruntime/python/tools/transformers/models/llama/benchmark_70b_model.sh
+++ /dev/null
@@ -1,12 +0,0 @@
-#!/bin/bash
-
-NUM_GPUS=${1:-1}
-
-MPI="mpirun --allow-run-as-root
-    -mca btl_openib_warn_no_device_params_found 0 -mca pml ob1 -mca btl ^openib -mca btl_tcp_if_include eth0
-    --tag-output --npernode $NUM_GPUS --bind-to numa
-    -x MIOPEN_FIND_MODE=1"
-
-CMD="$MPI python benchmark.py ${@:2}"
-
-$CMD
\ No newline at end of file
diff --git a/onnxruntime/python/tools/transformers/models/llama/convert_70b_model.sh b/onnxruntime/python/tools/transformers/models/llama/convert_70b_model.sh
deleted file mode 100644
index 637d15c10e0c7..0000000000000
--- a/onnxruntime/python/tools/transformers/models/llama/convert_70b_model.sh
+++ /dev/null
@@ -1,12 +0,0 @@
-#!/bin/bash
-
-NUM_GPUS=${1:-1}
-
-MPI="mpirun --allow-run-as-root
-    -mca btl_openib_warn_no_device_params_found 0 -mca pml ob1 -mca btl ^openib -mca btl_tcp_if_include eth0
-    --tag-output --npernode $NUM_GPUS --bind-to numa
-    -x MIOPEN_FIND_MODE=1"
-
-CMD="$MPI python convert_to_onnx.py ${@:2}"
-
-$CMD
\ No newline at end of file
diff --git a/onnxruntime/python/tools/transformers/models/llama/requirements-70b-model.txt b/onnxruntime/python/tools/transformers/models/llama/requirements-70b-model.txt
deleted file mode 100644
index 572cfdb71be4a..0000000000000
--- a/onnxruntime/python/tools/transformers/models/llama/requirements-70b-model.txt
+++ /dev/null
@@ -1,4 +0,0 @@
--r requirements.txt
-git+https://github.com/frankdongms/transformers.git@frdong/shard_llama
-mpi4py
-psutil
\ No newline at end of file
diff --git a/onnxruntime/test/flatbuffers/flatbuffer_utils_test.cc b/onnxruntime/test/flatbuffers/flatbuffer_utils_test.cc
index 32f2da806be3b..467c5e773589a 100644
--- a/onnxruntime/test/flatbuffers/flatbuffer_utils_test.cc
+++ b/onnxruntime/test/flatbuffers/flatbuffer_utils_test.cc
@@ -12,7 +12,6 @@
 #include "core/graph/graph_flatbuffers_utils.h"
 #include "core/framework/tensorprotoutils.h"
 #include "core/providers/cpu/cpu_execution_provider.h"
-
 #include "test/flatbuffers/flatbuffers_utils_test.fbs.h"
 
 #include "test/util/include/asserts.h"
@@ -116,6 +115,10 @@ ONNX_NAMESPACE::TensorProto CreateInitializer(const std::string& name,
       ORT_THROW("Unsupported data type: ", data_type);
   }
 
+  if constexpr (endian::native != endian::little) {
+    utils::ConvertRawDataInTensorProto(&tp);
+  }
+
   return tp;
 }
 
@@ -258,6 +261,9 @@ TEST(FlatbufferUtilsTest, ExternalWriteReadWithLoadInitializers) {
   for (const auto* fbs_tensor : *fbs_tensors2) {
     ONNX_NAMESPACE::TensorProto initializer;
     ASSERT_STATUS_OK(LoadInitializerOrtFormat(*fbs_tensor, initializer, options, reader));
+    if constexpr (endian::native != endian::little) {
+      utils::ConvertRawDataInTensorProto(&initializer);
+    }
     loaded_initializers.emplace_back(std::move(initializer));
     // also check that the loaded flatbuffer tensors have accurately written to the external_data_offset field
     if (fbs_tensor->data_type() != fbs::TensorDataType::STRING && fbs_tensor->name()->str() != "tensor_32_small") {
diff --git a/onnxruntime/test/framework/sparse_kernels_test.cc b/onnxruntime/test/framework/sparse_kernels_test.cc
index fa42bb6e96cd5..7bd6b47f52b7d 100644
--- a/onnxruntime/test/framework/sparse_kernels_test.cc
+++ b/onnxruntime/test/framework/sparse_kernels_test.cc
@@ -705,6 +705,9 @@ struct InsertIndices {
       // Conversion on the fly to the target data type
       std::vector<T> indices(indices_data.cbegin(), indices_data.cend());
       indices_tp.mutable_raw_data()->assign(reinterpret_cast<const char*>(indices.data()), indices.size() * sizeof(T));
+      if constexpr (endian::native != endian::little) {
+        utils::ConvertRawDataInTensorProto((ONNX_NAMESPACE::TensorProto*)&indices_tp);
+      }
     }
   }
 };
@@ -837,7 +840,7 @@ static void TestConversion(
 template <typename T>
 static void RawDataWriter(const std::vector<T>& values, TensorProto& tp, TensorProto_DataType datatype) {
   tp.set_data_type(datatype);
-  tp.set_raw_data(values.data(), values.size() * sizeof(T));
+  utils::SetRawDataInTensorProto(tp, values.data(), values.size() * sizeof(T));
 }
 
 int64_t ActualSize(const TensorProto& actual) {
diff --git a/onnxruntime/test/framework/tensorutils_test.cc b/onnxruntime/test/framework/tensorutils_test.cc
index 05bdb3a9a033d..6821f582ce2de 100644
--- a/onnxruntime/test/framework/tensorutils_test.cc
+++ b/onnxruntime/test/framework/tensorutils_test.cc
@@ -30,7 +30,7 @@ void TestUnpackFloatTensor(TensorProto_DataType type, const std::filesystem::pat
   for (int i = 0; i < 4; ++i) {
     memcpy(rawdata + i * sizeof(T), &(f[i]), sizeof(T));
   }
-  float_tensor_proto.set_raw_data(rawdata, len);
+  utils::SetRawDataInTensorProto(float_tensor_proto, rawdata, len);
   T float_data2[4];
   auto status = UnpackTensor(float_tensor_proto, model_path, float_data2, 4);
   EXPECT_TRUE(status.IsOK()) << status.ErrorMessage();
@@ -102,8 +102,25 @@ std::vector<BFloat16> CreateValues<BFloat16>() {
   return {BFloat16(0.f), BFloat16(1.f), BFloat16(2.f), BFloat16(3.f)};
 }
 
+template <typename T>
+void ConvertEndianessForVector(const std::vector<T>& test_data) {
+  const size_t element_size = sizeof(T);
+  const size_t num_elements = test_data.size();
+  char* bytes = reinterpret_cast<char*>(const_cast<T*>(test_data.data()));
+  for (size_t i = 0; i < num_elements; ++i) {
+    char* start_byte = bytes + i * element_size;
+    char* end_byte = start_byte + element_size - 1;
+    for (size_t count = 0; count < element_size / 2; ++count) {
+      std::swap(*start_byte++, *end_byte--);
+    }
+  }
+}
+
 template <typename T>
 void WriteDataToFile(FILE* fp, const std::vector<T>& test_data) {
+  if constexpr (endian::native != endian::little) {
+    ConvertEndianessForVector(test_data);
+  }
   size_t size_in_bytes = test_data.size() * sizeof(T);
   ASSERT_EQ(size_in_bytes, fwrite(test_data.data(), 1, size_in_bytes, fp));
 }
@@ -147,6 +164,9 @@ void UnpackAndValidate(const TensorProto& tensor_proto, const std::filesystem::p
   std::vector<T> val(test_data.size());
   auto st = utils::UnpackTensor(tensor_proto, model_path, val.data(), test_data.size());
   ASSERT_TRUE(st.IsOK()) << st.ErrorMessage();
+  if constexpr (endian::native != endian::little) {
+    ConvertEndianessForVector(val);
+  }
 
   // Validate data
   for (size_t i = 0; i < test_data.size(); i++) {
@@ -325,6 +345,9 @@ static void TestConstantNodeConversionWithExternalData(TensorProto_DataType type
   std::vector<T> val(test_data.size());
   auto st = utils::UnpackTensor(tp, model_path, val.data(), test_data.size());
   ASSERT_TRUE(st.IsOK()) << st.ErrorMessage();
+  if constexpr (endian::native != endian::little) {
+    ConvertEndianessForVector(val);
+  }
   for (size_t i = 0; i < test_data.size(); i++) {
     ASSERT_EQ(val[i], test_data[i]);
   }
diff --git a/onnxruntime/test/framework/test_tensor_loader.cc b/onnxruntime/test/framework/test_tensor_loader.cc
index 17edad73085c9..73bf351b6c556 100644
--- a/onnxruntime/test/framework/test_tensor_loader.cc
+++ b/onnxruntime/test/framework/test_tensor_loader.cc
@@ -104,6 +104,18 @@ static void run_external_data_test() {
   std::unique_ptr<ORTCHAR_T, decltype(&DeleteFileFromDisk)> file_deleter(const_cast<ORTCHAR_T*>(filename.c_str()),
                                                                          DeleteFileFromDisk);
   float test_data[] = {1.0f, 2.2f, 3.5f};
+  if constexpr (endian::native != endian::little) {
+    const int element_size = sizeof(float);
+    char* bytes = reinterpret_cast<char*>(test_data);
+    const size_t num_elements = std::size(test_data);
+    for (size_t i = 0; i < num_elements; ++i) {
+      char* start_byte = bytes + i * element_size;
+      char* end_byte = start_byte + element_size - 1;
+      for (size_t count = 0; count < element_size / 2; ++count) {
+        std::swap(*start_byte++, *end_byte--);
+      }
+    }
+  }
   ASSERT_EQ(sizeof(test_data), fwrite(test_data, 1, sizeof(test_data), fp));
   ASSERT_EQ(0, fclose(fp));
   // construct a tensor proto
@@ -128,8 +140,12 @@ static void run_external_data_test() {
     len = GetCurrentDirectoryW(len, (ORTCHAR_T*)cwd.data());
     ASSERT_NE(len, (DWORD)0);
     cwd.append(ORT_TSTR("\\fake.onnx"));
+#else
+#if defined(_AIX)
+    char* p = getcwd(nullptr, PATH_MAX);
 #else
     char* p = getcwd(nullptr, 0);
+#endif
     ASSERT_NE(p, nullptr);
     cwd = p;
     free(p);
diff --git a/onnxruntime/test/onnx/main.cc b/onnxruntime/test/onnx/main.cc
index fc29756a1ff98..9886d98dcc6d6 100644
--- a/onnxruntime/test/onnx/main.cc
+++ b/onnxruntime/test/onnx/main.cc
@@ -8,6 +8,8 @@
 #include <unordered_map>
 #ifdef _WIN32
 #include "getopt.h"
+#elif defined(_AIX)
+#include <thread>
 #else
 #include <getopt.h>
 #include <thread>
diff --git a/onnxruntime/test/onnx/tensorprotoutils.cc b/onnxruntime/test/onnx/tensorprotoutils.cc
index 5df055f862a86..50ab2290c6456 100644
--- a/onnxruntime/test/onnx/tensorprotoutils.cc
+++ b/onnxruntime/test/onnx/tensorprotoutils.cc
@@ -6,6 +6,7 @@
 #include <memory>
 #include <algorithm>
 #include <limits>
+#include <utility>
 
 #include "mem_buffer.h"
 #include "core/common/safeint.h"
@@ -68,11 +69,22 @@ static void UnpackTensorWithRawData(const void* raw_data, size_t raw_data_length
     ORT_CXX_API_THROW(MakeString("UnpackTensor: the pre-allocated size does not match the raw data size, expected ",
                                  expected_size_in_bytes, ", got ", raw_data_length),
                       OrtErrorCode::ORT_FAIL);
+  memcpy(p_data, raw_data, raw_data_length);
   if constexpr (endian::native != endian::little) {
-    ORT_CXX_API_THROW("UnpackTensorWithRawData only handles little-endian native byte order for now.",
-                      OrtErrorCode::ORT_NOT_IMPLEMENTED);
+    /* Convert Endianness */
+    char* bytes = reinterpret_cast<char*>(p_data);
+    size_t element_size = sizeof(T);
+    size_t num_elements = raw_data_length / element_size;
+
+    for (size_t i = 0; i < num_elements; ++i) {
+      char* start_byte = bytes + i * element_size;
+      char* end_byte = start_byte + element_size - 1;
+      /* keep swapping */
+      for (size_t count = 0; count < element_size / 2; ++count) {
+        std::swap(*start_byte++, *end_byte--);
+      }
+    }
   }
-  memcpy(p_data, raw_data, raw_data_length);
 }
 
 template <>
diff --git a/onnxruntime/test/optimizer/graph_transform_test.cc b/onnxruntime/test/optimizer/graph_transform_test.cc
index 2bfa57a2ceb9e..3e4e845440117 100755
--- a/onnxruntime/test/optimizer/graph_transform_test.cc
+++ b/onnxruntime/test/optimizer/graph_transform_test.cc
@@ -4972,8 +4972,8 @@ TEST_F(GraphTransformationTests, CseWithConstantOfShape) {
     TensorProto value_tensor;
     value_tensor.add_dims(1);
     float value = 2.333f;
-    value_tensor.set_raw_data(reinterpret_cast<const char*>(&value), sizeof(float));
     value_tensor.set_data_type(ONNX_NAMESPACE::TensorProto_DataType_FLOAT);
+    utils::SetRawDataInTensorProto(value_tensor, reinterpret_cast<const char*>(&value), sizeof(float));
     builder.AddNode("ConstantOfShape", {shape_out_1}, {constant_of_shape_out_1}).AddAttribute("value", value_tensor);
     builder.AddNode("ConstantOfShape", {shape_out_2}, {constant_of_shape_out_2}).AddAttribute("value", value_tensor);
     builder.AddNode("Mul", {input_arg, constant_of_shape_out_1}, {mul_out_1});
diff --git a/onnxruntime/test/optimizer/graph_transform_test_builder.cc b/onnxruntime/test/optimizer/graph_transform_test_builder.cc
index 73c8b3f119103..2cbfbbb317642 100644
--- a/onnxruntime/test/optimizer/graph_transform_test_builder.cc
+++ b/onnxruntime/test/optimizer/graph_transform_test_builder.cc
@@ -61,7 +61,7 @@ NodeArg* ModelTestBuilder::MakeInitializer(gsl::span<const int64_t> shape,
   ONNX_NAMESPACE::TensorProto tensor_proto;
   tensor_proto.set_name(name);
   tensor_proto.set_data_type(elem_type);
-  tensor_proto.set_raw_data(raw_data.data(), raw_data.size());
+  utils::SetRawDataInTensorProto(tensor_proto, raw_data.data(), raw_data.size());
 
   for (auto& dim : shape) {
     tensor_proto.add_dims(dim);
diff --git a/onnxruntime/test/optimizer/graph_transform_test_builder.h b/onnxruntime/test/optimizer/graph_transform_test_builder.h
index 0282d09f340b2..6214094a26c4f 100644
--- a/onnxruntime/test/optimizer/graph_transform_test_builder.h
+++ b/onnxruntime/test/optimizer/graph_transform_test_builder.h
@@ -13,6 +13,7 @@
 #include "core/framework/int4.h"
 #include "core/optimizer/graph_transformer_level.h"
 #include "core/graph/onnx_protobuf.h"
+#include "core/framework/tensorprotoutils.h"
 #include "test/framework/test_utils.h"
 #include "test/common/tensor_op_test_utils.h"
 #include "test/framework/test_utils.h"
@@ -249,7 +250,7 @@ class ModelTestBuilder {
     tensor_proto.set_data_type(utils::ToTensorProtoElementType<bool>());
     std::unique_ptr<bool[]> data_buffer = std::make_unique<bool[]>(data.size());
     for (size_t i = 0; i < data.size(); ++i) data_buffer[i] = data[i];
-    tensor_proto.set_raw_data(data_buffer.get(), data.size());
+    utils::SetRawDataInTensorProto(tensor_proto, data_buffer.get(), data.size());
 
     for (auto& dim : shape) {
       tensor_proto.add_dims(dim);
diff --git a/onnxruntime/test/optimizer/initializer_test.cc b/onnxruntime/test/optimizer/initializer_test.cc
index 522e96e762d5a..391942acfca35 100644
--- a/onnxruntime/test/optimizer/initializer_test.cc
+++ b/onnxruntime/test/optimizer/initializer_test.cc
@@ -163,8 +163,8 @@ void TestInitializerRawData() {
   tensor_proto.set_name("OptimizerInitializerTest_RawData");
   tensor_proto.add_dims(3);
   tensor_proto.add_dims(4);
-  tensor_proto.set_raw_data(data.data(), data.size() * sizeof(T));
 
+  utils::SetRawDataInTensorProto(tensor_proto, data.data(), data.size() * sizeof(T));
   const Initializer init(tensor_proto, std::filesystem::path());
 
   for (size_t idx = 0; idx < data.size(); idx++) {
diff --git a/onnxruntime/test/optimizer/nchwc_optimizer_test.cc b/onnxruntime/test/optimizer/nchwc_optimizer_test.cc
index 8e4edc9e0abbb..538f60040418c 100644
--- a/onnxruntime/test/optimizer/nchwc_optimizer_test.cc
+++ b/onnxruntime/test/optimizer/nchwc_optimizer_test.cc
@@ -6,6 +6,7 @@
 #include "core/mlas/inc/mlas.h"
 #include "core/session/environment.h"
 #include "core/session/inference_session.h"
+#include "core/framework/tensorprotoutils.h"
 #include "test/compare_ortvalue.h"
 #include "test/test_environment.h"
 #include "test/framework/test_utils.h"
@@ -62,7 +63,7 @@ struct NchwcTestHelper {
     ONNX_NAMESPACE::TensorProto tensor_proto;
     tensor_proto.set_name(name);
     tensor_proto.set_data_type(utils::ToTensorProtoElementType<T>());
-    tensor_proto.set_raw_data(data.data(), data.size() * sizeof(T));
+    utils::SetRawDataInTensorProto(tensor_proto, data.data(), data.size() * sizeof(T));
 
     for (auto& dim : shape) {
       tensor_proto.add_dims(dim);
diff --git a/onnxruntime/test/providers/base_tester.cc b/onnxruntime/test/providers/base_tester.cc
index 1db8616c85daa..01de15e6f8ec8 100644
--- a/onnxruntime/test/providers/base_tester.cc
+++ b/onnxruntime/test/providers/base_tester.cc
@@ -73,7 +73,7 @@ void BaseTester::AddInitializers(onnxruntime::Graph& graph) {
       }
     } else {
       auto buffer_size = tensor.DataType()->Size() * shape.Size();
-      tensor_proto.set_raw_data(tensor.DataRaw(), buffer_size);
+      utils::SetRawDataInTensorProto(tensor_proto, tensor.DataRaw(), buffer_size);
     }
 
     // 4. name
diff --git a/onnxruntime/test/providers/cpu/generator/random_test.cc b/onnxruntime/test/providers/cpu/generator/random_test.cc
index be049d1cf0ce3..ec9b1614488a7 100644
--- a/onnxruntime/test/providers/cpu/generator/random_test.cc
+++ b/onnxruntime/test/providers/cpu/generator/random_test.cc
@@ -256,7 +256,7 @@ TEST(Random, MultinomialGoodCase) {
   const std::vector<int64_t> output_dims{batch_size, num_samples};
 #ifdef _WIN32
   const std::vector<int64_t> expected_output{2, 0, 0, 2, 2, 2, 0, 2, 2, 1, 1, 2, 1, 1, 1, 1, 2, 1, 2, 0};
-#elif defined(__MACH__) || defined(__ANDROID__) || defined(__FreeBSD__) || defined(__wasm__)
+#elif defined(__MACH__) || defined(__ANDROID__) || defined(__FreeBSD__) || defined(__wasm__) || defined(_AIX)
   const std::vector<int64_t> expected_output{1, 1, 2, 2, 0, 2, 2, 2, 0, 2, 1, 1, 2, 0, 2, 2, 0, 2, 1, 1};
 #else
   const std::vector<int64_t> expected_output{2, 0, 0, 1, 0, 1, 2, 0, 1, 0, 0, 1, 1, 0, 1, 0, 2, 0, 2, 0};
@@ -294,7 +294,7 @@ TEST(Random, MultinomialDefaultDType) {
 #ifdef _WIN32
   const std::vector<int32_t> expected_output_1{2, 0, 0, 2, 2, 2, 0, 2, 2, 1, 1, 2, 1, 1, 1, 1, 2, 1, 2, 0};
   const std::vector<int32_t> expected_output_2{0, 0, 1, 0, 2, 2, 2, 0, 2, 1, 2, 1, 0, 2, 0, 2, 2, 1, 2, 1};
-#elif defined(__MACH__) || defined(__ANDROID__) || defined(__FreeBSD__) || defined(__wasm__)
+#elif defined(__MACH__) || defined(__ANDROID__) || defined(__FreeBSD__) || defined(__wasm__) || defined(_AIX)
   const std::vector<int32_t> expected_output_1{1, 1, 2, 2, 0, 2, 2, 2, 0, 2, 1, 1, 2, 0, 2, 2, 0, 2, 1, 1};
   const std::vector<int32_t> expected_output_2{1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 2, 0, 1, 1, 0, 2, 2, 2, 1};
 #else
diff --git a/onnxruntime/test/providers/cpu/tensor/isinf_test.cc b/onnxruntime/test/providers/cpu/tensor/isinf_test.cc
index bd97306142f18..4fc2e6c7c909b 100644
--- a/onnxruntime/test/providers/cpu/tensor/isinf_test.cc
+++ b/onnxruntime/test/providers/cpu/tensor/isinf_test.cc
@@ -18,13 +18,17 @@ constexpr double DOUBLE_NINF = -std::numeric_limits<double>::infinity();
 constexpr double DOUBLE_NAN = std::numeric_limits<double>::quiet_NaN();
 
 template <typename T>
-void run_is_inf_test(int opset, int64_t detect_positive, int64_t detect_negative, const std::initializer_list<T>& input, const std::initializer_list<bool>& output) {
+void run_is_inf_test(int opset, int64_t detect_positive, int64_t detect_negative, const std::initializer_list<T>& input, const std::initializer_list<bool>& output, bool skip_trt = false) {
   OpTester test("IsInf", opset);
   test.AddAttribute<int64_t>("detect_positive", detect_positive);
   test.AddAttribute<int64_t>("detect_negative", detect_negative);
   test.AddInput<T>("X", {onnxruntime::narrow<int64_t>(input.size())}, input);
   test.AddOutput<bool>("Y", {onnxruntime::narrow<int64_t>(output.size())}, output);
-  test.Run();
+  if (skip_trt) {
+    test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider});
+  } else {
+    test.Run();
+  }
 }
 
 TEST(IsInfTest, test_isinf_float10) {
@@ -124,7 +128,7 @@ TEST(IsInfTest, test_isinf_bfloat16) {
   std::initializer_list<BFloat16> input = {BFloat16{-1.7f}, BFloat16::NaN, BFloat16::Infinity, 3.6_bfp16,
                                            BFloat16::NegativeInfinity, BFloat16::Infinity};
   std::initializer_list<bool> output = {false, false, true, false, true, true};
-  run_is_inf_test(20, 1, 1, input, output);
+  run_is_inf_test(20, 1, 1, input, output, true);  // Skip as TRT10 supports BF16 but T4 GPU run on TRT CIs doesn't
 }
 
 TEST(IsInfTest, test_isinf_positive_bfloat16) {
@@ -146,7 +150,7 @@ TEST(IsInfTest, test_Float8E4M3FN) {
   std::initializer_list<Float8E4M3FN> input = {
       Float8E4M3FN(-1.0f), Float8E4M3FN(FLOAT_NAN, false), Float8E4M3FN(1.0f), Float8E4M3FN(FLOAT_NINF, false), Float8E4M3FN(FLOAT_NINF, false), Float8E4M3FN(FLOAT_INF, false)};
   std::initializer_list<bool> output = {false, false, false, false, false, false};
-  run_is_inf_test(20, 1, 1, input, output);
+  run_is_inf_test(20, 1, 1, input, output, true);  // Skip as TRT10.1 supports Float8 but T4 GPU run on TRT CIs doesn't
 }
 
 TEST(IsInfTest, test_Float8E4M3FNUZ) {
diff --git a/onnxruntime/test/providers/cuda/test_cases/blkq4_fp16_gemm_sm80.h b/onnxruntime/test/providers/cuda/test_cases/blkq4_fp16_gemm_sm80.h
index fa1c739c04e3a..f96c8ce9ce729 100644
--- a/onnxruntime/test/providers/cuda/test_cases/blkq4_fp16_gemm_sm80.h
+++ b/onnxruntime/test/providers/cuda/test_cases/blkq4_fp16_gemm_sm80.h
@@ -13,7 +13,7 @@
  */
 
 #pragma once
-
+#if defined(CUDA_VERSION) && CUDA_VERSION <= 12030
 #include "test/cuda_host/blkq4_fp16_quant_sm80.h"
 
 #include <random>
@@ -197,3 +197,4 @@ void run_blkq4_small_gemm(int m, int n, int k);
 }  // namespace test
 }  // namespace cuda
 }  // namespace onnxruntime
+#endif
diff --git a/onnxruntime/test/providers/cuda/test_cases/blkq4_fp16_gemm_sm80_test.cc b/onnxruntime/test/providers/cuda/test_cases/blkq4_fp16_gemm_sm80_test.cc
index b95e093e41eab..3fcb9045ee7e6 100644
--- a/onnxruntime/test/providers/cuda/test_cases/blkq4_fp16_gemm_sm80_test.cc
+++ b/onnxruntime/test/providers/cuda/test_cases/blkq4_fp16_gemm_sm80_test.cc
@@ -10,7 +10,7 @@
  *   This part requires gtest header files, which do not play
  *   well with CUTLASS headers.
  */
-
+#if defined(CUDA_VERSION) && CUDA_VERSION <= 12030
 #include "blkq4_fp16_gemm_sm80.h"
 
 #include "gtest/gtest.h"
@@ -341,3 +341,4 @@ TEST(BlkQ4_GEMM, Sm80SmallTileKernelTest) {
 
 }  // namespace test
 }  // namespace onnxruntime
+#endif
diff --git a/onnxruntime/test/providers/cuda/test_cases/blkq4_fp16_gemm_sm80_testcu.cu b/onnxruntime/test/providers/cuda/test_cases/blkq4_fp16_gemm_sm80_testcu.cu
index f5600ca9885a3..8b27c3d8c3aed 100644
--- a/onnxruntime/test/providers/cuda/test_cases/blkq4_fp16_gemm_sm80_testcu.cu
+++ b/onnxruntime/test/providers/cuda/test_cases/blkq4_fp16_gemm_sm80_testcu.cu
@@ -11,6 +11,9 @@
  *   well with gtest headers.
  */
 
+// This test has build error with cuda 12.5
+#if defined(CUDA_VERSION) && CUDA_VERSION <= 12030
+
 #include "blkq4_fp16_gemm_sm80.h"
 
 #include <random>
@@ -532,3 +535,5 @@ template void run_blkq4_small_gemm<128, false, false>(int m, int n, int k);
 }  // namespace test
 }  // namespace cuda
 }  // namespace onnxruntime
+
+#endif
diff --git a/onnxruntime/test/python/quantization/test_op_matmul_4bits.py b/onnxruntime/test/python/quantization/test_op_matmul_4bits.py
index 88e5052db4e2e..4cc8a0c151d14 100644
--- a/onnxruntime/test/python/quantization/test_op_matmul_4bits.py
+++ b/onnxruntime/test/python/quantization/test_op_matmul_4bits.py
@@ -14,7 +14,7 @@
 import numpy as np
 import onnx
 from onnx import TensorProto, helper
-from op_test_utils import TestDataFeeds, check_model_correctness, check_op_type_count
+from op_test_utils import TestDataFeeds, check_model_correctness, check_op_type_count, check_qtype_by_node_type
 
 from onnxruntime.quantization import quant_utils
 
@@ -105,8 +105,9 @@ def make_matmul(
             [output_tensor],
             initializer=initializers,
         )
-        model = helper.make_model(graph, opset_imports=[helper.make_opsetid("", 13)])
-        model.ir_version = 7  # use stable onnx ir version
+        # blocked quantization requires DQ op set >= 21
+        model = helper.make_model(graph, opset_imports=[helper.make_opsetid("", 21)])
+        model.ir_version = 10  # use stable onnx ir version
 
         onnx.save(model, output_model_path)
 
@@ -116,9 +117,12 @@ def quant_test(
         data_reader: TestDataFeeds,
         block_size: int,
         is_symmetric: bool,
+        quant_format: quant_utils.QuantFormat = quant_utils.QuantFormat.QOperator,
     ):
+        use_qdq = quant_format == quant_utils.QuantFormat.QDQ
+        name_prefix = "DQ_MatMul" if use_qdq else "MatMulNBits"
         model_int4_path = str(
-            Path(self._tmp_model_dir.name).joinpath(f"MatMulNBits_{block_size}_{is_symmetric}.onnx").absolute()
+            Path(self._tmp_model_dir.name).joinpath(f"{name_prefix}_{block_size}_{is_symmetric}.onnx").absolute()
         )
 
         # Quantize fp32 model to int4 model
@@ -126,15 +130,33 @@ def quant_test(
 
         model = quant_utils.load_model_with_shape_infer(Path(model_fp32_path))
         quant_config = matmul_4bits_quantizer.DefaultWeightOnlyQuantConfig(
-            block_size=block_size, is_symmetric=is_symmetric
+            block_size=block_size, is_symmetric=is_symmetric, quant_format=quant_format
         )
         quant = matmul_4bits_quantizer.MatMul4BitsQuantizer(model, algo_config=quant_config)
         quant.process()
         quant.model.save_model_to_file(model_int4_path, False)
 
-        quant_nodes = {"MatMulNBits": 1}
+        quant_nodes = {"DequantizeLinear": 1, "MatMul": 1} if use_qdq else {"MatMulNBits": 1}
         check_op_type_count(self, model_int4_path, **quant_nodes)
 
+        if use_qdq:
+            dq_qtype = TensorProto.INT4 if is_symmetric else TensorProto.UINT4
+            dqnode_io_qtypes = (
+                {
+                    "DequantizeLinear": [
+                        ["i", 0, dq_qtype],
+                    ]
+                }
+                if is_symmetric
+                else {
+                    "DequantizeLinear": [
+                        ["i", 0, dq_qtype],
+                        ["i", 2, dq_qtype],
+                    ]
+                }
+            )
+            check_qtype_by_node_type(self, model_int4_path, dqnode_io_qtypes)
+
         data_reader.rewind()
 
         try:
@@ -211,6 +233,26 @@ def test_quantize_matmul_int4_offsets(self):
         data_reader = self.input_feeds(1, {"input": [100, 52]})
         self.quant_test(model_fp32_path, data_reader, 32, False)
 
+    @unittest.skipIf(
+        find_spec("onnxruntime.training"), "Skip because training package doesn't has quantize_matmul_4bits"
+    )
+    def test_quantize_matmul_int4_symmetric_qdq(self):
+        np.random.seed(13)
+
+        model_fp32_path = str(Path(self._tmp_model_dir.name).joinpath("matmul_fp32_symmetric.onnx").absolute())
+        self.construct_model_matmul(model_fp32_path, symmetric=True)
+        data_reader = self.input_feeds(1, {"input": [100, 52]})
+        self.quant_test(model_fp32_path, data_reader, 32, True, quant_utils.QuantFormat.QDQ)
+
+    @unittest.skipIf(
+        find_spec("onnxruntime.training"), "Skip because training package doesn't has quantize_matmul_4bits"
+    )
+    def test_quantize_matmul_int4_offsets_qdq(self):
+        model_fp32_path = str(Path(self._tmp_model_dir.name).joinpath("matmul_fp32_offset.onnx").absolute())
+        self.construct_model_matmul(model_fp32_path, symmetric=False)
+        data_reader = self.input_feeds(1, {"input": [100, 52]})
+        self.quant_test(model_fp32_path, data_reader, 32, False, quant_utils.QuantFormat.QDQ)
+
     @unittest.skipIf(
         find_spec("onnxruntime.training"), "Skip because training package doesn't has quantize_matmul_4bits"
     )
diff --git a/onnxruntime/test/python/transformers/test_flash_attn_rocm.py b/onnxruntime/test/python/transformers/test_flash_attn_rocm.py
index fe7e39722237f..880f4175e00b7 100644
--- a/onnxruntime/test/python/transformers/test_flash_attn_rocm.py
+++ b/onnxruntime/test/python/transformers/test_flash_attn_rocm.py
@@ -35,8 +35,8 @@ def test_gqa_no_past_flash_attention(self, _, config, local, rotary, rotary_inte
             rotary=rotary,
             rotary_interleaved=rotary_interleaved,
             packed=packed,
-            rtol=0.002,
-            atol=0.002,
+            rtol=0.001,
+            atol=0.005,
         )
         parity_check_gqa_prompt_no_buff(
             config,
@@ -45,8 +45,8 @@ def test_gqa_no_past_flash_attention(self, _, config, local, rotary, rotary_inte
             rotary=rotary,
             rotary_interleaved=rotary_interleaved,
             packed=packed,
-            rtol=0.002,
-            atol=0.002,
+            rtol=0.001,
+            atol=0.005,
         )
 
     @parameterized.expand(gqa_past_flash_attention_test_cases())
@@ -67,8 +67,8 @@ def test_gqa_past_flash_attention(self, _, config, local, rotary, rotary_interle
             rotary=rotary,
             rotary_interleaved=rotary_interleaved,
             packed=packed,
-            rtol=0.002,
-            atol=0.002,
+            rtol=0.001,
+            atol=0.005,
         )
         parity_check_gqa_past_no_buff(
             config,
@@ -77,8 +77,8 @@ def test_gqa_past_flash_attention(self, _, config, local, rotary, rotary_interle
             rotary=rotary,
             rotary_interleaved=rotary_interleaved,
             packed=packed,
-            rtol=0.002,
-            atol=0.002,
+            rtol=0.001,
+            atol=0.005,
         )
 
 
diff --git a/orttraining/tools/ci_test/results/ci-mi200.huggingface.bert-large-rocm6.1.json b/orttraining/tools/ci_test/results/ci-mi200.huggingface.bert-large-rocm6.1.json
new file mode 100644
index 0000000000000..05fcf08cd3232
--- /dev/null
+++ b/orttraining/tools/ci_test/results/ci-mi200.huggingface.bert-large-rocm6.1.json
@@ -0,0 +1,57 @@
+{
+    "steps": [
+        {
+            "step": 20,
+            "loss": 2.0136
+        },
+        {
+            "step": 40,
+            "loss": 1.8466
+        },
+        {
+            "step": 60,
+            "loss": 1.7525
+        },
+        {
+            "step": 80,
+            "loss": 1.6682
+        },
+        {
+            "step": 100,
+            "loss": 1.658
+        },
+        {
+            "step": 120,
+            "loss": 1.6749
+        },
+        {
+            "step": 140,
+            "loss": 1.6263
+        },
+        {
+            "step": 160,
+            "loss": 1.6828
+        },
+        {
+            "step": 180,
+            "loss": 1.6145
+        },
+        {
+            "step": 200,
+            "loss": 1.6197
+        },
+        {
+            "step": 220,
+            "loss": 1.6353
+        },
+        {
+            "step": 240,
+            "loss": 1.5266
+        },
+        {
+            "step": 260,
+            "loss": 1.5441
+        }
+    ],
+    "samples_per_second": 34.561
+}
diff --git a/tools/ci_build/github/apple/coreml_supported_mlprogram_ops.md b/tools/ci_build/github/apple/coreml_supported_mlprogram_ops.md
index 5b1cd5319acf1..3b3790ba06599 100644
--- a/tools/ci_build/github/apple/coreml_supported_mlprogram_ops.md
+++ b/tools/ci_build/github/apple/coreml_supported_mlprogram_ops.md
@@ -21,3 +21,4 @@ Keep in sync with doco generated from /docs/execution-providers/CoreML-Execution
 |ai.onnx:Sub||
 |ai.onnx:Sigmoid||
 |ai:onnx:Tanh||
+|ai:onnx:Transpose||
diff --git a/tools/ci_build/github/azure-pipelines/bigmodels-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/bigmodels-ci-pipeline.yml
index 41b3c47ba0396..a66828ee5e188 100644
--- a/tools/ci_build/github/azure-pipelines/bigmodels-ci-pipeline.yml
+++ b/tools/ci_build/github/azure-pipelines/bigmodels-ci-pipeline.yml
@@ -43,7 +43,7 @@ variables:
   - name: docker_base_image
     value: onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda11_x64_almalinux8_gcc11:20240531.1
   - name: linux_trt_version
-    value: 10.0.1.6-1.cuda11.8
+    value: 10.2.0.19-1.cuda11.8
   - name: Repository
     value: 'onnxruntimecuda11manylinuxbuild'
 
diff --git a/tools/ci_build/github/azure-pipelines/c-api-noopenmp-packaging-pipelines.yml b/tools/ci_build/github/azure-pipelines/c-api-noopenmp-packaging-pipelines.yml
index 8b386dde7d3a7..700326fe9173c 100644
--- a/tools/ci_build/github/azure-pipelines/c-api-noopenmp-packaging-pipelines.yml
+++ b/tools/ci_build/github/azure-pipelines/c-api-noopenmp-packaging-pipelines.yml
@@ -83,7 +83,7 @@ variables:
   value: 11.8
 
 - name: win_trt_home
-  value: $(Agent.TempDirectory)\TensorRT-10.0.1.6.Windows10.x86_64.cuda-11.8
+  value: $(Agent.TempDirectory)\TensorRT-10.2.0.19.Windows10.x86_64.cuda-11.8
 - name: win_cuda_home
   value: $(Agent.TempDirectory)\v11.8
 
diff --git a/tools/ci_build/github/azure-pipelines/cuda-packaging-pipeline.yml b/tools/ci_build/github/azure-pipelines/cuda-packaging-pipeline.yml
index daf95af438d2b..9fd13b513e5fd 100644
--- a/tools/ci_build/github/azure-pipelines/cuda-packaging-pipeline.yml
+++ b/tools/ci_build/github/azure-pipelines/cuda-packaging-pipeline.yml
@@ -68,9 +68,9 @@ variables:
       value: nvidia/cuda:12.2.2-cudnn8-devel-ubi8
   - name: win_trt_home
     ${{ if eq(parameters.CudaVersion, '11.8') }}:
-      value: $(Agent.TempDirectory)\TensorRT-10.0.1.6.Windows10.x86_64.cuda-11.8
+      value: $(Agent.TempDirectory)\TensorRT-10.2.0.19.Windows10.x86_64.cuda-11.8
     ${{ if eq(parameters.CudaVersion, '12.2') }}:
-      value: $(Agent.TempDirectory)\TensorRT-10.0.1.6.Windows10.x86_64.cuda-12.4
+      value: $(Agent.TempDirectory)\TensorRT-10.2.0.19.Windows10.x86_64.cuda-12.5
   - name: win_cuda_home
     ${{ if eq(parameters.CudaVersion, '11.8') }}:
       value: $(Agent.TempDirectory)\v11.8
diff --git a/tools/ci_build/github/azure-pipelines/linux-gpu-tensorrt-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/linux-gpu-tensorrt-ci-pipeline.yml
index 5f63339fb0d00..3f9707ff50519 100644
--- a/tools/ci_build/github/azure-pipelines/linux-gpu-tensorrt-ci-pipeline.yml
+++ b/tools/ci_build/github/azure-pipelines/linux-gpu-tensorrt-ci-pipeline.yml
@@ -43,9 +43,9 @@ variables:
       value: onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda12_x64_ubi8_gcc12:20240610.1
   - name: linux_trt_version
     ${{ if eq(parameters.CudaVersion, '11.8') }}:
-      value: 10.0.1.6-1.cuda11.8
+      value: 10.2.0.19-1.cuda11.8
     ${{ if eq(parameters.CudaVersion, '12.2') }}:
-      value: 10.0.1.6-1.cuda12.4
+      value: 10.2.0.19-1.cuda12.5
 
 jobs:
 - job: Linux_Build
diff --git a/tools/ci_build/github/azure-pipelines/linux-migraphx-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/linux-migraphx-ci-pipeline.yml
index f36cd9cfbfca1..6bf6324252fb9 100644
--- a/tools/ci_build/github/azure-pipelines/linux-migraphx-ci-pipeline.yml
+++ b/tools/ci_build/github/azure-pipelines/linux-migraphx-ci-pipeline.yml
@@ -36,7 +36,7 @@ variables:
   - name: render
     value: 109
   - name: RocmVersion
-    value: 6.0
+    value: 6.1
   - name: RocmVersionPatchSuffix
     value: ".3"
 
diff --git a/tools/ci_build/github/azure-pipelines/nuget/templates/test_linux.yml b/tools/ci_build/github/azure-pipelines/nuget/templates/test_linux.yml
index b9a5383836447..56e9c73a10a82 100644
--- a/tools/ci_build/github/azure-pipelines/nuget/templates/test_linux.yml
+++ b/tools/ci_build/github/azure-pipelines/nuget/templates/test_linux.yml
@@ -61,7 +61,7 @@ stages:
           ${{ if eq(parameters.CudaVersion, '12.2') }}:
             DockerBuildArgs: "
             --build-arg BASEIMAGE=nvidia/cuda:12.2.2-devel-ubuntu20.04
-            --build-arg TRT_VERSION=10.0.1.6-1+cuda12.4
+            --build-arg TRT_VERSION=10.2.0.19-1+cuda12.5
             --build-arg BUILD_UID=$( id -u )
             "
           ${{ else }}:
diff --git a/tools/ci_build/github/azure-pipelines/orttraining-pai-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/orttraining-pai-ci-pipeline.yml
index 001062452644e..0e1afdcc5b8ca 100644
--- a/tools/ci_build/github/azure-pipelines/orttraining-pai-ci-pipeline.yml
+++ b/tools/ci_build/github/azure-pipelines/orttraining-pai-ci-pipeline.yml
@@ -25,7 +25,7 @@ variables:
   - name: render
     value: 109
   - name: RocmVersion
-    value: 6.0
+    value: 6.1
   - name: RocmVersionPatchSuffix
     value: ".3"
   - name: BuildConfig
diff --git a/tools/ci_build/github/azure-pipelines/post-merge-jobs.yml b/tools/ci_build/github/azure-pipelines/post-merge-jobs.yml
index f3604dba1ac9d..593d45361324e 100644
--- a/tools/ci_build/github/azure-pipelines/post-merge-jobs.yml
+++ b/tools/ci_build/github/azure-pipelines/post-merge-jobs.yml
@@ -226,7 +226,7 @@ stages:
         BuildConfig: 'RelWithDebInfo'
         EnvSetupScript: setup_env_trt.bat
         buildArch: x64
-        additionalBuildFlags: --enable_pybind --build_java --build_nodejs --use_cuda --cuda_home="$(Agent.TempDirectory)\v11.8" --enable_cuda_profiling --use_tensorrt --tensorrt_home="$(Agent.TempDirectory)\TensorRT-10.0.1.6.Windows10.x86_64.cuda-11.8" --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=86
+        additionalBuildFlags: --enable_pybind --build_java --build_nodejs --use_cuda --cuda_home="$(Agent.TempDirectory)\v11.8" --enable_cuda_profiling --use_tensorrt --tensorrt_home="$(Agent.TempDirectory)\TensorRT-10.2.0.19.Windows10.x86_64.cuda-11.8" --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=86
         msbuildPlatform: x64
         isX86: false
         job_name_suffix: x64_RelWithDebInfo
diff --git a/tools/ci_build/github/azure-pipelines/py-package-test-pipeline.yml b/tools/ci_build/github/azure-pipelines/py-package-test-pipeline.yml
index 63e70fa8e6488..d57a7585f3cff 100644
--- a/tools/ci_build/github/azure-pipelines/py-package-test-pipeline.yml
+++ b/tools/ci_build/github/azure-pipelines/py-package-test-pipeline.yml
@@ -55,7 +55,7 @@ stages:
       python_wheel_suffix: '_gpu'
       timeout: 480
       docker_base_image: onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda11_x64_almalinux8_gcc11:20240531.1
-      trt_version: '10.0.1.6-1.cuda11.8'
+      trt_version: '10.2.0.19-1.cuda11.8'
       cuda_version: '11.8'
 
 
diff --git a/tools/ci_build/github/azure-pipelines/stages/jobs/py-linux-cuda-package-test-job.yml b/tools/ci_build/github/azure-pipelines/stages/jobs/py-linux-cuda-package-test-job.yml
index b6943f9e1b77b..7dfafeb67acf8 100644
--- a/tools/ci_build/github/azure-pipelines/stages/jobs/py-linux-cuda-package-test-job.yml
+++ b/tools/ci_build/github/azure-pipelines/stages/jobs/py-linux-cuda-package-test-job.yml
@@ -49,9 +49,9 @@ jobs:
           value: onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda12_x64_ubi8_gcc12:20240610.1
       - name: linux_trt_version
         ${{ if eq(parameters.CudaVersion, '11.8') }}:
-          value: 10.0.1.6-1.cuda11.8
+          value: 10.2.0.19-1.cuda11.8
         ${{ if eq(parameters.CudaVersion, '12.2') }}:
-          value: 10.0.1.6-1.cuda12.4
+          value: 10.2.0.19-1.cuda12.5
     pool: ${{ parameters.machine_pool }}
     steps:
       - checkout: self
diff --git a/tools/ci_build/github/azure-pipelines/stages/nuget-linux-cuda-packaging-stage.yml b/tools/ci_build/github/azure-pipelines/stages/nuget-linux-cuda-packaging-stage.yml
index cca53e36ebab9..2ca5129ac6e5d 100644
--- a/tools/ci_build/github/azure-pipelines/stages/nuget-linux-cuda-packaging-stage.yml
+++ b/tools/ci_build/github/azure-pipelines/stages/nuget-linux-cuda-packaging-stage.yml
@@ -80,9 +80,9 @@ stages:
 
     - name: linux_trt_version
       ${{ if eq(parameters.CudaVersion, '11.8') }}:
-        value: 10.0.1.6-1.cuda11.8
+        value: 10.2.0.19-1.cuda11.8
       ${{ if eq(parameters.CudaVersion, '12.2') }}:
-        value: 10.0.1.6-1.cuda12.4
+        value: 10.2.0.19-1.cuda12.5
     steps:
     - checkout: self
       clean: true
@@ -149,9 +149,9 @@ stages:
         value: '12'
     - name: linux_trt_version
       ${{ if eq(parameters.CudaVersion, '11.8') }}:
-        value: 10.0.1.6-1.cuda11.8
+        value: 10.2.0.19-1.cuda11.8
       ${{ if eq(parameters.CudaVersion, '12.2') }}:
-        value: 10.0.1.6-1.cuda12.4
+        value: 10.2.0.19-1.cuda12.5
     steps:
     - checkout: self                           # due to checkout multiple repos, the root directory is $(Build.SourcesDirectory)/onnxruntime
       submodules: false
diff --git a/tools/ci_build/github/azure-pipelines/stages/py-cuda-packaging-stage.yml b/tools/ci_build/github/azure-pipelines/stages/py-cuda-packaging-stage.yml
index 01f0337be7714..dcd681bd4b915 100644
--- a/tools/ci_build/github/azure-pipelines/stages/py-cuda-packaging-stage.yml
+++ b/tools/ci_build/github/azure-pipelines/stages/py-cuda-packaging-stage.yml
@@ -65,9 +65,9 @@ stages:
           SpecificArtifact: ${{ parameters.SpecificArtifact }}
           BuildId: ${{ parameters.BuildId }}
           ${{ if eq(parameters.cuda_version, '11.8') }}:
-            EP_BUILD_FLAGS: --enable_lto --use_tensorrt --tensorrt_home=$(Agent.TempDirectory)\TensorRT-10.0.1.6.Windows10.x86_64.cuda-11.8 --cuda_home=$(Agent.TempDirectory)\v11.8  --cmake_extra_defines "CMAKE_CUDA_ARCHITECTURES=52;60;61;70;75;80"
+            EP_BUILD_FLAGS: --enable_lto --use_tensorrt --tensorrt_home=$(Agent.TempDirectory)\TensorRT-10.2.0.19.Windows10.x86_64.cuda-11.8 --cuda_home=$(Agent.TempDirectory)\v11.8  --cmake_extra_defines "CMAKE_CUDA_ARCHITECTURES=52;60;61;70;75;80"
           ${{ if eq(parameters.cuda_version, '12.2') }}:
-            EP_BUILD_FLAGS: --enable_lto --use_tensorrt --tensorrt_home=$(Agent.TempDirectory)\TensorRT-10.0.1.6.Windows10.x86_64.cuda-12.4 --cuda_home=$(Agent.TempDirectory)\v12.2  --cmake_extra_defines "CMAKE_CUDA_ARCHITECTURES=52;60;61;70;75;80"
+            EP_BUILD_FLAGS: --enable_lto --use_tensorrt --tensorrt_home=$(Agent.TempDirectory)\TensorRT-10.2.0.19.Windows10.x86_64.cuda-12.5 --cuda_home=$(Agent.TempDirectory)\v12.2  --cmake_extra_defines "CMAKE_CUDA_ARCHITECTURES=52;60;61;70;75;80"
 
   - ${{ if eq(parameters.enable_linux_gpu, true) }}:
       - template: ../templates/py-linux-gpu.yml
@@ -79,7 +79,7 @@ stages:
           cuda_version: ${{ parameters.cuda_version }}
           ${{ if eq(parameters.cuda_version, '11.8') }}:
             docker_base_image: onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda11_x64_almalinux8_gcc11:20240531.1
-            trt_version: 10.0.1.6-1.cuda11.8
+            trt_version: 10.2.0.19-1.cuda11.8
           ${{ if eq(parameters.cuda_version, '12.2') }}:
             docker_base_image: onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda12_x64_ubi8_gcc12:20240610.1
-            trt_version: 10.0.1.6-1.cuda12.4
+            trt_version: 10.2.0.19-1.cuda12.5
diff --git a/tools/ci_build/github/azure-pipelines/templates/jobs/download_win_gpu_library.yml b/tools/ci_build/github/azure-pipelines/templates/jobs/download_win_gpu_library.yml
index 0dd9ffd5282e7..de29a3de9fded 100644
--- a/tools/ci_build/github/azure-pipelines/templates/jobs/download_win_gpu_library.yml
+++ b/tools/ci_build/github/azure-pipelines/templates/jobs/download_win_gpu_library.yml
@@ -13,10 +13,10 @@ parameters:
       - 12.2
   - name: TrtVersion
     type: string
-    default: '10.0.1.6'
+    default: '10.2.0.19'
     values:
       - 8.6.1.6
-      - 10.0.1.6
+      - 10.2.0.19
 
 steps:
   - ${{ if eq(parameters.DownloadCUDA, true) }}:
@@ -42,9 +42,9 @@ steps:
         - powershell: |
             Write-Host "##vso[task.setvariable variable=trtCudaVersion;]12.0"
           displayName: Set trtCudaVersion
-    - ${{ if and(eq(parameters.CudaVersion, '12.2'), eq(parameters.TrtVersion, '10.0.1.6')) }}:
+    - ${{ if and(eq(parameters.CudaVersion, '12.2'), eq(parameters.TrtVersion, '10.2.0.19')) }}:
         - powershell: |
-            Write-Host "##vso[task.setvariable variable=trtCudaVersion;]12.4"
+            Write-Host "##vso[task.setvariable variable=trtCudaVersion;]12.5"
           displayName: Set trtCudaVersion
 
     - script: |
diff --git a/tools/ci_build/github/azure-pipelines/templates/jobs/set-winenv.yml b/tools/ci_build/github/azure-pipelines/templates/jobs/set-winenv.yml
index 6c82958fc0b78..63d521f1e7d9a 100644
--- a/tools/ci_build/github/azure-pipelines/templates/jobs/set-winenv.yml
+++ b/tools/ci_build/github/azure-pipelines/templates/jobs/set-winenv.yml
@@ -24,17 +24,11 @@ steps:
         displayName: 'Download Secondary CUDA SDK v${{ parameters.SecondaryCUDAVersion }}'
   - ${{ if eq(parameters.DownloadTRT, 'true') }}:
       - powershell: |
-          azcopy.exe cp --recursive "https://lotusscus.blob.core.windows.net/models/local/TensorRT-8.6.1.6.Windows10.x86_64.cuda-11.8" $(Agent.TempDirectory)
-        displayName: 'Download TensorRT-8.6.1.6.Windows10.x86_64.cuda-11.8'
+          azcopy.exe cp --recursive "https://lotusscus.blob.core.windows.net/models/local/TensorRT-10.2.0.19.Windows10.x86_64.cuda-11.8" $(Agent.TempDirectory)
+        displayName: 'Download TensorRT-10.2.0.19.Windows10.x86_64.cuda-11.8'
       - powershell: |
-          azcopy.exe cp --recursive "https://lotusscus.blob.core.windows.net/models/local/TensorRT-8.6.1.6.Windows10.x86_64.cuda-12.0" $(Agent.TempDirectory)
-        displayName: 'Download TensorRT-8.6.1.6.Windows10.x86_64.cuda-12.0'
-      - powershell: |
-          azcopy.exe cp --recursive "https://lotusscus.blob.core.windows.net/models/local/TensorRT-10.0.1.6.Windows10.x86_64.cuda-11.8" $(Agent.TempDirectory)
-        displayName: 'Download TensorRT-10.0.1.6.Windows10.x86_64.cuda-11.8'
-      - powershell: |
-          azcopy.exe cp --recursive "https://lotusscus.blob.core.windows.net/models/local/TensorRT-10.0.1.6.Windows10.x86_64.cuda-12.4" $(Agent.TempDirectory)
-        displayName: 'Download TensorRT-10.0.1.6.Windows10.x86_64.cuda-12.4'
+          azcopy.exe cp --recursive "https://lotusscus.blob.core.windows.net/models/local/TensorRT-10.2.0.19.Windows10.x86_64.cuda-12.5" $(Agent.TempDirectory)
+        displayName: 'Download TensorRT-10.2.0.19.Windows10.x86_64.cuda-12.5'
       
   - task: BatchScript@1
     displayName: 'setup env'
diff --git a/tools/ci_build/github/azure-pipelines/templates/py-linux-gpu.yml b/tools/ci_build/github/azure-pipelines/templates/py-linux-gpu.yml
index 97f95797be1f1..6c66cceb33d5c 100644
--- a/tools/ci_build/github/azure-pipelines/templates/py-linux-gpu.yml
+++ b/tools/ci_build/github/azure-pipelines/templates/py-linux-gpu.yml
@@ -22,10 +22,10 @@ parameters:
 
 - name: trt_version
   type: string
-  default: '10.0.1.6-1.cuda11.8'
+  default: '10.2.0.19-1.cuda11.8'
   values:
-    - 10.0.1.6-1.cuda11.8
-    - 10.0.1.6-1.cuda12.4
+    - 10.2.0.19-1.cuda11.8
+    - 10.2.0.19-1.cuda12.5
 - name: cuda_version
   type: string
   default: '11.8'
diff --git a/tools/ci_build/github/azure-pipelines/templates/py-packaging-linux-test-cuda.yml b/tools/ci_build/github/azure-pipelines/templates/py-packaging-linux-test-cuda.yml
index 3081624225b12..8eca22c8c123f 100644
--- a/tools/ci_build/github/azure-pipelines/templates/py-packaging-linux-test-cuda.yml
+++ b/tools/ci_build/github/azure-pipelines/templates/py-packaging-linux-test-cuda.yml
@@ -18,10 +18,10 @@ parameters:
 
 - name: trt_version
   type: string
-  default: '10.0.1.6-1.cuda11.8'
+  default: '10.2.0.19-1.cuda11.8'
   values:
-    - 10.0.1.6-1.cuda11.8
-    - 10.0.1.6-1.cuda12.4
+    - 10.2.0.19-1.cuda11.8
+    - 10.2.0.19-1.cuda12.5
 - name: cuda_version
   type: string
   default: '11.8'
diff --git a/tools/ci_build/github/azure-pipelines/templates/py-packaging-selectable-stage.yml b/tools/ci_build/github/azure-pipelines/templates/py-packaging-selectable-stage.yml
index 3f1c4ef0f8d61..47980955b8798 100644
--- a/tools/ci_build/github/azure-pipelines/templates/py-packaging-selectable-stage.yml
+++ b/tools/ci_build/github/azure-pipelines/templates/py-packaging-selectable-stage.yml
@@ -381,7 +381,7 @@ stages:
       variables:
         CUDA_VERSION: '11.8'
         buildArch: x64
-        EpBuildFlags: --use_tensorrt --tensorrt_home="$(Agent.TempDirectory)\TensorRT-10.0.1.6.Windows10.x86_64.cuda-11.8" --cuda_version=$(CUDA_VERSION) --cuda_home="C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v$(CUDA_VERSION)" --cmake_extra_defines "CMAKE_CUDA_ARCHITECTURES=37;50;52;60;61;70;75;80"
+        EpBuildFlags: --use_tensorrt --tensorrt_home="$(Agent.TempDirectory)\TensorRT-10.2.0.19.Windows10.x86_64.cuda-11.8" --cuda_version=$(CUDA_VERSION) --cuda_home="C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v$(CUDA_VERSION)" --cmake_extra_defines "CMAKE_CUDA_ARCHITECTURES=37;50;52;60;61;70;75;80"
         EnvSetupScript: setup_env_gpu.bat
         EP_NAME: gpu
         VSGenerator: 'Visual Studio 17 2022'
diff --git a/tools/ci_build/github/azure-pipelines/templates/py-packaging-stage.yml b/tools/ci_build/github/azure-pipelines/templates/py-packaging-stage.yml
index 9e14789f3b234..27f85dc5c1648 100644
--- a/tools/ci_build/github/azure-pipelines/templates/py-packaging-stage.yml
+++ b/tools/ci_build/github/azure-pipelines/templates/py-packaging-stage.yml
@@ -288,7 +288,7 @@ stages:
       parameters:
         MACHINE_POOL: 'onnxruntime-Win2022-GPU-A10'
         PYTHON_VERSION: '3.8'
-        EP_BUILD_FLAGS: --use_tensorrt --tensorrt_home="$(Agent.TempDirectory)\TensorRT-10.0.1.6.Windows10.x86_64.cuda-11.8" --cuda_home="$(Agent.TempDirectory)\v11.8"  --cmake_extra_defines "CMAKE_CUDA_ARCHITECTURES=52;60;61;70;75;80"
+        EP_BUILD_FLAGS: --use_tensorrt --tensorrt_home="$(Agent.TempDirectory)\TensorRT-10.2.0.19.Windows10.x86_64.cuda-11.8" --cuda_home="$(Agent.TempDirectory)\v11.8"  --cmake_extra_defines "CMAKE_CUDA_ARCHITECTURES=52;60;61;70;75;80"
         ENV_SETUP_SCRIPT: setup_env_gpu.bat
         EP_NAME: gpu
         publish_symbols: ${{ parameters.publish_symbols }}
@@ -298,7 +298,7 @@ stages:
       parameters:
         MACHINE_POOL: 'onnxruntime-Win2022-GPU-A10'
         PYTHON_VERSION: '3.9'
-        EP_BUILD_FLAGS: --use_tensorrt --tensorrt_home="$(Agent.TempDirectory)\TensorRT-10.0.1.6.Windows10.x86_64.cuda-11.8" --cuda_home="$(Agent.TempDirectory)\v11.8"  --cmake_extra_defines "CMAKE_CUDA_ARCHITECTURES=52;60;61;70;75;80"
+        EP_BUILD_FLAGS: --use_tensorrt --tensorrt_home="$(Agent.TempDirectory)\TensorRT-10.2.0.19.Windows10.x86_64.cuda-11.8" --cuda_home="$(Agent.TempDirectory)\v11.8"  --cmake_extra_defines "CMAKE_CUDA_ARCHITECTURES=52;60;61;70;75;80"
         ENV_SETUP_SCRIPT: setup_env_gpu.bat
         EP_NAME: gpu
         publish_symbols: ${{ parameters.publish_symbols }}
@@ -308,7 +308,7 @@ stages:
       parameters:
         MACHINE_POOL: 'onnxruntime-Win2022-GPU-A10'
         PYTHON_VERSION: '3.10'
-        EP_BUILD_FLAGS: --use_tensorrt --tensorrt_home="$(Agent.TempDirectory)\TensorRT-10.0.1.6.Windows10.x86_64.cuda-11.8" --cuda_home="$(Agent.TempDirectory)\v11.8"  --cmake_extra_defines "CMAKE_CUDA_ARCHITECTURES=52;60;61;70;75;80"
+        EP_BUILD_FLAGS: --use_tensorrt --tensorrt_home="$(Agent.TempDirectory)\TensorRT-10.2.0.19.Windows10.x86_64.cuda-11.8" --cuda_home="$(Agent.TempDirectory)\v11.8"  --cmake_extra_defines "CMAKE_CUDA_ARCHITECTURES=52;60;61;70;75;80"
         ENV_SETUP_SCRIPT: setup_env_gpu.bat
         EP_NAME: gpu
         publish_symbols: ${{ parameters.publish_symbols }}
@@ -318,7 +318,7 @@ stages:
       parameters:
         MACHINE_POOL: 'onnxruntime-Win2022-GPU-A10'
         PYTHON_VERSION: '3.11'
-        EP_BUILD_FLAGS: --use_tensorrt --tensorrt_home="$(Agent.TempDirectory)\TensorRT-10.0.1.6.Windows10.x86_64.cuda-11.8" --cuda_home="$(Agent.TempDirectory)\v11.8"  --cmake_extra_defines "CMAKE_CUDA_ARCHITECTURES=52;60;61;70;75;80"
+        EP_BUILD_FLAGS: --use_tensorrt --tensorrt_home="$(Agent.TempDirectory)\TensorRT-10.2.0.19.Windows10.x86_64.cuda-11.8" --cuda_home="$(Agent.TempDirectory)\v11.8"  --cmake_extra_defines "CMAKE_CUDA_ARCHITECTURES=52;60;61;70;75;80"
         ENV_SETUP_SCRIPT: setup_env_gpu.bat
         EP_NAME: gpu
         publish_symbols: ${{ parameters.publish_symbols }}
@@ -328,7 +328,7 @@ stages:
       parameters:
         MACHINE_POOL: 'onnxruntime-Win2022-GPU-A10'
         PYTHON_VERSION: '3.12'
-        EP_BUILD_FLAGS: --use_tensorrt --tensorrt_home="$(Agent.TempDirectory)\TensorRT-10.0.1.6.Windows10.x86_64.cuda-11.8" --cuda_home="$(Agent.TempDirectory)\v11.8"  --cmake_extra_defines "CMAKE_CUDA_ARCHITECTURES=52;60;61;70;75;80"
+        EP_BUILD_FLAGS: --use_tensorrt --tensorrt_home="$(Agent.TempDirectory)\TensorRT-10.2.0.19.Windows10.x86_64.cuda-11.8" --cuda_home="$(Agent.TempDirectory)\v11.8"  --cmake_extra_defines "CMAKE_CUDA_ARCHITECTURES=52;60;61;70;75;80"
         ENV_SETUP_SCRIPT: setup_env_gpu.bat
         EP_NAME: gpu
         publish_symbols: ${{ parameters.publish_symbols }}
@@ -498,7 +498,7 @@ stages:
           docker_base_image: onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda11_x64_almalinux8_gcc11:20240531.1
           extra_build_arg: ${{ parameters.build_py_parameters }}
           cmake_build_type: ${{ parameters.cmake_build_type }}
-          trt_version: '10.0.1.6-1.cuda11.8'
+          trt_version: '10.2.0.19-1.cuda11.8'
           cuda_version: '11.8'
 
   - ${{ if eq(parameters.enable_windows_arm64_qnn, true) }}:
diff --git a/tools/ci_build/github/azure-pipelines/win-gpu-tensorrt-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/win-gpu-tensorrt-ci-pipeline.yml
index 1af00da01241a..70c0c7d4a04e7 100644
--- a/tools/ci_build/github/azure-pipelines/win-gpu-tensorrt-ci-pipeline.yml
+++ b/tools/ci_build/github/azure-pipelines/win-gpu-tensorrt-ci-pipeline.yml
@@ -55,7 +55,7 @@ jobs:
       WithCache: True
       Today: $(TODAY)
       AdditionalKey: "gpu-tensorrt | RelWithDebInfo"
-      BuildPyArguments: '--config RelWithDebInfo --parallel --use_binskim_compliant_compile_flags --build_dir $(Build.BinariesDirectory) --skip_submodule_sync --build_shared_lib --update --cmake_generator "Visual Studio 17 2022" --build_wheel --enable_onnx_tests --use_tensorrt --tensorrt_home="$(Agent.TempDirectory)\TensorRT-10.0.1.6.Windows10.x86_64.cuda-11.8" --cuda_home="$(Agent.TempDirectory)\v11.8" --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=86'
+      BuildPyArguments: '--config RelWithDebInfo --parallel --use_binskim_compliant_compile_flags --build_dir $(Build.BinariesDirectory) --skip_submodule_sync --build_shared_lib --update --cmake_generator "Visual Studio 17 2022" --build_wheel --enable_onnx_tests --use_tensorrt --tensorrt_home="$(Agent.TempDirectory)\TensorRT-10.2.0.19.Windows10.x86_64.cuda-11.8" --cuda_home="$(Agent.TempDirectory)\v11.8" --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=86'
       MsbuildArguments: $(MsbuildArguments)
       BuildArch: 'x64'
       Platform: 'x64'
@@ -75,7 +75,7 @@ jobs:
      del wheel_filename_file
      python.exe -m pip install -q --upgrade %WHEEL_FILENAME%
      set PATH=$(Build.BinariesDirectory)\RelWithDebInfo\RelWithDebInfo;%PATH%
-     python $(Build.SourcesDirectory)\tools\ci_build\build.py --config RelWithDebInfo --use_binskim_compliant_compile_flags --build_dir $(Build.BinariesDirectory) --skip_submodule_sync --build_shared_lib --test --cmake_generator "Visual Studio 17 2022" --build_wheel --enable_onnx_tests --use_tensorrt --tensorrt_home="$(Agent.TempDirectory)\TensorRT-10.0.1.6.Windows10.x86_64.cuda-11.8"  --cuda_home="$(Agent.TempDirectory)\v11.8" --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=75
+     python $(Build.SourcesDirectory)\tools\ci_build\build.py --config RelWithDebInfo --use_binskim_compliant_compile_flags --build_dir $(Build.BinariesDirectory) --skip_submodule_sync --build_shared_lib --test --cmake_generator "Visual Studio 17 2022" --build_wheel --enable_onnx_tests --use_tensorrt --tensorrt_home="$(Agent.TempDirectory)\TensorRT-10.2.0.19.Windows10.x86_64.cuda-11.8"  --cuda_home="$(Agent.TempDirectory)\v11.8" --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=75
 
     workingDirectory: '$(Build.BinariesDirectory)\RelWithDebInfo\RelWithDebInfo'
     displayName: 'Run tests'
diff --git a/tools/ci_build/github/linux/docker/Dockerfile.package_ubi8_cuda_tensorrt10_0 b/tools/ci_build/github/linux/docker/Dockerfile.package_ubi8_cuda_tensorrt10_0
index 86c178aae519b..2d3dc05285e3c 100644
--- a/tools/ci_build/github/linux/docker/Dockerfile.package_ubi8_cuda_tensorrt10_0
+++ b/tools/ci_build/github/linux/docker/Dockerfile.package_ubi8_cuda_tensorrt10_0
@@ -6,7 +6,7 @@
 
 # Build base image with required system packages
 ARG BASEIMAGE=nvidia/cuda:11.8.0-cudnn8-devel-ubi8
-ARG TRT_VERSION=10.0.1.6-1.cuda11.8
+ARG TRT_VERSION=10.2.0.19-1.cuda11.8
 FROM $BASEIMAGE AS base
 ARG TRT_VERSION
 ENV PATH /opt/python/cp38-cp38/bin:/usr/local/nvidia/bin:/usr/local/cuda/bin:/usr/src/tensorrt/bin:${PATH}
diff --git a/tools/ci_build/github/linux/docker/Dockerfile.package_ubi8_cuda_tensorrt10_0_torch b/tools/ci_build/github/linux/docker/Dockerfile.package_ubi8_cuda_tensorrt10_0_torch
index 4542d3a3f2e4c..a50788e98ffe0 100644
--- a/tools/ci_build/github/linux/docker/Dockerfile.package_ubi8_cuda_tensorrt10_0_torch
+++ b/tools/ci_build/github/linux/docker/Dockerfile.package_ubi8_cuda_tensorrt10_0_torch
@@ -6,7 +6,7 @@
 
 # Build base image with required system packages
 ARG BASEIMAGE=nvidia/cuda:11.8.0-cudnn8-devel-ubi8
-ARG TRT_VERSION=10.0.1.6-1.cuda11.8
+ARG TRT_VERSION=10.2.0.19-1.cuda11.8
 FROM $BASEIMAGE AS base
 ARG TRT_VERSION
 ENV PATH /opt/python/cp38-cp38/bin:/usr/local/nvidia/bin:/usr/local/cuda/bin:/usr/src/tensorrt/bin:${PATH}
diff --git a/tools/ci_build/github/linux/docker/Dockerfile.package_ubuntu_2004_gpu b/tools/ci_build/github/linux/docker/Dockerfile.package_ubuntu_2004_gpu
index 5ef56fd885ca7..1aca3e305452d 100644
--- a/tools/ci_build/github/linux/docker/Dockerfile.package_ubuntu_2004_gpu
+++ b/tools/ci_build/github/linux/docker/Dockerfile.package_ubuntu_2004_gpu
@@ -6,7 +6,7 @@
 
 # Build base image with required system packages
 ARG BASEIMAGE=nvidia/cuda:11.8.0-cudnn8-devel-ubuntu20.04
-ARG TRT_VERSION=10.0.1.6-1+cuda11.8
+ARG TRT_VERSION=10.2.0.19-1+cuda11.8
 ARG LD_LIBRARY_PATH_ARG=/usr/local/lib64:/usr/local/cuda/lib64
 FROM $BASEIMAGE AS base
 ARG TRT_VERSION
diff --git a/tools/ci_build/github/linux/docker/Dockerfile.package_ubuntu_2004_gpu_ffmpeg b/tools/ci_build/github/linux/docker/Dockerfile.package_ubuntu_2004_gpu_ffmpeg
index 194a22850030c..5697120a48b2b 100644
--- a/tools/ci_build/github/linux/docker/Dockerfile.package_ubuntu_2004_gpu_ffmpeg
+++ b/tools/ci_build/github/linux/docker/Dockerfile.package_ubuntu_2004_gpu_ffmpeg
@@ -6,7 +6,7 @@
 
 # Build base image with required system packages
 ARG BASEIMAGE=nvidia/cuda:11.8.0-cudnn8-devel-ubuntu20.04
-ARG TRT_VERSION=10.0.1.6-1+cuda11.8
+ARG TRT_VERSION=10.2.0.19-1+cuda11.8
 ARG LD_LIBRARY_PATH_ARG=/usr/local/lib64:/usr/local/cuda/lib64
 FROM $BASEIMAGE AS base
 ARG TRT_VERSION
diff --git a/tools/ci_build/github/linux/docker/Dockerfile.ubuntu_cuda11_6_tensorrt8_4 b/tools/ci_build/github/linux/docker/Dockerfile.ubuntu_cuda11_6_tensorrt8_4
deleted file mode 100644
index 8b32425afce1c..0000000000000
--- a/tools/ci_build/github/linux/docker/Dockerfile.ubuntu_cuda11_6_tensorrt8_4
+++ /dev/null
@@ -1,63 +0,0 @@
-# --------------------------------------------------------------
-# Copyright (c) Microsoft Corporation. All rights reserved.
-# Licensed under the MIT License.
-# --------------------------------------------------------------
-# Dockerfile to run ONNXRuntime with TensorRT integration
-
-FROM nvidia/cuda:11.6.1-cudnn8-devel-ubuntu20.04
-
-
-# ONNX Runtime Variables
-ARG ONNXRUNTIME_REPO=https://github.com/Microsoft/onnxruntime
-ARG ONNXRUNTIME_BRANCH=main
-ARG CMAKE_CUDA_ARCHITECTURES=37;50;52;60;61;70;75;80
-
-ENV PATH /usr/local/nvidia/bin:/usr/local/cuda/bin:/usr/src/tensorrt/bin:/code/cmake-3.27.3-linux-x86_64/bin:/opt/miniconda/bin:${PATH}
-
-ENV DEBIAN_FRONTEND=noninteractive
-
-RUN apt-get update &&\
-    apt-get install -y sudo git bash unattended-upgrades wget
-RUN unattended-upgrade
-
-# Install python3
-RUN apt-get install -y --no-install-recommends \
-    python3 \
-    python3-pip \
-    python3-dev \
-    python3-wheel &&\
-    cd /usr/local/bin &&\
-    ln -s /usr/bin/python3 python &&\
-    ln -s /usr/bin/pip3 pip;
-
-RUN pip install --upgrade pip 
-RUN pip install setuptools>=68.2.2
-
-# Install TensorRT
-RUN v="8.4.1-1+cuda11.6" &&\
-    apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/7fa2af80.pub &&\
-    apt-get update &&\
-    sudo apt-get install -y libnvinfer8=${v} libnvonnxparsers8=${v} libnvparsers8=${v} libnvinfer-plugin8=${v} \
-        libnvinfer-dev=${v} libnvonnxparsers-dev=${v} libnvparsers-dev=${v} libnvinfer-plugin-dev=${v} \
-        python3-libnvinfer=${v} libnvinfer-samples=${v}
-
-# Compile trtexec
-RUN cd /usr/src/tensorrt/samples/trtexec && make
-
-# Install Valgrind
-RUN apt-get install -y valgrind
-
-ARG BUILD_USER=onnxruntimedev
-ARG BUILD_UID=1000
-RUN adduser --gecos 'onnxruntime Build User' --disabled-password $BUILD_USER --uid $BUILD_UID
-USER $BUILD_USER
-WORKDIR /code
-ENV CUDA_MODULE_LOADING "LAZY"
-
-# Prepare onnxruntime repository & build onnxruntime with TensorRT
-RUN git clone --single-branch --branch ${ONNXRUNTIME_BRANCH} --recursive ${ONNXRUNTIME_REPO} onnxruntime &&\
-    /bin/sh onnxruntime/dockerfiles/scripts/install_common_deps.sh &&\
-    cd onnxruntime &&\
-    /bin/sh build.sh --parallel --build_shared_lib --cuda_home /usr/local/cuda --cudnn_home /usr/lib/x86_64-linux-gnu/ --use_tensorrt --tensorrt_home /usr/lib/x86_64-linux-gnu/ --config Release --build_wheel --skip_tests --skip_submodule_sync --cmake_extra_defines '"CMAKE_CUDA_ARCHITECTURES='${CMAKE_CUDA_ARCHITECTURES}'"' &&\
-    pip install /code/onnxruntime/build/Linux/Release/dist/*.whl &&\
-    cd .. 
diff --git a/tools/ci_build/github/linux/docker/Dockerfile.ubuntu_cuda11_8_tensorrt8_5 b/tools/ci_build/github/linux/docker/Dockerfile.ubuntu_cuda11_8_tensorrt8_5
deleted file mode 100644
index cfc7023ef8e61..0000000000000
--- a/tools/ci_build/github/linux/docker/Dockerfile.ubuntu_cuda11_8_tensorrt8_5
+++ /dev/null
@@ -1,92 +0,0 @@
-# --------------------------------------------------------------
-# Copyright (c) Microsoft Corporation. All rights reserved.
-# Licensed under the MIT License.
-# --------------------------------------------------------------
-# Dockerfile to run ONNXRuntime with TensorRT integration
-
-# Build base image with required system packages
-FROM nvidia/cuda:11.8.0-cudnn8-devel-ubuntu20.04 AS base
-
-# The local directory into which to build and install CMAKE
-ARG ONNXRUNTIME_LOCAL_CODE_DIR=/code
-
-ENV PATH /usr/local/nvidia/bin:/usr/local/cuda/bin:/usr/src/tensorrt/bin:${ONNXRUNTIME_LOCAL_CODE_DIR}/cmake-3.27.3-linux-x86_64/bin:/opt/miniconda/bin:${PATH}
-ENV DEBIAN_FRONTEND=noninteractive
-
-RUN apt-get update &&\
-    apt-get install -y sudo git bash unattended-upgrades wget
-RUN unattended-upgrade
-
-# Install python3
-RUN apt-get install -y --no-install-recommends \
-    python3 \
-    python3-pip \
-    python3-dev \
-    python3-wheel &&\
-    cd /usr/local/bin &&\
-    ln -s /usr/bin/python3 python &&\
-    ln -s /usr/bin/pip3 pip;
-
-RUN pip install --upgrade pip 
-RUN pip install setuptools>=68.2.2
-
-# Install TensorRT
-RUN v="8.5.1-1+cuda11.8" &&\
-    apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/7fa2af80.pub &&\
-    apt-get update &&\
-    sudo apt-get install -y libnvinfer8=${v} libnvonnxparsers8=${v} libnvparsers8=${v} libnvinfer-plugin8=${v} \
-        libnvinfer-dev=${v} libnvonnxparsers-dev=${v} libnvparsers-dev=${v} libnvinfer-plugin-dev=${v} \
-        python3-libnvinfer=${v} libnvinfer-samples=${v}
-
-# Compile trtexec
-RUN cd /usr/src/tensorrt/samples/trtexec && make
-
-# Install Valgrind
-RUN apt-get install -y valgrind
-
-# Build final image from base. Builds ORT.
-FROM base as final
-ARG BUILD_USER=onnxruntimedev
-ARG BUILD_UID=1000
-RUN adduser --gecos 'onnxruntime Build User' --disabled-password $BUILD_USER --uid $BUILD_UID
-USER $BUILD_USER
-
-# ONNX Runtime arguments
-
-# URL to the github repo from which to clone ORT.
-ARG ONNXRUNTIME_REPO=https://github.com/Microsoft/onnxruntime
-
-# The local directory into which to clone ORT.
-ARG ONNXRUNTIME_LOCAL_CODE_DIR=/code
-
-# The git branch of ORT to checkout and build.
-ARG ONNXRUNTIME_BRANCH=main
-
-# Optional. The specific commit to pull and build from. If not set, the latest commit is used.
-ARG ONNXRUNTIME_COMMIT_ID
-
-# The supported CUDA architecture
-ARG CMAKE_CUDA_ARCHITECTURES=37;50;52;60;61;70;75;80
-
-WORKDIR ${ONNXRUNTIME_LOCAL_CODE_DIR}
-
-# Clone ORT repository with branch
-RUN git clone --single-branch --branch ${ONNXRUNTIME_BRANCH} --recursive ${ONNXRUNTIME_REPO} onnxruntime &&\
-    /bin/sh onnxruntime/dockerfiles/scripts/install_common_deps.sh
-
-WORKDIR ${ONNXRUNTIME_LOCAL_CODE_DIR}/onnxruntime
-
-# Reset to a specific commit if specified by build args.
-RUN if [ -z "$ONNXRUNTIME_COMMIT_ID" ] ; then echo "Building branch ${ONNXRUNTIME_BRANCH}" ;\
-    else echo "Building branch ${ONNXRUNTIME_BRANCH} @ commit ${ONNXRUNTIME_COMMIT_ID}" &&\
-    git reset --hard ${ONNXRUNTIME_COMMIT_ID} && git submodule update --recursive ; fi
-
-# Build ORT
-ENV CUDA_MODULE_LOADING "LAZY"
-RUN /bin/sh build.sh --parallel --build_shared_lib --cuda_home /usr/local/cuda --cudnn_home /usr/lib/x86_64-linux-gnu/ --use_tensorrt --tensorrt_home /usr/lib/x86_64-linux-gnu/ --config Release --build_wheel --skip_tests --skip_submodule_sync --cmake_extra_defines '"CMAKE_CUDA_ARCHITECTURES='${CMAKE_CUDA_ARCHITECTURES}'"'
-
-# Switch to root to continue following steps of CI
-USER root
-
-# Intall ORT wheel
-RUN pip install ${ONNXRUNTIME_LOCAL_CODE_DIR}/onnxruntime/build/Linux/Release/dist/*.whl
\ No newline at end of file
diff --git a/tools/ci_build/github/linux/docker/Dockerfile.ubuntu_cuda11_8_tensorrt10_0 b/tools/ci_build/github/linux/docker/Dockerfile.ubuntu_cuda11_tensorrt10
similarity index 99%
rename from tools/ci_build/github/linux/docker/Dockerfile.ubuntu_cuda11_8_tensorrt10_0
rename to tools/ci_build/github/linux/docker/Dockerfile.ubuntu_cuda11_tensorrt10
index cd168e1911d95..0bd56a1a5873f 100644
--- a/tools/ci_build/github/linux/docker/Dockerfile.ubuntu_cuda11_8_tensorrt10_0
+++ b/tools/ci_build/github/linux/docker/Dockerfile.ubuntu_cuda11_tensorrt10
@@ -31,7 +31,7 @@ RUN pip install --upgrade pip
 RUN pip install psutil setuptools>=68.2.2
 
 # Install TensorRT
-RUN version="10.0.1.6-1+cuda11.8" &&\
+RUN version="10.2.0.19-1+cuda11.8" &&\
     apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/7fa2af80.pub &&\
     apt-get update &&\
     apt-get install -y \
diff --git a/tools/ci_build/github/linux/docker/Dockerfile.ubuntu_cuda12_4_tensorrt10_0 b/tools/ci_build/github/linux/docker/Dockerfile.ubuntu_cuda12_tensorrt10
similarity index 83%
rename from tools/ci_build/github/linux/docker/Dockerfile.ubuntu_cuda12_4_tensorrt10_0
rename to tools/ci_build/github/linux/docker/Dockerfile.ubuntu_cuda12_tensorrt10
index 3e48415118c63..7f66943dd8745 100644
--- a/tools/ci_build/github/linux/docker/Dockerfile.ubuntu_cuda12_4_tensorrt10_0
+++ b/tools/ci_build/github/linux/docker/Dockerfile.ubuntu_cuda12_tensorrt10
@@ -5,7 +5,7 @@
 # Dockerfile to run ONNXRuntime with TensorRT integration
 
 # Build base image with required system packages
-FROM nvidia/cuda:12.4.1-devel-ubuntu20.04 AS base
+FROM nvidia/cuda:12.5.1-cudnn-devel-ubuntu20.04 AS base
 
 # The local directory into which to build and install CMAKE
 ARG ONNXRUNTIME_LOCAL_CODE_DIR=/code
@@ -30,15 +30,27 @@ RUN apt-get install -y --no-install-recommends \
 RUN pip install --upgrade pip 
 RUN pip install setuptools>=68.2.2 psutil
 
-# Install cuDNN v9
-RUN apt-get -y install cudnn9-cuda-12
-
 # Install TensorRT
-RUN version="10.0.1.6-1+cuda12.4" &&\
+RUN version="10.2.0.19-1+cuda12.5" &&\
     apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/7fa2af80.pub &&\
     apt-get update &&\
     apt-get install -y \
-    tensorrt=${version}
+    libnvinfer-dev=${version} \
+    libnvinfer-dispatch-dev=${version} \
+    libnvinfer-dispatch10=${version} \
+    libnvinfer-headers-dev=${version} \
+    libnvinfer-headers-plugin-dev=${version} \
+    libnvinfer-lean-dev=${version} \
+    libnvinfer-lean10=${version} \
+    libnvinfer-plugin-dev=${version} \
+    libnvinfer-plugin10=${version} \
+    libnvinfer-vc-plugin-dev=${version} \
+    libnvinfer-vc-plugin10=${version} \
+    libnvinfer10=${version} \
+    libnvonnxparsers-dev=${version} \
+    libnvonnxparsers10=${version} \
+    tensorrt-dev=${version} \
+    libnvinfer-bin=${version}
 
 # Compile trtexec if not installed
 RUN if [ ! -d /usr/src/tensorrt/bin ] || [ ! -f /usr/src/tensorrt/bin/trtexec ]; then \
diff --git a/tools/ci_build/github/linux/docker/Dockerfile.ubuntu_tensorrt_bin b/tools/ci_build/github/linux/docker/Dockerfile.ubuntu_tensorrt_bin
index a26bf88fbbdf6..0281c1c8fef25 100644
--- a/tools/ci_build/github/linux/docker/Dockerfile.ubuntu_tensorrt_bin
+++ b/tools/ci_build/github/linux/docker/Dockerfile.ubuntu_tensorrt_bin
@@ -5,7 +5,7 @@
 # Dockerfile to run ONNXRuntime with TensorRT installed from provided binaries
 
 # Build base image with required system packages
-FROM nvidia/cuda:12.3.1-devel-ubuntu20.04 AS base
+FROM nvidia/cuda:12.5.1-cudnn-devel-ubuntu20.04 AS base
 
 # The local directory into which to build and install CMAKE
 ARG ONNXRUNTIME_LOCAL_CODE_DIR=/code
@@ -30,9 +30,6 @@ RUN apt-get install -y --no-install-recommends \
 RUN pip install --upgrade pip 
 RUN pip install setuptools>=68.2.2
 
-# Install cuDNN v9
-RUN apt-get -y install cudnn9-cuda-12
-
 # Install TensorRT
 # Must provide version numbers used to build the name of the tar file containing TensorRT binaries.
 # See: https://docs.nvidia.com/deeplearning/tensorrt/install-guide/index.html#installing-tar
diff --git a/tools/ci_build/github/linux/docker/inference/x86_64/python/cuda/Dockerfile b/tools/ci_build/github/linux/docker/inference/x86_64/python/cuda/Dockerfile
index 3a7f410d3859e..a0020a9827290 100644
--- a/tools/ci_build/github/linux/docker/inference/x86_64/python/cuda/Dockerfile
+++ b/tools/ci_build/github/linux/docker/inference/x86_64/python/cuda/Dockerfile
@@ -5,7 +5,7 @@
 ARG BASEIMAGE=nvidia/cuda:11.8.0-cudnn8-devel-ubi8
 
 FROM $BASEIMAGE
-ARG TRT_VERSION=10.0.1.6-1.cuda11.8
+ARG TRT_VERSION=10.2.0.19-1.cuda11.8
 
 #Install TensorRT only if TRT_VERSION is not empty
 RUN if [ -n "${TRT_VERSION}" ]; then  \
diff --git a/tools/ci_build/github/pai/rocm-ci-pipeline-env.Dockerfile b/tools/ci_build/github/pai/rocm-ci-pipeline-env.Dockerfile
index b94826ae0e4bc..bf21a65314985 100644
--- a/tools/ci_build/github/pai/rocm-ci-pipeline-env.Dockerfile
+++ b/tools/ci_build/github/pai/rocm-ci-pipeline-env.Dockerfile
@@ -1,7 +1,7 @@
 # Refer to https://github.com/RadeonOpenCompute/ROCm-docker/blob/master/dev/Dockerfile-ubuntu-22.04-complete
 FROM ubuntu:22.04
 
-ARG ROCM_VERSION=6.0
+ARG ROCM_VERSION=6.1
 ARG AMDGPU_VERSION=${ROCM_VERSION}
 ARG APT_PREF='Package: *\nPin: release o=repo.radeon.com\nPin-Priority: 600'
 
@@ -77,11 +77,7 @@ RUN ln -sf /usr/lib/x86_64-linux-gnu/libstdc++.so.6 ${CONDA_ENVIRONMENT_PATH}/bi
 RUN export MAJOR=$(cut -d '.' -f 1 <<< "$ROCM_VERSION") && \
     export MINOR=$(cut -d '.' -f 2 <<< "$ROCM_VERSION") && \
     export PATCH=$(cut -d '.' -f 3 <<< "$ROCM_VERSION") && \
-    if (( MAJOR >= 6 )); then \
-        pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/rocm${MAJOR}.${MINOR} ; \
-    else \
-        pip install torch==2.0.1 torchvision==0.15.2 -f https://repo.radeon.com/rocm/manylinux/rocm-rel-${MAJOR}.${MINOR}/ ; \
-    fi && \
+    pip install torch==2.1.2 torchvision==0.16.1 -f https://repo.radeon.com/rocm/manylinux/rocm-rel-${MAJOR}.${MINOR}/ && \
     pip install torch-ort --no-dependencies
 
 ##### Install Cupy to decrease CPU utilization
diff --git a/tools/ci_build/github/windows/post_to_dashboard/requirements.txt b/tools/ci_build/github/windows/post_to_dashboard/requirements.txt
index b8c00a610b781..6ece3c1f92c4e 100644
--- a/tools/ci_build/github/windows/post_to_dashboard/requirements.txt
+++ b/tools/ci_build/github/windows/post_to_dashboard/requirements.txt
@@ -1,2 +1,2 @@
-azure-kusto-data[pandas]==3.0.1
-azure-kusto-ingest[pandas]==3.0.1
+azure-kusto-data[pandas]==4.5.1
+azure-kusto-ingest[pandas]==4.5.1
diff --git a/tools/ci_build/github/windows/setup_env_gpu.bat b/tools/ci_build/github/windows/setup_env_gpu.bat
index b753cdae16b90..6c59866ea925a 100644
--- a/tools/ci_build/github/windows/setup_env_gpu.bat
+++ b/tools/ci_build/github/windows/setup_env_gpu.bat
@@ -6,10 +6,10 @@ if exist PATH=%AGENT_TEMPDIRECTORY%\v11.8\ (
 ) else (
     set PATH=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.8\bin;C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.8\extras\CUPTI\lib64;%PATH%
 )
-set PATH=%AGENT_TEMPDIRECTORY%\TensorRT-10.0.1.6.Windows10.x86_64.cuda-11.8\lib;%PATH%
+set PATH=%AGENT_TEMPDIRECTORY%\TensorRT-10.2.0.19.Windows10.x86_64.cuda-11.8\lib;%PATH%
 
 @REM The default version is still cuda v11.8, because set cuda v12.2 after it
-set PATH=%PATH%;%AGENT_TEMPDIRECTORY%\TensorRT-10.0.1.6.Windows10.x86_64.cuda-12.4\lib
+set PATH=%PATH%;%AGENT_TEMPDIRECTORY%\TensorRT-10.2.0.19.Windows10.x86_64.cuda-12.5\lib
 if exist PATH=%AGENT_TEMPDIRECTORY%\v12.2\ (
     set PATH=%PATH%;%AGENT_TEMPDIRECTORY%\v12.2\bin;%AGENT_TEMPDIRECTORY%\v12.2\extras\CUPTI\lib64
 ) else (
diff --git a/tools/ci_build/github/windows/setup_env_trt.bat b/tools/ci_build/github/windows/setup_env_trt.bat
index 4e43b5999a315..249bb98815897 100644
--- a/tools/ci_build/github/windows/setup_env_trt.bat
+++ b/tools/ci_build/github/windows/setup_env_trt.bat
@@ -6,6 +6,6 @@ if exist PATH=%AGENT_TEMPDIRECTORY%\v11.8\ (
 ) else (
     set PATH=%PATH%;C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.8\bin;C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.8\extras\CUPTI\lib64
 )
-set PATH=%AGENT_TEMPDIRECTORY%\TensorRT-10.0.1.6.Windows10.x86_64.cuda-11.8\lib;%PATH%
+set PATH=%AGENT_TEMPDIRECTORY%\TensorRT-10.2.0.19.Windows10.x86_64.cuda-11.8\lib;%PATH%
 set GRADLE_OPTS=-Dorg.gradle.daemon=false
 set CUDA_MODULE_LOADING=LAZY
\ No newline at end of file