diff --git a/.github/workflows/codeql.yml b/.github/workflows/codeql.yml index 4a5b87b3e69ed..e4d1b91bab736 100644 --- a/.github/workflows/codeql.yml +++ b/.github/workflows/codeql.yml @@ -47,6 +47,14 @@ jobs: # Details on CodeQL's query packs refer to: https://docs.github.com/en/code-security/code-scanning/automatically-scanning-your-code-for-vulnerabilities-and-errors/configuring-code-scanning#using-queries-in-ql-packs queries: security-extended,security-and-quality + # Setup Java to use a version that is not too old for the project + - if: ${{ matrix.language == 'java' }} + name: Setup Java 11 + uses: actions/setup-java@v4 + with: + java-version: '11' + distribution: 'microsoft' + # Autobuild attempts to build any compiled languages (C/C++, C#, or Java). # If this step fails, then you should remove it and run the build manually (see below) - if: ${{ matrix.language != 'cpp' }} diff --git a/cgmanifests/generated/cgmanifest.json b/cgmanifests/generated/cgmanifest.json index ea90a0eba92f7..103a83a35e40c 100644 --- a/cgmanifests/generated/cgmanifest.json +++ b/cgmanifests/generated/cgmanifest.json @@ -206,7 +206,7 @@ "component": { "type": "git", "git": { - "commitHash": "c11386eb632eec7c1c2aa323142f73519f946e2a", + "commitHash": "150e7527d5286ddd3a995c228dedf8d76a7a86bc", "repositoryUrl": "https://github.com/intel/neural-speed.git" }, "comments": "neural_speed" diff --git a/cmake/CMakeLists.txt b/cmake/CMakeLists.txt index 02b568abdf8da..ee1959bb357fe 100644 --- a/cmake/CMakeLists.txt +++ b/cmake/CMakeLists.txt @@ -88,7 +88,7 @@ option(onnxruntime_USE_QNN "Build with QNN support" OFF) option(onnxruntime_USE_SNPE "Build with SNPE support" OFF) option(onnxruntime_USE_RKNPU "Build with RKNPU support" OFF) option(onnxruntime_USE_DNNL "Build with DNNL support" OFF) -option(onnxruntime_USE_NEURAL_SPEED "Build with Neural Speed support" OFF) +option(onnxruntime_USE_NEURAL_SPEED "Build with Neural Speed support" ON) option(onnxruntime_USE_JSEP "Build with JavaScript implemented kernels support" OFF) option(onnxruntime_BUILD_UNIT_TESTS "Build ONNXRuntime unit tests" ON) option(onnxruntime_BUILD_CSHARP "Build C# library" OFF) @@ -325,8 +325,8 @@ if (onnxruntime_USE_ROCM) # replicate strategy used by pytorch to get ROCM_VERSION # https://github.com/pytorch/pytorch/blob/5c5b71b6eebae76d744261715231093e62f0d090/cmake/public/LoadHIP.cmake # with modification - if (EXISTS "${onnxruntime_ROCM_HOME}/.info/version-dev") - file(READ "${onnxruntime_ROCM_HOME}/.info/version-dev" ROCM_VERSION_DEV_RAW) + if (EXISTS "${onnxruntime_ROCM_HOME}/.info/version") + file(READ "${onnxruntime_ROCM_HOME}/.info/version" ROCM_VERSION_DEV_RAW) string(REGEX MATCH "^([0-9]+)\.([0-9]+)\.([0-9]+)-.*$" ROCM_VERSION_MATCH ${ROCM_VERSION_DEV_RAW}) elseif (EXISTS "${onnxruntime_ROCM_HOME}/include/rocm_version.h") file(READ "${onnxruntime_ROCM_HOME}/include/rocm_version.h" ROCM_VERSION_H_RAW) @@ -345,7 +345,7 @@ if (onnxruntime_USE_ROCM) else() message(FATAL_ERROR "Cannot determine ROCm version string") endif() - message("\n***** ROCm version from ${onnxruntime_ROCM_HOME}/.info/version-dev ****\n") + message("\n***** ROCm version from ${onnxruntime_ROCM_HOME}/.info/version ****\n") message("ROCM_VERSION_DEV: ${ROCM_VERSION_DEV}") message("ROCM_VERSION_DEV_MAJOR: ${ROCM_VERSION_DEV_MAJOR}") message("ROCM_VERSION_DEV_MINOR: ${ROCM_VERSION_DEV_MINOR}") @@ -1206,7 +1206,7 @@ if (onnxruntime_USE_DNNL) add_compile_definitions(DNNL_OPENMP) endif() -if (onnxruntime_USE_NEURAL_SPEED AND NOT onnxruntime_MINIMAL_BUILD) +if 
(onnxruntime_USE_NEURAL_SPEED AND NOT onnxruntime_MINIMAL_BUILD AND NOT onnxruntime_USE_TVM) include(neural_speed) if (USE_NEURAL_SPEED) list(APPEND onnxruntime_EXTERNAL_LIBRARIES neural_speed::bestla) @@ -1290,34 +1290,6 @@ if (onnxruntime_USE_OPENVINO) add_definitions(-DUSE_OPENVINO=1) - if (EXISTS "$ENV{INTEL_OPENVINO_DIR}/deployment_tools/inference_engine/version.txt") - file(READ $ENV{INTEL_OPENVINO_DIR}/deployment_tools/inference_engine/version.txt VER) - endif() - - if (NOT DEFINED ENV{INTEL_OPENVINO_DIR}) - message(FATAL_ERROR "[Couldn't locate OpenVINO] OpenVINO may not have been initialized") - endif() - - # Check OpenVINO version for support - if ($ENV{INTEL_OPENVINO_DIR} MATCHES "2023.0") - set(OPENVINO_VERSION "2023.0") - add_definitions(-DOPENVINO_2023_0=1) - elseif ($ENV{INTEL_OPENVINO_DIR} MATCHES "2023.1") - set(OPENVINO_VERSION "2023.1") - add_definitions(-DOPENVINO_2023_1=1) - elseif ($ENV{INTEL_OPENVINO_DIR} MATCHES "2023.2") - set(OPENVINO_VERSION "2023.2") - add_definitions(-DOPENVINO_2023_2=1) - elseif ($ENV{INTEL_OPENVINO_DIR} MATCHES "2023.3") - set(OPENVINO_VERSION "2023.3") - add_definitions(-DOPENVINO_2023_3=1) - elseif ($ENV{INTEL_OPENVINO_DIR} MATCHES "openvino") - set(OPENVINO_VERSION "2023.3") - add_definitions(-DOPENVINO_2023_3=1) - else() - message(FATAL_ERROR "Unsupported OpenVINO version: ${INTEL_OPENVINO_DIR}") - endif() - if (onnxruntime_USE_OPENVINO_GPU_FP32) add_definitions(-DOPENVINO_CONFIG_GPU_FP32=1) endif() @@ -1334,6 +1306,10 @@ if (onnxruntime_USE_OPENVINO) add_definitions(-DOPENVINO_CONFIG_CPU_FP16=1) endif() + if (onnxruntime_USE_OPENVINO_NPU) + add_definitions(-DOPENVINO_CONFIG_NPU=1) + endif() + if (onnxruntime_USE_OPENVINO_GPU_FP32_NP) add_definitions(-DOPENVINO_CONFIG_GPU_FP32=1) add_definitions(-DOPENVINO_DISABLE_GRAPH_PARTITION=1) @@ -1354,6 +1330,11 @@ if (onnxruntime_USE_OPENVINO) add_definitions(-DOPENVINO_DISABLE_GRAPH_PARTITION=1) endif() + if (onnxruntime_USE_OPENVINO_NPU_NP) + add_definitions(-DOPENVINO_CONFIG_NPU=1) + add_definitions(-DOPENVINO_DISABLE_GRAPH_PARTITION=1) + endif() + if (onnxruntime_USE_OPENVINO_HETERO) add_definitions(-DOPENVINO_CONFIG_HETERO=1) add_definitions(-DDEVICE_NAME="${onnxruntime_USE_OPENVINO_DEVICE}") diff --git a/cmake/adjust_global_compile_flags.cmake b/cmake/adjust_global_compile_flags.cmake index d3f9256105127..9a3bc3302cc2b 100644 --- a/cmake/adjust_global_compile_flags.cmake +++ b/cmake/adjust_global_compile_flags.cmake @@ -8,6 +8,15 @@ if (CMAKE_SYSTEM_NAME STREQUAL "Android") string(APPEND CMAKE_ASM_FLAGS_RELEASE " -O3") endif() +# Suggested by https://gitlab.kitware.com/cmake/cmake/-/issues/20132 +# MacCatalyst is not well supported in CMake +# The error that can emerge without this flag can look like: +# "clang : error : overriding '-mmacosx-version-min=11.0' option with '-target x86_64-apple-ios14.0-macabi' [-Werror,-Woverriding-t-option]" +if (PLATFORM_NAME STREQUAL "macabi") + add_compile_options(-Wno-overriding-t-option) + add_link_options(-Wno-overriding-t-option) +endif() + # Enable space optimization for gcc/clang # Cannot use "-ffunction-sections -fdata-sections" if we enable bitcode (iOS) if (NOT MSVC AND NOT onnxruntime_ENABLE_BITCODE) diff --git a/cmake/deps.txt b/cmake/deps.txt index 27ba21236d064..6e641d657a36f 100644 --- a/cmake/deps.txt +++ b/cmake/deps.txt @@ -35,7 +35,7 @@ microsoft_gsl;https://github.com/microsoft/GSL/archive/refs/tags/v4.0.0.zip;cf36 microsoft_wil;https://github.com/microsoft/wil/archive/refs/tags/v1.0.230629.1.zip;e4a542a323c070376f7c2d1973d0f7ddbc1d2fa5 
mimalloc;https://github.com/microsoft/mimalloc/archive/refs/tags/v2.1.1.zip;d5ee7d34223d0567892db5179849939c8769dc41 mp11;https://github.com/boostorg/mp11/archive/refs/tags/boost-1.82.0.zip;9bc9e01dffb64d9e0773b2e44d2f22c51aace063 -neural_speed;https://github.com/intel/neural-speed/archive/refs/tags/bestlav0.1.1.zip;65b0f7a0d04f72f0d5a8d48af70f0366f2ab3939 +neural_speed;https://github.com/intel/neural-speed/archive/refs/tags/v0.3.zip;5ec64e3071edc7347ebd8a81679cf06e2bb9b851 onnx;https://github.com/onnx/onnx/archive/2159934fd7f07da00849a45be54ceba9de9d6d48.zip;e8c1179d9590fde9de84d55e53e6ccfcbb3c0f97 #use the commit of Final DDS removal. DDS output is now supported by ORT TRT. onnx_tensorrt;https://github.com/onnx/onnx-tensorrt/archive/bacfaaa951653cd4e72efe727a543567cb38f7de.zip;26434329612e804164ab7baa6ae629ada56c1b26 diff --git a/cmake/external/neural_speed.cmake b/cmake/external/neural_speed.cmake index ed711351403a7..3fe9c660f89d6 100644 --- a/cmake/external/neural_speed.cmake +++ b/cmake/external/neural_speed.cmake @@ -9,6 +9,7 @@ if(USE_NEURAL_SPEED) neural_speed URL ${DEP_URL_neural_speed} URL_HASH SHA1=${DEP_SHA1_neural_speed} + PATCH_COMMAND ${Patch_EXECUTABLE} -p1 < ${PROJECT_SOURCE_DIR}/patches/neural_speed/150e7527d5286ddd3a995c228dedf8d76a7a86bc.patch ) set(BTLA_USE_OPENMP OFF) onnxruntime_fetchcontent_makeavailable(neural_speed) diff --git a/cmake/external/onnxruntime_external_deps.cmake b/cmake/external/onnxruntime_external_deps.cmake index ac1e187f357aa..8839dbc8fda4f 100644 --- a/cmake/external/onnxruntime_external_deps.cmake +++ b/cmake/external/onnxruntime_external_deps.cmake @@ -14,6 +14,16 @@ foreach(ONNXRUNTIME_DEP IN LISTS ONNXRUNTIME_DEPS_LIST) set(DEP_URL_${ONNXRUNTIME_DEP_NAME} ${ONNXRUNTIME_DEP_URL}) # The third column is SHA1 hash value set(DEP_SHA1_${ONNXRUNTIME_DEP_NAME} ${ONNXRUNTIME_DEP}) + + if(ONNXRUNTIME_DEP_URL MATCHES "^https://") + # Search a local mirror folder + string(REGEX REPLACE "^https://" "${REPO_ROOT}/mirror/" LOCAL_URL "${ONNXRUNTIME_DEP_URL}") + + if(EXISTS "${LOCAL_URL}") + cmake_path(ABSOLUTE_PATH LOCAL_URL) + set(DEP_URL_${ONNXRUNTIME_DEP_NAME} "${LOCAL_URL}") + endif() + endif() endif() endforeach() diff --git a/cmake/maccatalyst_prepare_objects_for_prelink.py b/cmake/maccatalyst_prepare_objects_for_prelink.py new file mode 100644 index 0000000000000..34664b4e05237 --- /dev/null +++ b/cmake/maccatalyst_prepare_objects_for_prelink.py @@ -0,0 +1,72 @@ +#!/usr/bin/env python3 +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. + +import os +import shutil +import sys + + +# Note: This script is mainly used for sanity checking/validating the files in the .a library equal to the .o files +# in the source dir to handle the case of source files having duplicate names under different subdirectories for +# each onnxruntime library. (Only applicable when doing a Mac Catalyst build.) 
+def main(): + source_dir = sys.argv[1] + dest_dir = sys.argv[2] + files_from_static_lib = sys.argv[3] + files_from_source_dir = [] + for subdir, _, files in os.walk(source_dir): + for file_name in files: + if file_name.endswith(".o"): + files_from_source_dir.append(file_name.strip()) + dest_name_without_extension, _ = os.path.splitext(file_name) + counter = 0 + + dest_file = f"{dest_name_without_extension}.o" + while os.path.exists(os.path.join(dest_dir, dest_file)): + print("Duplicate file name from source: " + os.path.join(source_dir, subdir, file_name)) + counter += 1 + dest_file = f"{dest_name_without_extension}_{counter}.o" + print("Renamed file name in destination: " + os.path.join(dest_dir, dest_file)) + + destination_path = os.path.join(dest_dir, dest_file) + source_file = os.path.join(source_dir, subdir, file_name) + shutil.copy(source_file, destination_path) + + # Sanity check to ensure the number of .o object from the original cmake source directory matches with the number + # of .o files extracted from each .a onnxruntime library + file_lists_from_static_lib = [] + with open(files_from_static_lib) as file: + filenames = file.readlines() + for filename in filenames: + file_lists_from_static_lib.append(filename.strip()) + + sorted_list1 = sorted(file_lists_from_static_lib) + sorted_list2 = sorted(files_from_source_dir) + + if len(sorted_list1) != len(sorted_list2): + print( + "Caught a mismatch in the number of .o object files from the original cmake source directory: ", + len(sorted_list1), + "the number of .o files extracted from the static onnxruntime lib: ", + len(sorted_list2), + "for: ", + os.path.basename(source_dir), + ) + + if sorted_list1 == sorted_list2: + print( + "Sanity check passed: object files from original source directory matches with files extracted " + "from static library for: ", + os.path.basename(source_dir), + ) + else: + print( + "Error: Mismatch between object files from original source directory " + "and the .o files extracted from static library for: ", + os.path.basename(source_dir), + ) + + +if __name__ == "__main__": + main() diff --git a/cmake/onnxruntime.cmake b/cmake/onnxruntime.cmake index 2ead13e554197..e15c8a046dc20 100644 --- a/cmake/onnxruntime.cmake +++ b/cmake/onnxruntime.cmake @@ -281,7 +281,13 @@ endif() # Assemble the Apple static framework (iOS and macOS) if(onnxruntime_BUILD_APPLE_FRAMEWORK) - set(STATIC_FRAMEWORK_OUTPUT_DIR ${CMAKE_CURRENT_BINARY_DIR}/${CMAKE_BUILD_TYPE}-${CMAKE_OSX_SYSROOT}) + # when building for mac catalyst, the CMAKE_OSX_SYSROOT is set to MacOSX as well, to avoid duplication, + # we specify as `-macabi` in the name of the output static apple framework directory. + if (PLATFORM_NAME STREQUAL "macabi") + set(STATIC_FRAMEWORK_OUTPUT_DIR ${CMAKE_CURRENT_BINARY_DIR}/${CMAKE_BUILD_TYPE}-macabi) + else() + set(STATIC_FRAMEWORK_OUTPUT_DIR ${CMAKE_CURRENT_BINARY_DIR}/${CMAKE_BUILD_TYPE}-${CMAKE_OSX_SYSROOT}) + endif() # Setup the various directories required. Remove any existing ones so we start with a clean directory. set(STATIC_LIB_DIR ${CMAKE_CURRENT_BINARY_DIR}/static_libraries) @@ -299,18 +305,34 @@ if(onnxruntime_BUILD_APPLE_FRAMEWORK) # to enforce symbol visibility. doing it this way limits the symbols included from the .a files to symbols used # by the ORT .o files. - # If it's an onnxruntime library, extract .o files to a separate directory for each library to avoid any clashes - # with filenames (e.g. 
utils.o) + # If it's an onnxruntime library, extract .o files from the original cmake build path to a separate directory for + # each library to avoid any clashes with filenames (e.g. utils.o) foreach(_LIB ${onnxruntime_INTERNAL_LIBRARIES} ) GET_TARGET_PROPERTY(_LIB_TYPE ${_LIB} TYPE) if(_LIB_TYPE STREQUAL "STATIC_LIBRARY") set(CUR_STATIC_LIB_OBJ_DIR ${STATIC_LIB_TEMP_DIR}/$) add_custom_command(TARGET onnxruntime POST_BUILD COMMAND ${CMAKE_COMMAND} -E make_directory ${CUR_STATIC_LIB_OBJ_DIR}) - - add_custom_command(TARGET onnxruntime POST_BUILD - COMMAND ar ARGS -x $ - WORKING_DIRECTORY ${CUR_STATIC_LIB_OBJ_DIR}) + if (PLATFORM_NAME STREQUAL "macabi") + # There exists several duplicate names for source files under different subdirectories within + # each onnxruntime library. (e.g. onnxruntime/contrib_ops/cpu/element_wise_ops.o + # vs. onnxruntime/providers/core/cpu/math/element_wise_ops.o) + # In that case, using 'ar ARGS -x' to extract the .o files from .a lib would possibly cause duplicate naming files being overwritten + # and lead to missing undefined symbol error in the generated binary. + # So we use the below python script as a sanity check to do a recursive find of all .o files in ${CUR_TARGET_CMAKE_SOURCE_LIB_DIR} + # and verifies that matches the content of the .a, and then copy from the source dir. + # TODO: The copying action here isn't really necessary. For future fix, consider using the script extracts from the ar with the rename to potentially + # make both maccatalyst and other builds do the same thing. + set(CUR_TARGET_CMAKE_SOURCE_LIB_DIR ${CMAKE_CURRENT_BINARY_DIR}/CMakeFiles/${_LIB}.dir) + add_custom_command(TARGET onnxruntime POST_BUILD + COMMAND ar -t $ | grep "\.o$" > ${_LIB}.object_file_list.txt + COMMAND ${CMAKE_COMMAND} -E env python3 ${CMAKE_CURRENT_SOURCE_DIR}/maccatalyst_prepare_objects_for_prelink.py ${CUR_TARGET_CMAKE_SOURCE_LIB_DIR} ${CUR_STATIC_LIB_OBJ_DIR} ${CUR_STATIC_LIB_OBJ_DIR}/${_LIB}.object_file_list.txt + WORKING_DIRECTORY ${CUR_STATIC_LIB_OBJ_DIR}) + else() + add_custom_command(TARGET onnxruntime POST_BUILD + COMMAND ar ARGS -x $ + WORKING_DIRECTORY ${CUR_STATIC_LIB_OBJ_DIR}) + endif() endif() endforeach() diff --git a/cmake/onnxruntime_mlas.cmake b/cmake/onnxruntime_mlas.cmake index 17de2aa4aaea6..6b7d4402be8eb 100644 --- a/cmake/onnxruntime_mlas.cmake +++ b/cmake/onnxruntime_mlas.cmake @@ -631,6 +631,12 @@ if (WIN32) endif() endif() +if (PLATFORM_NAME STREQUAL "macabi") + # Needed for maccatalyst C compilation + # i.e. the flags below add "--target=x86_64-apple-ios14.0-macabi -ffunction-sections -fdata-sections" + target_compile_options(onnxruntime_mlas PRIVATE ${CMAKE_C_FLAGS}) +endif() + if (NOT onnxruntime_BUILD_SHARED_LIB) install(TARGETS onnxruntime_mlas ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR} diff --git a/cmake/onnxruntime_providers_openvino.cmake b/cmake/onnxruntime_providers_openvino.cmake index e26f0bfc0b751..5876b2b5c448b 100644 --- a/cmake/onnxruntime_providers_openvino.cmake +++ b/cmake/onnxruntime_providers_openvino.cmake @@ -16,23 +16,19 @@ endif() # Header paths - find_package(InferenceEngine REQUIRED) - find_package(ngraph REQUIRED) - - if (OPENVINO_2022_1 OR OPENVINO_2022_2) find_package(OpenVINO REQUIRED COMPONENTS Runtime ONNX) - list (OV_20_LIBS openvino::frontend::onnx openvino::runtime) + if(OpenVINO_VERSION VERSION_LESS 2023.0) + message(FATAL_ERROR "OpenVINO 2023.0 and newer are supported. 
Please, latest OpenVINO release") endif() if (WIN32) unset(CMAKE_MAP_IMPORTED_CONFIG_RELWITHDEBINFO) endif() + list(APPEND OPENVINO_LIB_LIST openvino::frontend::onnx openvino::runtime ${PYTHON_LIBRARIES}) if ((DEFINED ENV{OPENCL_LIBS}) AND (DEFINED ENV{OPENCL_INCS})) add_definitions(-DIO_BUFFER_ENABLED=1) - list(APPEND OPENVINO_LIB_LIST $ENV{OPENCL_LIBS} ${OV_20_LIBS} ${InferenceEngine_LIBRARIES} ${NGRAPH_LIBRARIES} ngraph::onnx_importer ${PYTHON_LIBRARIES}) - else() - list(APPEND OPENVINO_LIB_LIST ${OV_20_LIBS} ${InferenceEngine_LIBRARIES} ${NGRAPH_LIBRARIES} ngraph::onnx_importer ${PYTHON_LIBRARIES}) + list(APPEND OPENVINO_LIB_LIST $ENV{OPENCL_LIBS}) endif() source_group(TREE ${ONNXRUNTIME_ROOT}/core FILES ${onnxruntime_providers_openvino_cc_srcs}) @@ -75,7 +71,14 @@ message(FATAL_ERROR "onnxruntime_providers_openvino unknown platform, need to specify shared library exports for it") endif() - install(TARGETS onnxruntime_providers_openvino - ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR} - LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} - RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR}) \ No newline at end of file + if (CMAKE_OPENVINO_LIBRARY_INSTALL_DIR) + install(TARGETS onnxruntime_providers_openvino + ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR} + LIBRARY DESTINATION ${CMAKE_OPENVINO_LIBRARY_INSTALL_DIR} + RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR}) + else() + install(TARGETS onnxruntime_providers_openvino + ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR} + LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} + RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR}) + endif() diff --git a/cmake/onnxruntime_rocm_hipify.cmake b/cmake/onnxruntime_rocm_hipify.cmake index 6f54943f09afe..cadb06bb38707 100644 --- a/cmake/onnxruntime_rocm_hipify.cmake +++ b/cmake/onnxruntime_rocm_hipify.cmake @@ -94,30 +94,18 @@ set(contrib_ops_excluded_files "bert/group_query_attention.cc" "bert/group_query_attention_impl.h" "bert/group_query_attention_impl.cu" + "collective/distributed_*" + "collective/shard*" ) -if (NOT onnxruntime_ENABLE_ATEN) - list(APPEND contrib_ops_excluded_files "aten_ops/aten_op.cc") -endif() if (NOT onnxruntime_USE_NCCL) # Those are string patterns to exclude. Do NOT use stars such as # collective/*.cc or *.h. 
list(APPEND contrib_ops_excluded_files "collective/nccl_kernels.cc") - list(APPEND contrib_ops_excluded_files "collective/sharded_moe.h") - list(APPEND contrib_ops_excluded_files "collective/sharded_moe.cc") - list(APPEND contrib_ops_excluded_files "collective/sharding.cc") - list(APPEND contrib_ops_excluded_files "collective/sharding_spec.cc") - list(APPEND contrib_ops_excluded_files "collective/distributed_matmul.cc") - list(APPEND contrib_ops_excluded_files "collective/distributed_slice.cc") - list(APPEND contrib_ops_excluded_files "collective/distributed_reshape.cc") - list(APPEND contrib_ops_excluded_files "collective/distributed_expand.cc") - list(APPEND contrib_ops_excluded_files "collective/distributed_reduce.cc") - list(APPEND contrib_ops_excluded_files "collective/distributed_unsqueeze.cc") - list(APPEND contrib_ops_excluded_files "collective/distributed_squeeze.cc") -else() - # moe not supported for ROCm EP - list(APPEND contrib_ops_excluded_files "collective/sharded_moe.h") - list(APPEND contrib_ops_excluded_files "collective/sharded_moe.cc") +endif() + +if (NOT onnxruntime_ENABLE_ATEN) + list(APPEND contrib_ops_excluded_files "aten_ops/aten_op.cc") endif() set(provider_excluded_files diff --git a/cmake/patches/neural_speed/150e7527d5286ddd3a995c228dedf8d76a7a86bc.patch b/cmake/patches/neural_speed/150e7527d5286ddd3a995c228dedf8d76a7a86bc.patch new file mode 100644 index 0000000000000..e503a512a74ff --- /dev/null +++ b/cmake/patches/neural_speed/150e7527d5286ddd3a995c228dedf8d76a7a86bc.patch @@ -0,0 +1,30 @@ +diff --git a/bestla/bestla/bestla_prologue_b.h b/bestla/bestla/bestla_prologue_b.h +index 99f3ccc..a11de9d 100644 +--- a/bestla/bestla/bestla_prologue_b.h ++++ b/bestla/bestla/bestla_prologue_b.h +@@ -456,9 +456,8 @@ class WeightKBlockNInteger { + auto tmpscales = tmp; + auto tmpzeropoints = reinterpret_cast(tmpscales + N * blks); + if (scales) { +- for (size_t i = 0; i < N * blks; i += 2) { ++ for (size_t i = 0; i < N * blks; i ++) { + tmpscales[i] = scales[i] / 16; +- tmpscales[i + 1] = scales[i + 1] / 16; + } + } + if (zero_points) { +diff --git a/bestla/bestla/kernel_avx512f.h b/bestla/bestla/kernel_avx512f.h +index 6783ee8..59822e5 100644 +--- a/bestla/bestla/kernel_avx512f.h ++++ b/bestla/bestla/kernel_avx512f.h +@@ -673,8 +673,8 @@ inline BTLA_CODE decompress_kblock_s3_s8fp(utils::bit2x4* bit2ptr, utils::bit1x8 + zmm1 = _mm512_sllv_epi32(zmm1, zmm_shift); // int3_clip => int8 + zmm2 = _mm512_sllv_epi32(zmm2, zmm_shift); // int3_clip => int8 + +- _mm512_storeu_epi8((__m512i*)dst, zmm1); +- _mm512_storeu_epi8((__m512i*)(dst + 64), zmm2); ++ _mm512_storeu_si512((__m512i*)dst, zmm1); ++ _mm512_storeu_si512((__m512i*)(dst + 64), zmm2); + }; + + assert(head_ignore_num % 8 == 0); diff --git a/dockerfiles/Dockerfile.openvino b/dockerfiles/Dockerfile.openvino index 78d04a51ba162..049916fac92f1 100644 --- a/dockerfiles/Dockerfile.openvino +++ b/dockerfiles/Dockerfile.openvino @@ -1,9 +1,9 @@ #------------------------------------------------------------------------- -# Copyright(C) 2021-2023 Intel Corporation. +# Copyright(C) 2021-2024 Intel Corporation. 
# SPDX-License-Identifier: MIT #-------------------------------------------------------------------------- -ARG OPENVINO_VERSION=2023.0.0 +ARG OPENVINO_VERSION=2024.0.0 # Build stage @@ -17,7 +17,7 @@ ARG DEVICE=CPU_FP32 ARG ONNXRUNTIME_REPO=https://github.com/microsoft/onnxruntime.git ARG ONNXRUNTIME_BRANCH=main -ENV InferenceEngine_DIR=${INTEL_OPENVINO_DIR}/runtime/cmake +ENV OpenVINO_DIR=${INTEL_OPENVINO_DIR}/runtime/cmake USER root RUN apt update; apt install -y git protobuf-compiler libprotobuf-dev diff --git a/dockerfiles/Dockerfile.openvino-centos7 b/dockerfiles/Dockerfile.openvino-centos7 deleted file mode 100755 index 697db44801e3b..0000000000000 --- a/dockerfiles/Dockerfile.openvino-centos7 +++ /dev/null @@ -1,105 +0,0 @@ -#------------------------------------------------------------------------- -# Copyright(C) 2021 Intel Corporation. -# SPDX-License-Identifier: MIT -#-------------------------------------------------------------------------- - -FROM centos:7.8.2003 - -WORKDIR /code - -ARG MY_ROOT=/code -ARG YUM_OV_PACKAGE=intel-openvino-runtime-centos7-2021.4.752.x86_64 -ARG DEVICE=CPU_FP32 -ARG ONNXRUNTIME_REPO=https://github.com/microsoft/onnxruntime -ARG ONNXRUNTIME_BRANCH=main - -ENV INTEL_OPENVINO_DIR=/opt/intel/openvino_2021.4.752 -ENV InferenceEngine_DIR=${INTEL_OPENVINO_DIR}/deployment_tools/inference_engine/share -ENV IE_PLUGINS_PATH=${INTEL_OPENVINO_DIR}/deployment_tools/inference_engine/lib/intel64 -ENV ngraph_DIR=${INTEL_OPENVINO_DIR}/deployment_tools/ngraph/cmake -ENV LD_LIBRARY_PATH=/opt/intel/opencl:${INTEL_OPENVINO_DIR}/inference_engine/external/gna/lib:${INTEL_OPENVINO_DIR}/deployment_tools/inference_engine/external/mkltiny_lnx/lib:$INTEL_OPENVINO_DIR/deployment_tools/ngraph/lib:${INTEL_OPENVINO_DIR}/deployment_tools/inference_engine/external/omp/lib:${INTEL_OPENVINO_DIR}/deployment_tools/inference_engine/external/tbb/lib:${IE_PLUGINS_PATH}:${LD_LIBRARY_PATH} -ENV OpenCV_DIR=${INTEL_OPENVINO_DIR}/opencv/share/OpenCV -ENV LD_LIBRARY_PATH=${INTEL_OPENVINO_DIR}/opencv/lib:${INTEL_OPENVINO_DIR}/opencv/share/OpenCV/3rdparty/lib:${LD_LIBRARY_PATH} -ENV HDDL_INSTALL_DIR=${INTEL_OPENVINO_DIR}/deployment_tools/inference_engine/external/hddl -ENV LD_LIBRARY_PATH=${INTEL_OPENVINO_DIR}/deployment_tools/inference_engine/external/hddl/lib:$LD_LIBRARY_PATH -ENV LD_LIBRARY_PATH=/usr/local/lib:/usr/lib:/usr/local/lib64:/usr/lib64:/lib64:$LD_LIBRARY_PATH - -# Install packages -RUN yum update -y && \ - yum groupinstall "Development Tools" -y && \ - yum install -y yum-utils autoconf automake libtool unzip udev wget zlib-devel libffi-devel openssl-devel boost-devel-1.53.0 && \ - yum clean packages && yum clean all && rm -rf /var/cache/yum && \ -# Install cmake - cd $MY_ROOT && \ - wget https://github.com/Kitware/CMake/releases/download/v3.27.3/cmake-3.27.3.tar.gz && \ - tar -zxvf cmake-3.27.3.tar.gz && rm -rf cmake-3.27.3.tar.gz && \ - cd cmake-3.27.3 && \ - ./bootstrap && \ - make && \ - make install && \ - cd $MY_ROOT && \ -# libusb1.0.22 - cd /opt/ && wget https://github.com/libusb/libusb/archive/v1.0.22.zip && \ - unzip v1.0.22.zip && rm -rf v1.0.22.zip && cd /opt/libusb-1.0.22 && \ -# bootstrap steps - ./bootstrap.sh && \ - ./configure --disable-udev --enable-shared && \ - make -j4 && \ -# configure libusb1.0.22 - cd /opt/libusb-1.0.22/libusb && \ - /bin/mkdir -p '/usr/local/lib' && \ - /bin/bash ../libtool --mode=install /usr/bin/install -c libusb-1.0.la '/usr/local/lib' && \ - /bin/mkdir -p '/usr/local/include/libusb-1.0' && \ - /usr/bin/install -c -m 644 libusb.h 
'/usr/local/include/libusb-1.0' && \ - /bin/mkdir -p '/usr/local/lib/pkgconfig' && \ -# Install openvino - yum-config-manager --add-repo https://yum.repos.intel.com/openvino/2021/setup/intel-openvino-2021.repo && \ - rpm --import https://yum.repos.intel.com/openvino/2021/setup/RPM-GPG-KEY-INTEL-OPENVINO-2021 && \ - yum update -y && yum list intel-openvino* && \ - yum install -y $YUM_OV_PACKAGE && \ - cd ${INTEL_OPENVINO_DIR}/install_dependencies/ && ./install_openvino_dependencies.sh -y && \ - printf "\nexport LD_LIBRARY_PATH=\${LD_LIBRARY_PATH}:/usr/local/lib\n" >> /opt/intel/openvino_2021.4.752/bin/setupvars.sh && \ - cd /opt/libusb-1.0.22 && \ - /usr/bin/install -c -m 644 libusb-1.0.pc '/usr/local/lib/pkgconfig' && \ - cp /opt/intel/openvino_2021/deployment_tools/inference_engine/external/97-myriad-usbboot.rules /etc/udev/rules.d/ && \ - ldconfig && \ -# Install GPU runtime and drivers - cd ${MY_ROOT} && \ - mkdir /tmp/opencl && \ - cd /tmp/opencl && \ - yum install -y epel-release && \ - yum install -y ocl-icd ocl-icd-devel && \ - wget -O intel-igc-core-1.0.2597-1.el7.x86_64.rpm https://sourceforge.net/projects/intel-compute-runtime/files/19.41.14441/centos-7/intel-igc-core-1.0.2597-1.el7.x86_64.rpm/download && \ - wget -O intel-opencl-19.41.14441-1.el7.x86_64.rpm https://sourceforge.net/projects/intel-compute-runtime/files/19.41.14441/centos-7/intel-opencl-19.41.14441-1.el7.x86_64.rpm/download && \ - wget -O intel-igc-opencl-devel-1.0.2597-1.el7.x86_64.rpm https://sourceforge.net/projects/intel-compute-runtime/files/19.41.14441/centos-7/intel-igc-opencl-devel-1.0.2597-1.el7.x86_64.rpm/download && \ - wget -O intel-igc-opencl-1.0.2597-1.el7.x86_64.rpm https://sourceforge.net/projects/intel-compute-runtime/files/19.41.14441/centos-7/intel-igc-opencl-1.0.2597-1.el7.x86_64.rpm/download && \ - wget -O intel-gmmlib-19.3.2-1.el7.x86_64.rpm https://sourceforge.net/projects/intel-compute-runtime/files/19.41.14441/centos-7/intel-gmmlib-19.3.2-1.el7.x86_64.rpm/download && \ - wget -O intel-gmmlib-devel-19.3.2-1.el7.x86_64.rpm https://sourceforge.net/projects/intel-compute-runtime/files/19.41.14441/centos-7/intel-gmmlib-devel-19.3.2-1.el7.x86_64.rpm/download && \ - rpm -i /tmp/opencl/*.rpm && \ - ldconfig && \ - rm -rf /tmp/opencl && \ -# Installing gcc-10 - yum install -y centos-release-scl && \ - yum install -y devtoolset-10-gcc* && \ - echo 'source scl_source enable devtoolset-10' >> ~/.bashrc && \ -# python installation - source scl_source enable devtoolset-10 && \ - cd /code/ && \ - wget https://www.python.org/ftp/python/3.8.3/Python-3.8.3.tgz && tar xvf Python-3.8.3.tgz && \ - cd Python-3.8*/ && ./configure && make && make install && \ - cd ../ && mkdir -p /usr/bin/Python38 && ln -s Python-3.8.3/ /usr/bin/Python38 && \ -# installing dependancies - yum install -y python3-lxml python3-six libusb.x86_64 && \ - yum clean packages && yum clean all && rm -rf /var/cache/yum && \ -# Build onnxruntime - cd $MY_ROOT && \ - pip3 install numpy wheel setuptools cython && \ - git clone --recursive -b ${ONNXRUNTIME_BRANCH} ${ONNXRUNTIME_REPO} && \ - pip3 install onnx && \ - cd /code/onnxruntime && ./build.sh --allow_running_as_root --config Release --update --build --parallel --use_openvino ${DEVICE} --build_shared_lib --build_wheel && \ - pip3 install /code/onnxruntime/build/Linux/Release/dist/*-linux_x86_64.whl && \ -# Clean up - cd $MY_ROOT && rm -rf onnxruntime Python-3* && \ - cd ${MY_ROOT}/ && rm -rf cmake* && \ - cd /usr/share/ && rm -rf gcc* && cd /usr/lib/ && rm -rf gcc cd && rm -rf .cache && \ - 
cd ${INTEL_OPENVINO_DIR}/ && rm -rf documentation data_processing && cd deployment_tools/ && rm -rf tools diff --git a/dockerfiles/Dockerfile.openvino-csharp b/dockerfiles/Dockerfile.openvino-csharp deleted file mode 100644 index 2529ef4b73209..0000000000000 --- a/dockerfiles/Dockerfile.openvino-csharp +++ /dev/null @@ -1,90 +0,0 @@ -#------------------------------------------------------------------------- -# Copyright(C) 2021-2023 Intel Corporation. -# SPDX-License-Identifier: MIT -#-------------------------------------------------------------------------- - -ARG OPENVINO_VERSION=2023.0.0 - -# Build stage -FROM openvino/ubuntu20_runtime:${OPENVINO_VERSION} AS base - -ENV WORKDIR_PATH=/home/openvino -WORKDIR $WORKDIR_PATH -ENV DEBIAN_FRONTEND noninteractive - -USER root -RUN apt update; apt install -y --no-install-recommends wget gnupg && \ - rm -rf /var/lib/apt/lists/* - -# Install Mono -RUN wget http://download.mono-project.com/repo/xamarin.gpg && apt-key add xamarin.gpg && rm xamarin.gpg && \ - echo "deb https://download.mono-project.com/repo/ubuntu stable-bionic main" | tee /etc/apt/sources.list.d/mono-official-stable.list && \ - apt update -y && \ - apt install -y mono-devel - -# Install nuget.exe -RUN wget https://dist.nuget.org/win-x86-commandline/latest/nuget.exe && \ - mv nuget.exe /usr/local/bin/nuget.exe && \ - echo 'mono /usr/local/bin/nuget.exe $@' > /usr/local/bin/nuget && \ - chmod a+x /usr/local/bin/nuget - -# Install .NET core -RUN wget https://packages.microsoft.com/config/ubuntu/20.04/packages-microsoft-prod.deb -O packages-microsoft-prod.deb && \ - dpkg -i packages-microsoft-prod.deb && \ - apt-get update -y &&\ - apt-get install -y apt-transport-https && \ - apt-get update -y && \ - apt-get install -y dotnet-sdk-5.0 - -# Build stage -FROM base AS builder - -ENV WORKDIR_PATH=/home/openvino -WORKDIR $WORKDIR_PATH -ENV DEBIAN_FRONTEND noninteractive - -ARG DEVICE=CPU_FP32 -ARG ONNXRUNTIME_REPO=https://github.com/microsoft/onnxruntime.git -ARG ONNXRUNTIME_BRANCH=main - -ENV InferenceEngine_DIR=${INTEL_OPENVINO_DIR}/runtime/cmake -ENV LANG en_US.UTF-8 - -USER root -RUN apt update; apt install -y --no-install-recommends git protobuf-compiler libprotobuf-dev ca-certificates unattended-upgrades && \ - unattended-upgrade && \ - rm -rf /var/lib/apt/lists/* - -RUN git clone --recursive -b ${ONNXRUNTIME_BRANCH} ${ONNXRUNTIME_REPO} -RUN /bin/sh onnxruntime/dockerfiles/scripts/install_common_deps.sh -RUN ln -s cmake-* cmake-dir -RUN python3 -m pip install wheel -ENV PATH=${WORKDIR_PATH}/cmake-dir/bin:$PATH -RUN pip3 install onnx -RUN ln -s /usr/bin/python3 /usr/bin/python -RUN apt install locales && \ - locale-gen en_US en_US.UTF-8 && \ - dpkg-reconfigure locales -RUN cd onnxruntime && ./build.sh --allow_running_as_root --config Release --update --build --parallel --use_openvino ${DEVICE} --build_nuget --build_shared_lib -RUN cp /home/openvino/onnxruntime/build/Linux/Release/Microsoft.ML.OnnxRuntime.Managed* /home/openvino/onnxruntime/build/Linux/Release/nuget-artifacts - -# Deploy stage -FROM base - -ENV DEBIAN_FRONTEND noninteractive -USER root - -RUN apt update; apt install -y unattended-upgrades fonts-freefont-ttf && \ - unattended-upgrade -ARG BUILD_UID=1001 -ARG BUILD_USER=onnxruntimedev -RUN adduser --uid $BUILD_UID $BUILD_USER -RUN usermod -a -G video,users ${BUILD_USER} -ENV WORKDIR_PATH /home/${BUILD_USER} -WORKDIR ${WORKDIR_PATH} -COPY --from=builder /home/openvino/onnxruntime/build/Linux/Release/nuget-artifacts ${WORKDIR_PATH}/nuget-artifacts - -USER ${BUILD_USER} -ENV 
PATH=${WORKDIR_PATH}/miniconda/bin:${WORKDIR_PATH}/cmake-dir/bin:$PATH -ENV IE_PLUGINS_PATH=${INTEL_OPENVINO_DIR}/runtime/lib/intel64 -ENV LD_LIBRARY_PATH=/opt/intel/opencl:${INTEL_OPENVINO_DIR}/runtime/3rdparty/tbb/lib:${IE_PLUGINS_PATH}:${LD_LIBRARY_PATH} diff --git a/dockerfiles/Dockerfile.openvino-rhel8 b/dockerfiles/Dockerfile.openvino-rhel8 deleted file mode 100644 index 5c504cfa553a1..0000000000000 --- a/dockerfiles/Dockerfile.openvino-rhel8 +++ /dev/null @@ -1,87 +0,0 @@ -# Build stage -FROM registry.access.redhat.com/ubi8/ubi:8.4 - -WORKDIR /code - -ARG MY_ROOT=/code -ARG DEVICE=CPU_FP32 -ARG ONNXRUNTIME_REPO=https://github.com/microsoft/onnxruntime -ARG ONNXRUNTIME_BRANCH=main - -ENV INTEL_OPENVINO_DIR=/opt/intel/openvino_2022.3.0 - -ENV InferenceEngine_DIR=${INTEL_OPENVINO_DIR}/runtime/cmake -ENV IE_PLUGINS_PATH=${INTEL_OPENVINO_DIR}/runtime/lib/intel64/ -ENV ngraph_DIR=${INTEL_OPENVINO_DIR}/runtime/cmake -ENV LD_LIBRARY_PATH=${INTEL_OPENVINO_DIR}/runtime/3rdparty/tbb/lib/:${IE_PLUGINS_PATH}:${LD_LIBRARY_PATH} -ENV OpenCV_DIR=${INTEL_OPENVINO_DIR}/extras/opencv/cmake -ENV LD_LIBRARY_PATH=${INTEL_OPENVINO_DIR}/extras/opencv/lib:${LD_LIBRARY_PATH} -ENV LD_LIBRARY_PATH=/usr/local/lib:/usr/lib:/usr/local/lib64:/usr/lib64:/lib64:${LD_LIBRARY_PATH} -ENV PATH=${MY_ROOT}/cmake-dir/bin:$PATH - -# Install packages -RUN yum install -y yum-utils autoconf automake libtool unzip udev wget zlib-devel libffi-devel openssl-devel git make gcc && \ - yum clean packages && yum clean all && rm -rf /var/cache/yum && \ -# Install python 3.8 - cd $MY_ROOT && \ - wget https://www.python.org/ftp/python/3.8.9/Python-3.8.9.tgz && tar xvf Python-3.8.9.tgz && rm -rf Python-3.8.9.tgz && \ - cd Python-3.8*/ && ./configure && make && make install && \ - cd ../ && mkdir -p /usr/bin/Python38 && ln -s Python-3.8.9/ /usr/bin/Python38 && ln -s /usr/bin/pip3 /usr/bin/pip && \ -# libusb1.0.22 - cd /opt/ && wget https://github.com/libusb/libusb/archive/v1.0.22.zip && \ - unzip v1.0.22.zip && rm -rf v1.0.22.zip && cd /opt/libusb-1.0.22 && \ -# bootstrap steps - ./bootstrap.sh && \ - ./configure --disable-udev --enable-shared && \ - make -j4 && \ -# configure libusb1.0.22 - cd /opt/libusb-1.0.22/libusb && \ - /bin/mkdir -p '/usr/local/lib' && \ - /bin/bash ../libtool --mode=install /usr/bin/install -c libusb-1.0.la '/usr/local/lib' && \ - /bin/mkdir -p '/usr/local/include/libusb-1.0' && \ - /usr/bin/install -c -m 644 libusb.h '/usr/local/include/libusb-1.0' && \ - /bin/mkdir -p '/usr/local/lib/pkgconfig' && \ -# Install openvino - cd /opt/ && mkdir intel/ && cd intel && \ - wget https://storage.openvinotoolkit.org/repositories/openvino/packages/2022.3/linux/l_openvino_toolkit_rhel8_2022.3.0.9052.9752fafe8eb_x86_64.tgz && \ - tar xvf l_openvino_toolkit_rhel8_2022.3.0.9052.9752fafe8eb_x86_64.tgz && \ - rm -rf l_openvino_toolkit_rhel8_2022.3.0.9052.9752fafe8eb_x86_64.tgz && \ - mv l_openvino_toolkit_rhel8_2022.3.0.9052.9752fafe8eb_x86_64 openvino_2022.3.0 && \ - cd ${INTEL_OPENVINO_DIR}/install_dependencies/ && ./install_openvino_dependencies.sh -y && ./install_NEO_OCL_driver.sh -y && \ - printf "\nexport LD_LIBRARY_PATH=\${LD_LIBRARY_PATH}:/usr/local/lib\n" >> /opt/intel/openvino_2022.3.0/setupvars.sh && \ - cd /opt/libusb-1.0.22 && \ - /usr/bin/install -c -m 644 libusb-1.0.pc '/usr/local/lib/pkgconfig' && \ - # MYRIAD plugins are not available for openvino 2022.3.0 release - #cp /opt/intel/openvino_2022.3.0/install_dependencies/97-myriad-usbboot.rules /etc/udev/rules.d/ && \ - ldconfig && \ -#Install protobuf - cd 
$MY_ROOT && \ - git clone https://github.com/protocolbuffers/protobuf.git && \ - cd protobuf && \ - git checkout v3.16.0 && \ - git submodule update --init --recursive && \ - mkdir build_source && cd build_source && \ - cmake ../cmake -DCMAKE_INSTALL_LIBDIR=lib64 -Dprotobuf_BUILD_SHARED_LIBS=OFF -DCMAKE_INSTALL_PREFIX=/usr -DCMAKE_INSTALL_SYSCONFDIR=/etc -DCMAKE_POSITION_INDEPENDENT_CODE=ON -Dprotobuf_BUILD_TESTS=OFF -DCMAKE_BUILD_TYPE=Release && \ - make -j$(nproc) && \ - make install && \ -# Build onnxruntime - cd $MY_ROOT && \ - pip3 install numpy wheel setuptools cython onnx && \ - git clone --recursive -b ${ONNXRUNTIME_BRANCH} ${ONNXRUNTIME_REPO} && \ - bash onnxruntime/dockerfiles/scripts/install_common_deps.sh && \ - ln -s cmake-* cmake-dir && \ - source /opt/intel/openvino_2022.3.0/setupvars.sh && \ - cd /code/onnxruntime && ./build.sh --allow_running_as_root --config Release --update --build --parallel --use_openvino ${DEVICE} --build_shared_lib --build_wheel && \ - pip3 install /code/onnxruntime/build/Linux/Release/dist/*-linux_x86_64.whl && \ -# Clean up - cd ${MY_ROOT} && rm -rf onnxruntime && rm -rf Python-3.8.9 && rm -rf protobuf - -# Deploy stage -ARG BUILD_UID=1001 -ARG BUILD_USER=onnxruntimedev -RUN adduser --uid $BUILD_UID $BUILD_USER -RUN usermod -a -G video,users,render ${BUILD_USER} -ENV WORKDIR_PATH /home/${BUILD_USER} - -WORKDIR ${WORKDIR_PATH} -USER ${BUILD_USER} diff --git a/docs/ContribOperators.md b/docs/ContribOperators.md index 5f0100fad95a2..32a4ca16b7824 100644 --- a/docs/ContribOperators.md +++ b/docs/ContribOperators.md @@ -2931,8 +2931,8 @@ This version of the operator has been available since version 1 of the 'com.micr ### **com.microsoft.MoE** Mixture of experts. Examples: Switch transformer(https://arxiv.org/pdf/2101.03961.pdf) use top 1, - GLaM(https://arxiv.org/abs/2112.06905) activates top 2 FFN, and Vision MOE(https://arxiv.org/pdf/2106.05974.pdf) - usually uses top 32 experts. + GLaM(https://arxiv.org/abs/2112.06905) activates top 2 FFN, Vision MOE(https://arxiv.org/pdf/2106.05974.pdf) + usually uses top 32 experts and Mixtral(https://huggingface.co/blog/mixtral). #### Version @@ -2946,9 +2946,11 @@ This version of the operator has been available since version 1 of the 'com.micr
 <dd>Activation function to use. Choose from relu, gelu, silu and identity. Default is relu</dd>
 <dt><tt>k</tt> : int</dt>
 <dd>Number of top experts to select from expert pool</dd>
+<dt><tt>normalize_routing_weights</tt> : int</dt>
+<dd>Whether to normalize routing weights</dd>
 </dl>
 
-#### Inputs (4 - 6)
+#### Inputs (5 - 8)
 
 <dl>
 <dt><tt>input</tt> : T</dt>
@@ -2957,12 +2959,16 @@ This version of the operator has been available since version 1 of the 'com.micr
 <dd>2D input tensor with shape (num_rows, num_experts)</dd>
 <dt><tt>fc1_experts_weights</tt> : T</dt>
 <dd>3D input tensor with shape (num_experts, hidden_size, inter_size)</dd>
-<dt><tt>fc2_experts_weights</tt> : T</dt>
-<dd>3D input tensor with shape (num_experts, inter_size, hidden_size)</dd>
 <dt><tt>fc1_experts_bias</tt> (optional) : T</dt>
 <dd>2D optional input tensor with shape (num_experts, inter_size)</dd>
+<dt><tt>fc2_experts_weights</tt> : T</dt>
+<dd>3D input tensor with shape (num_experts, inter_size, hidden_size)</dd>
 <dt><tt>fc2_experts_bias</tt> (optional) : T</dt>
 <dd>2D optional input tensor with shape (num_experts, hidden_size)</dd>
+<dt><tt>fc3_experts_weights</tt> (optional) : T</dt>
+<dd>3D optional input tensor with shape (num_experts, hidden_size, inter_size)</dd>
+<dt><tt>fc3_experts_bias</tt> (optional) : T</dt>
+<dd>2D optional input tensor with shape (num_experts, inter_size)</dd>
 </dl>
 
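The expanded input list above can be wired up with the ONNX Python helper. The following is an illustrative sketch only, not code taken from this change: the tensor names and the `k`, `activation_type`, and `normalize_routing_weights` values are placeholders, optional inputs are skipped with empty strings, and the node is assumed to live in the `com.microsoft` domain as documented above.

```python
# Illustrative sketch: build a com.microsoft.MoE node with the reordered inputs
# and the new optional fc3_* weights / normalize_routing_weights attribute.
from onnx import helper

moe_node = helper.make_node(
    "MoE",
    inputs=[
        "input",                # (num_rows, hidden_size)
        "router_probs",         # (num_rows, num_experts)
        "fc1_experts_weights",  # (num_experts, hidden_size, inter_size)
        "",                     # fc1_experts_bias omitted (optional)
        "fc2_experts_weights",  # (num_experts, inter_size, hidden_size)
        "",                     # fc2_experts_bias omitted (optional)
        "fc3_experts_weights",  # (num_experts, hidden_size, inter_size), optional
        # trailing optional fc3_experts_bias omitted entirely
    ],
    outputs=["output"],
    domain="com.microsoft",
    activation_type="silu",       # one of relu, gelu, silu, identity
    k=2,                          # number of top experts to route to
    normalize_routing_weights=1,  # attribute added in this change
)
print(moe_node)
```

Because fc3_experts_weights and fc3_experts_bias are optional, existing graphs that only use the fc1/fc2 matrices remain valid under the new signature.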
 #### Outputs
diff --git a/docs/OperatorKernels.md b/docs/OperatorKernels.md
index eddc3b7873d80..bca8e17b3dfd4 100644
--- a/docs/OperatorKernels.md
+++ b/docs/OperatorKernels.md
@@ -861,7 +861,7 @@ Do not modify directly.*
 |LongformerAttention|*in* input:**T**<br> *in* weight:**T**<br> *in* bias:**T**<br> *in* mask:**T**<br> *in* global_weight:**T**<br> *in* global_bias:**T**<br> *in* global:**G**<br> *out* output:**T**|1+|**T** = tensor(float), tensor(float16)|
 |MatMulBnb4|*in* A:**T1**<br> *in* B:**T2**<br> *in* absmax:**T1**<br> *out* Y:**T1**|1+|**T1** = tensor(bfloat16), tensor(float), tensor(float16)<br> **T2** = tensor(uint8)|
 |MatMulNBits|*in* A:**T1**<br> *in* B:**T2**<br> *in* scales:**T1**<br> *in* zero_points:**T3**<br> *in* g_idx:**T4**<br> *out* Y:**T1**|1+|**T1** = tensor(float), tensor(float16)<br> **T2** = tensor(uint8)|
-|MoE|*in* input:**T**<br> *in* router_probs:**T**<br> *in* fc1_experts_weights:**T**<br> *in* fc2_experts_weights:**T**<br> *in* fc1_experts_bias:**T**<br> *in* fc2_experts_bias:**T**<br> *out* output:**T**|1+|**T** = tensor(float), tensor(float16)|
+|MoE|*in* input:**T**<br> *in* router_probs:**T**<br> *in* fc1_experts_weights:**T**<br> *in* fc1_experts_bias:**T**<br> *in* fc2_experts_weights:**T**<br> *in* fc2_experts_bias:**T**<br> *in* fc3_experts_weights:**T**<br> *in* fc3_experts_bias:**T**<br> *out* output:**T**|1+|**T** = tensor(float), tensor(float16)|
 |MultiHeadAttention|*in* query:**T**<br> *in* key:**T**<br> *in* value:**T**<br> *in* bias:**T**<br> *in* key_padding_mask:**M**<br> *in* relative_position_bias:**T**<br> *in* past_key:**T**<br> *in* past_value:**T**<br> *out* output:**T**<br> *out* present_key:**T**<br> *out* present_value:**T**|1+|**T** = tensor(float), tensor(float16)|
 |NGramRepeatBlock|*in* input_ids:**Tid**<br> *in* scores:**T**<br> *out* scores_out:**T**|1+|**T** = tensor(float)<br> **Tid** = tensor(int64)|
 |NhwcConv|*in* X:**T**<br> *in* W:**T**<br> *in* B:**T**<br>
*out* Y:**T**|1+|**T** = tensor(float), tensor(float16)| diff --git a/include/onnxruntime/core/session/onnxruntime_c_api.h b/include/onnxruntime/core/session/onnxruntime_c_api.h index cef50163f68b0..41b034e9c1dcc 100644 --- a/include/onnxruntime/core/session/onnxruntime_c_api.h +++ b/include/onnxruntime/core/session/onnxruntime_c_api.h @@ -1837,14 +1837,28 @@ struct OrtApi { /** \brief Used for custom operators, get an input of a kernel * - * \see ::OrtCustomOp + * The function attempts fetches the input of the kernel. If the input is optional + * and not present, the function returns success and out is set to nullptr. + * + * \param[in] context ::OrtKernelContext instance + * \param[in] input index. See KernelContext_GetInputCount for boundaries check. + * \param[in, out] returns a ptr to OrtValue if the input is present + * + * \snippet{doc} snippets.dox OrtStatus Return Value */ ORT_API2_STATUS(KernelContext_GetInput, _In_ const OrtKernelContext* context, _In_ size_t index, _Out_ const OrtValue** out); /** \brief Used for custom operators, get an output of a kernel * - * \see ::OrtCustomOp + * The function attempts fetches the output of the kernel. If the output is optional + * and not present, the function returns success and out is set to nullptr. + * + * \param[in] context ::OrtKernelContext instance + * \param[in] output index. See KernelContext_GetOutputCount for boundaries check. + * \param[in, out] returns a ptr to OrtValue if the output is present + * + * \snippet{doc} snippets.dox OrtStatus Return Value */ ORT_API2_STATUS(KernelContext_GetOutput, _Inout_ OrtKernelContext* context, _In_ size_t index, _In_ const int64_t* dim_values, size_t dim_count, _Outptr_ OrtValue** out); diff --git a/include/onnxruntime/core/session/onnxruntime_cxx_api.h b/include/onnxruntime/core/session/onnxruntime_cxx_api.h index ae4c4bef90c64..60540514fbfa6 100644 --- a/include/onnxruntime/core/session/onnxruntime_cxx_api.h +++ b/include/onnxruntime/core/session/onnxruntime_cxx_api.h @@ -2055,7 +2055,11 @@ struct KernelContext { explicit KernelContext(OrtKernelContext* context); size_t GetInputCount() const; size_t GetOutputCount() const; + // If input is optional and is not present, the method returns en empty ConstValue + // which can be compared to nullptr. ConstValue GetInput(size_t index) const; + // If outout is optional and is not present, the method returns en empty UnownedValue + // which can be compared to nullptr. UnownedValue GetOutput(size_t index, const int64_t* dim_values, size_t dim_count) const; UnownedValue GetOutput(size_t index, const std::vector& dims) const; void* GetGPUComputeStream() const; diff --git a/js/common/lib/backend-impl.ts b/js/common/lib/backend-impl.ts index 3e1e833addb91..e90efd7b97c29 100644 --- a/js/common/lib/backend-impl.ts +++ b/js/common/lib/backend-impl.ts @@ -2,6 +2,7 @@ // Licensed under the MIT License. import {Backend} from './backend.js'; +import {InferenceSession} from './inference-session.js'; interface BackendInfo { backend: Backend; @@ -10,6 +11,7 @@ interface BackendInfo { initPromise?: Promise; initialized?: boolean; aborted?: boolean; + error?: string; } const backends: Map = new Map(); @@ -60,43 +62,100 @@ export const registerBackend = (name: string, backend: Backend, priority: number }; /** - * Resolve backend by specified hints. + * Try to resolve and initialize a backend. * - * @param backendHints - a list of execution provider names to lookup. If omitted use registered backends as list. - * @returns a promise that resolves to the backend. 
+ * @param backendName - the name of the backend. + * @returns the backend instance if resolved and initialized successfully, or an error message if failed. + */ +const tryResolveAndInitializeBackend = async(backendName: string): Promise => { + const backendInfo = backends.get(backendName); + if (!backendInfo) { + return 'backend not found.'; + } + + if (backendInfo.initialized) { + return backendInfo.backend; + } else if (backendInfo.aborted) { + return backendInfo.error!; + } else { + const isInitializing = !!backendInfo.initPromise; + try { + if (!isInitializing) { + backendInfo.initPromise = backendInfo.backend.init(backendName); + } + await backendInfo.initPromise; + backendInfo.initialized = true; + return backendInfo.backend; + } catch (e) { + if (!isInitializing) { + backendInfo.error = `${e}`; + backendInfo.aborted = true; + } + return backendInfo.error!; + } finally { + delete backendInfo.initPromise; + } + } +}; + +/** + * Resolve execution providers from the specific session options. + * + * @param options - the session options object. + * @returns a promise that resolves to a tuple of an initialized backend instance and a session options object with + * filtered EP list. * * @ignore */ -export const resolveBackend = async(backendHints: readonly string[]): Promise => { - const backendNames = backendHints.length === 0 ? backendsSortedByPriority : backendHints; - const errors = []; - for (const backendName of backendNames) { - const backendInfo = backends.get(backendName); - if (backendInfo) { - if (backendInfo.initialized) { - return backendInfo.backend; - } else if (backendInfo.aborted) { - continue; // current backend is unavailable; try next - } +export const resolveBackendAndExecutionProviders = async(options: InferenceSession.SessionOptions): + Promise<[backend: Backend, options: InferenceSession.SessionOptions]> => { + // extract backend hints from session options + const eps = options.executionProviders || []; + const backendHints = eps.map(i => typeof i === 'string' ? i : i.name); + const backendNames = backendHints.length === 0 ? backendsSortedByPriority : backendHints; - const isInitializing = !!backendInfo.initPromise; - try { - if (!isInitializing) { - backendInfo.initPromise = backendInfo.backend.init(backendName); + // try to resolve and initialize all requested backends + let backend: Backend|undefined; + const errors = []; + const availableBackendNames = new Set(); + for (const backendName of backendNames) { + const resolveResult = await tryResolveAndInitializeBackend(backendName); + if (typeof resolveResult === 'string') { + errors.push({name: backendName, err: resolveResult}); + } else { + if (!backend) { + backend = resolveResult; + } + if (backend === resolveResult) { + availableBackendNames.add(backendName); + } } - await backendInfo.initPromise; - backendInfo.initialized = true; - return backendInfo.backend; - } catch (e) { - if (!isInitializing) { - errors.push({name: backendName, err: e}); + } + + // if no backend is available, throw error. + if (!backend) { + throw new Error(`no available backend found. ERR: ${errors.map(e => `[${e.name}] ${e.err}`).join(', ')}`); + } + + // for each explicitly requested backend, if it's not available, output warning message. 
+ for (const {name, err} of errors) { + if (backendHints.includes(name)) { + // eslint-disable-next-line no-console + console.warn(`removing requested execution provider "${ + name}" from session options because it is not available: ${err}`); } - backendInfo.aborted = true; - } finally { - delete backendInfo.initPromise; } - } - } - throw new Error(`no available backend found. ERR: ${errors.map(e => `[${e.name}] ${e.err}`).join(', ')}`); -}; + const filteredEps = eps.filter(i => availableBackendNames.has(typeof i === 'string' ? i : i.name)); + + return [ + backend, new Proxy(options, { + get: (target, prop) => { + if (prop === 'executionProviders') { + return filteredEps; + } + return Reflect.get(target, prop); + } + }) + ]; + }; diff --git a/js/common/lib/backend.ts b/js/common/lib/backend.ts index 9bfcb12206057..8c07bdd5c5c4a 100644 --- a/js/common/lib/backend.ts +++ b/js/common/lib/backend.ts @@ -58,7 +58,7 @@ export interface TrainingSessionHandler extends SessionHandler { options: InferenceSession.RunOptions): Promise; getParametersSize(trainableOnly: boolean): Promise; - loadParametersBuffer(array: Uint8Array, trainableOnly: boolean): Promise; + loadParametersBuffer(buffer: Uint8Array, trainableOnly: boolean): Promise; getContiguousParameters(trainableOnly: boolean): Promise; } @@ -77,8 +77,8 @@ export interface Backend { Promise; createTrainingSessionHandler? - (checkpointStateUriOrBuffer: TrainingSession.URIorBuffer, trainModelUriOrBuffer: TrainingSession.URIorBuffer, - evalModelUriOrBuffer: TrainingSession.URIorBuffer, optimizerModelUriOrBuffer: TrainingSession.URIorBuffer, + (checkpointStateUriOrBuffer: TrainingSession.UriOrBuffer, trainModelUriOrBuffer: TrainingSession.UriOrBuffer, + evalModelUriOrBuffer: TrainingSession.UriOrBuffer, optimizerModelUriOrBuffer: TrainingSession.UriOrBuffer, options: InferenceSession.SessionOptions): Promise; } diff --git a/js/common/lib/env.ts b/js/common/lib/env.ts index dd8bde2b596f4..c8df1613b3268 100644 --- a/js/common/lib/env.ts +++ b/js/common/lib/env.ts @@ -166,16 +166,20 @@ export declare namespace Env { */ forceFallbackAdapter?: boolean; /** - * Get the adapter for WebGPU. + * Set or get the adapter for WebGPU. * - * This property is only available after the first WebGPU inference session is created. + * Setting this property only has effect before the first WebGPU inference session is created. The value will be + * used as the GPU adapter for the underlying WebGPU backend to create GPU device. + * + * If this property is not set, it will be available to get after the first WebGPU inference session is created. The + * value will be the GPU adapter that created by the underlying WebGPU backend. * * When use with TypeScript, the type of this property is `GPUAdapter` defined in "@webgpu/types". * Use `const adapter = env.webgpu.adapter as GPUAdapter;` in TypeScript to access this property with correct type. * - * see comments on {@link GpuBufferType} + * see comments on {@link Tensor.GpuBufferType} */ - readonly adapter: unknown; + adapter: unknown; /** * Get the device for WebGPU. * @@ -184,7 +188,7 @@ export declare namespace Env { * When use with TypeScript, the type of this property is `GPUDevice` defined in "@webgpu/types". * Use `const device = env.webgpu.device as GPUDevice;` in TypeScript to access this property with correct type. * - * see comments on {@link GpuBufferType} for more details about why not use types defined in "@webgpu/types". 
+ * see comments on {@link Tensor.GpuBufferType} for more details about why not use types defined in "@webgpu/types". */ readonly device: unknown; /** diff --git a/js/common/lib/index.ts b/js/common/lib/index.ts index d7c98380f3fa4..3ed56b3c2e812 100644 --- a/js/common/lib/index.ts +++ b/js/common/lib/index.ts @@ -11,7 +11,7 @@ * - [onnxruntime-react-native](https://www.npmjs.com/package/onnxruntime-react-native) * * See also: - * - [Get Started](https://onnxruntime.ai/docs/get-started/with-javascript.html) + * - [Get Started](https://onnxruntime.ai/docs/get-started/with-javascript/) * - [Inference examples](https://github.com/microsoft/onnxruntime-inference-examples/tree/main/js) * * @packageDocumentation @@ -21,6 +21,9 @@ export * from './backend.js'; export * from './env.js'; export * from './inference-session.js'; export * from './tensor.js'; +export * from './tensor-conversion.js'; +export * from './tensor-factory.js'; export * from './trace.js'; +export * from './onnx-model.js'; export * from './onnx-value.js'; export * from './training-session.js'; diff --git a/js/common/lib/inference-session-impl.ts b/js/common/lib/inference-session-impl.ts index 55f40c8907a89..ab4c6a3e0c46b 100644 --- a/js/common/lib/inference-session-impl.ts +++ b/js/common/lib/inference-session-impl.ts @@ -1,7 +1,7 @@ // Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. -import {resolveBackend} from './backend-impl.js'; +import {resolveBackendAndExecutionProviders} from './backend-impl.js'; import {InferenceSessionHandler} from './backend.js'; import {InferenceSession as InferenceSessionInterface} from './inference-session.js'; import {OnnxValue} from './onnx-value.js'; @@ -195,11 +195,9 @@ export class InferenceSession implements InferenceSessionInterface { throw new TypeError('Unexpected argument[0]: must be \'path\' or \'buffer\'.'); } - // get backend hints - const eps = options.executionProviders || []; - const backendHints = eps.map(i => typeof i === 'string' ? i : i.name); - const backend = await resolveBackend(backendHints); - const handler = await backend.createInferenceSessionHandler(filePathOrUint8Array, options); + // resolve backend, update session options with validated EPs, and create session handler + const [backend, optionsWithValidatedEPs] = await resolveBackendAndExecutionProviders(options); + const handler = await backend.createInferenceSessionHandler(filePathOrUint8Array, optionsWithValidatedEPs); TRACE_FUNC_END(); return new InferenceSession(handler); } diff --git a/js/common/lib/inference-session.ts b/js/common/lib/inference-session.ts index 4f85c3b46e253..4f7fbdcdcf0ca 100644 --- a/js/common/lib/inference-session.ts +++ b/js/common/lib/inference-session.ts @@ -186,22 +186,22 @@ export declare namespace InferenceSession { // #region execution providers // Currently, we have the following backends to support execution providers: - // Backend Node.js binding: supports 'cpu' and 'cuda'. + // Backend Node.js binding: supports 'cpu', 'dml' (win32), 'coreml' (macOS) and 'cuda' (linux). // Backend WebAssembly: supports 'cpu', 'wasm', 'webgpu' and 'webnn'. // Backend ONNX.js: supports 'webgl'. // Backend React Native: supports 'cpu', 'xnnpack', 'coreml' (iOS), 'nnapi' (Android). 
interface ExecutionProviderOptionMap { + coreml: CoreMLExecutionProviderOption; cpu: CpuExecutionProviderOption; - coreml: CoreMlExecutionProviderOption; cuda: CudaExecutionProviderOption; dml: DmlExecutionProviderOption; + nnapi: NnapiExecutionProviderOption; tensorrt: TensorRtExecutionProviderOption; wasm: WebAssemblyExecutionProviderOption; webgl: WebGLExecutionProviderOption; - xnnpack: XnnpackExecutionProviderOption; webgpu: WebGpuExecutionProviderOption; webnn: WebNNExecutionProviderOption; - nnapi: NnapiExecutionProviderOption; + xnnpack: XnnpackExecutionProviderOption; } type ExecutionProviderName = keyof ExecutionProviderOptionMap; @@ -219,10 +219,6 @@ export declare namespace InferenceSession { readonly name: 'cuda'; deviceId?: number; } - export interface CoreMlExecutionProviderOption extends ExecutionProviderOption { - readonly name: 'coreml'; - coreMlFlags?: number; - } export interface DmlExecutionProviderOption extends ExecutionProviderOption { readonly name: 'dml'; deviceId?: number; @@ -253,8 +249,39 @@ export declare namespace InferenceSession { } export interface CoreMLExecutionProviderOption extends ExecutionProviderOption { readonly name: 'coreml'; + /** + * The bit flags for CoreML execution provider. + * + * ``` + * COREML_FLAG_USE_CPU_ONLY = 0x001 + * COREML_FLAG_ENABLE_ON_SUBGRAPH = 0x002 + * COREML_FLAG_ONLY_ENABLE_DEVICE_WITH_ANE = 0x004 + * COREML_FLAG_ONLY_ALLOW_STATIC_INPUT_SHAPES = 0x008 + * COREML_FLAG_CREATE_MLPROGRAM = 0x010 + * ``` + * + * See include/onnxruntime/core/providers/coreml/coreml_provider_factory.h for more details. + * + * This flag is available only in ONNXRuntime (Node.js binding). + */ + coreMlFlags?: number; + /** + * Specify whether to use CPU only in CoreML EP. + * + * This setting is available only in ONNXRuntime (react-native). + */ useCPUOnly?: boolean; + /** + * Specify whether to enable CoreML EP on subgraph. + * + * This setting is available only in ONNXRuntime (react-native). + */ enableOnSubgraph?: boolean; + /** + * Specify whether to only enable CoreML EP for Apple devices with ANE (Apple Neural Engine). + * + * This setting is available only in ONNXRuntime (react-native). + */ onlyEnableDeviceWithANE?: boolean; } export interface NnapiExecutionProviderOption extends ExecutionProviderOption { diff --git a/js/common/lib/onnx-value.ts b/js/common/lib/onnx-value.ts index a16a30d25d839..72369ce8b4209 100644 --- a/js/common/lib/onnx-value.ts +++ b/js/common/lib/onnx-value.ts @@ -3,7 +3,7 @@ import {Tensor} from './tensor.js'; -type NonTensorType = never; +export type NonTensorType = never; /** * Type OnnxValue Represents both tensors and non-tensors value for model's inputs/outputs. diff --git a/js/common/lib/tensor-factory.ts b/js/common/lib/tensor-factory.ts index 6e19d7fb898a3..431de4c3635c2 100644 --- a/js/common/lib/tensor-factory.ts +++ b/js/common/lib/tensor-factory.ts @@ -253,7 +253,7 @@ export interface TensorFactory { /** * create a tensor from an ImageBitmap object * - * @param bitMap - the ImageBitmap object to create tensor from + * @param bitmap - the ImageBitmap object to create tensor from * @param options - An optional object representing options for creating tensor from URL. * * The following default settings will be applied: diff --git a/js/common/lib/tensor.ts b/js/common/lib/tensor.ts index d5da33640dc7d..20319ebb800c2 100644 --- a/js/common/lib/tensor.ts +++ b/js/common/lib/tensor.ts @@ -160,7 +160,7 @@ export interface Tensor extends TypedTensorBase, TypedTensorUtils { if (typeof env.trace === 'undefined' ? 
!env.wasm.trace : !env.trace) { return; @@ -29,6 +32,9 @@ const TRACE_FUNC = (msg: string, extraMsg?: string) => { } }; +/** + * @ignore + */ export const TRACE_FUNC_BEGIN = (extraMsg?: string) => { if (typeof env.trace === 'undefined' ? !env.wasm.trace : !env.trace) { return; @@ -36,6 +42,9 @@ export const TRACE_FUNC_BEGIN = (extraMsg?: string) => { TRACE_FUNC('BEGIN', extraMsg); }; +/** + * @ignore + */ export const TRACE_FUNC_END = (extraMsg?: string) => { if (typeof env.trace === 'undefined' ? !env.wasm.trace : !env.trace) { return; diff --git a/js/common/lib/training-session-impl.ts b/js/common/lib/training-session-impl.ts index 23bd4421ae672..bae38b0dfda5a 100644 --- a/js/common/lib/training-session-impl.ts +++ b/js/common/lib/training-session-impl.ts @@ -1,7 +1,7 @@ // Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. -import {resolveBackend} from './backend-impl.js'; +import {resolveBackendAndExecutionProviders} from './backend-impl.js'; import {SessionHandler, TrainingSessionHandler} from './backend.js'; import {InferenceSession as InferenceSession} from './inference-session.js'; import {OnnxValue} from './onnx-value.js'; @@ -55,13 +55,12 @@ export class TrainingSession implements TrainingSessionInterface { const optimizerModel: string|Uint8Array = trainingOptions.optimizerModel || ''; const options: SessionOptions = sessionOptions || {}; - // get backend hints - const eps = options.executionProviders || []; - const backendHints = eps.map(i => typeof i === 'string' ? i : i.name); - const backend = await resolveBackend(backendHints); + // resolve backend, update session options with validated EPs, and create session handler + const [backend, optionsWithValidatedEPs] = await resolveBackendAndExecutionProviders(options); if (backend.createTrainingSessionHandler) { const handler = await backend.createTrainingSessionHandler( - trainingOptions.checkpointState, trainingOptions.trainModel, evalModel, optimizerModel, options); + trainingOptions.checkpointState, trainingOptions.trainModel, evalModel, optimizerModel, + optionsWithValidatedEPs); return new TrainingSession(handler, !!trainingOptions.optimizerModel, !!trainingOptions.evalModel); } else { throw new Error(noBackendErrMsg); diff --git a/js/common/lib/training-session.ts b/js/common/lib/training-session.ts index e54aed90e702c..f9de77e3ac7d0 100644 --- a/js/common/lib/training-session.ts +++ b/js/common/lib/training-session.ts @@ -11,7 +11,7 @@ export declare namespace TrainingSession { /** * Either URI file path (string) or Uint8Array containing model or checkpoint information. */ - type URIorBuffer = string|Uint8Array; + type UriOrBuffer = string|Uint8Array; } /** @@ -98,13 +98,13 @@ export interface TrainingSession { getParametersSize(trainableOnly: boolean): Promise; /** - * Copies parameter values from the given array to the training state. Currently, only supporting models with + * Copies parameter values from the given buffer to the training state. Currently, only supporting models with * parameters of type Float32. * - * @param buffer - Float32 buffer containing parameters converted to a Uint8Array. + * @param buffer - A Uint8Array representation of Float32 parameters. * @param trainableOnly - True if trainable parameters only to be modified, false otherwise. Default value is true. 
*/ - loadParametersBuffer(array: Uint8Array, trainableOnly: boolean): Promise; + loadParametersBuffer(buffer: Uint8Array, trainableOnly: boolean): Promise; /** * Copies the model parameters to a contiguous buffer. Usually used in the context of Federated Learning. @@ -157,19 +157,19 @@ export interface TrainingSessionCreateOptions { /** * URI or buffer for a .ckpt file that contains the checkpoint for the training model. */ - checkpointState: TrainingSession.URIorBuffer; + checkpointState: TrainingSession.UriOrBuffer; /** * URI or buffer for the .onnx training file. */ - trainModel: TrainingSession.URIorBuffer; + trainModel: TrainingSession.UriOrBuffer; /** * Optional. URI or buffer for the .onnx optimizer model file. */ - optimizerModel?: TrainingSession.URIorBuffer; + optimizerModel?: TrainingSession.UriOrBuffer; /** * Optional. URI or buffer for the .onnx eval model file. */ - evalModel?: TrainingSession.URIorBuffer; + evalModel?: TrainingSession.UriOrBuffer; } /** diff --git a/js/node/package-lock.json b/js/node/package-lock.json index 2d7c39c86097f..62b47698a1438 100644 --- a/js/node/package-lock.json +++ b/js/node/package-lock.json @@ -30,7 +30,7 @@ "version": "1.18.0", "license": "MIT", "devDependencies": { - "typedoc": "^0.23.22" + "typedoc": "^0.25.7" } }, "node_modules/@protobufjs/aspromise": { @@ -336,9 +336,9 @@ "dev": true }, "node_modules/follow-redirects": { - "version": "1.15.4", - "resolved": "https://registry.npmjs.org/follow-redirects/-/follow-redirects-1.15.4.tgz", - "integrity": "sha512-Cr4D/5wlrb0z9dgERpUL3LrmPKVDsETIJhaCMeDfuFYcqa5bldGV6wBsAN6X/vxlXQtFBMrXdXxdL8CbDTGniw==", + "version": "1.15.6", + "resolved": "https://registry.npmjs.org/follow-redirects/-/follow-redirects-1.15.6.tgz", + "integrity": "sha512-wWN62YITEaOpSK584EZXJafH1AGpO8RVgElfkuXbTOrPX4fIfOyEpW/CsiNd8JdYrAoOvafRTOEnvsO++qCqFA==", "dev": true, "funding": [ { @@ -1242,9 +1242,9 @@ "dev": true }, "follow-redirects": { - "version": "1.15.4", - "resolved": "https://registry.npmjs.org/follow-redirects/-/follow-redirects-1.15.4.tgz", - "integrity": "sha512-Cr4D/5wlrb0z9dgERpUL3LrmPKVDsETIJhaCMeDfuFYcqa5bldGV6wBsAN6X/vxlXQtFBMrXdXxdL8CbDTGniw==", + "version": "1.15.6", + "resolved": "https://registry.npmjs.org/follow-redirects/-/follow-redirects-1.15.6.tgz", + "integrity": "sha512-wWN62YITEaOpSK584EZXJafH1AGpO8RVgElfkuXbTOrPX4fIfOyEpW/CsiNd8JdYrAoOvafRTOEnvsO++qCqFA==", "dev": true }, "form-data": { @@ -1503,7 +1503,7 @@ "onnxruntime-common": { "version": "file:../common", "requires": { - "typedoc": "^0.23.22" + "typedoc": "^0.25.7" } }, "parse-json": { diff --git a/js/web/lib/wasm/binding/ort-wasm.d.ts b/js/web/lib/wasm/binding/ort-wasm.d.ts index 5dd715191c830..56925b728e9a3 100644 --- a/js/web/lib/wasm/binding/ort-wasm.d.ts +++ b/js/web/lib/wasm/binding/ort-wasm.d.ts @@ -16,20 +16,97 @@ export declare namespace JSEP { type CaptureBeginFunction = () => void; type CaptureEndFunction = () => void; type ReplayFunction = () => void; -} -export interface OrtWasmModule extends EmscriptenModule { - // #region emscripten functions - stackSave(): number; - stackRestore(stack: number): void; - stackAlloc(size: number): number; - - UTF8ToString(offset: number, maxBytesToRead?: number): string; - lengthBytesUTF8(str: string): number; - stringToUTF8(str: string, offset: number, maxBytes: number): void; - // #endregion + export interface Module extends WebGpuModule { + /** + * Mount the external data file to an internal map, which will be used during session initialization. 
+ * + * @param externalDataFilePath - specify the relative path of the external data file. + * @param externalDataFileData - specify the content data. + */ + mountExternalData(externalDataFilePath: string, externalDataFileData: Uint8Array): void; + /** + * Unmount all external data files from the internal map. + */ + unmountExternalData(): void; + + /** + * This is the entry of JSEP initialization. This function is called once when initializing ONNX Runtime per + * backend. This function initializes Asyncify support. If name is 'webgpu', also initializes WebGPU backend and + * registers a few callbacks that will be called in C++ code. + */ + jsepInit(name: 'webgpu', initParams: [ + backend: BackendType, alloc: AllocFunction, free: FreeFunction, upload: UploadFunction, + download: DownloadFunction, createKernel: CreateKernelFunction, releaseKernel: ReleaseKernelFunction, + run: RunFunction, captureBegin: CaptureBeginFunction, captureEnd: CaptureEndFunction, replay: ReplayFunction + ]): void; + jsepInit(name: 'webnn', initParams?: never): void; + } + + export interface WebGpuModule { + /** + * [exported from wasm] Specify a kernel's output when running OpKernel::Compute(). + * + * @param context - specify the kernel context pointer. + * @param index - specify the index of the output. + * @param data - specify the pointer to encoded data of type and dims. + */ + _JsepOutput(context: number, index: number, data: number): number; + /** + * [exported from wasm] Get name of an operator node. + * + * @param kernel - specify the kernel pointer. + * @returns the pointer to a C-style UTF8 encoded string representing the node name. + */ + _JsepGetNodeName(kernel: number): number; + + /** + * [exported from js_internal_api.js] Register a user GPU buffer for usage of a session's input or output. + * + * @param sessionId - specify the session ID. + * @param index - specify an integer to represent which input/output it is registering for. For input, it is the + * input_index corresponding to the session's inputNames. For output, it is the inputCount + output_index + * corresponding to the session's ouputNames. + * @param buffer - specify the GPU buffer to register. + * @param size - specify the original data size in byte. + * @returns the GPU data ID for the registered GPU buffer. + */ + jsepRegisterBuffer: (sessionId: number, index: number, buffer: GPUBuffer, size: number) => number; + /** + * [exported from js_internal_api.js] Get the GPU buffer by GPU data ID. + * + * @param dataId - specify the GPU data ID + * @returns the GPU buffer. + */ + jsepGetBuffer: (dataId: number) => GPUBuffer; + /** + * [exported from js_internal_api.js] Create a function to be used to create a GPU Tensor. + * + * @param gpuBuffer - specify the GPU buffer + * @param size - specify the original data size in byte. + * @param type - specify the tensor type. + * @returns the generated downloader function. + */ + jsepCreateDownloader: + (gpuBuffer: GPUBuffer, size: number, + type: Tensor.GpuBufferDataTypes) => () => Promise; + /** + * [exported from js_internal_api.js] Called when InferenceSession.run started. This function will be called before + * _OrtRun[WithBinding]() is called. + * @param sessionId - specify the session ID. + */ + jsepOnRunStart: (sessionId: number) => void; + /** + * [exported from js_internal_api.js] Release a session. This function will be called before _OrtReleaseSession() is + * called. + * @param sessionId - specify the session ID. 
+ * @returns + */ + jsepOnReleaseSession: (sessionId: number) => void; + } +} - // #region ORT APIs +export interface OrtInferenceAPIs { _OrtInit(numThreads: number, loggingLevel: number): number; _OrtGetLastError(errorCodeOffset: number, errorMessageOffset: number): void; @@ -74,126 +151,61 @@ export interface OrtWasmModule extends EmscriptenModule { _OrtReleaseRunOptions(runOptionsHandle: number): void; _OrtEndProfiling(sessionHandle: number): number; - // #endregion +} + +export interface OrtTrainingAPIs { + _OrtTrainingLoadCheckpoint(dataOffset: number, dataLength: number): number; - // #region ORT Training APIs - _OrtTrainingLoadCheckpoint?(dataOffset: number, dataLength: number): number; + _OrtTrainingReleaseCheckpoint(checkpointHandle: number): void; - _OrtTrainingReleaseCheckpoint?(checkpointHandle: number): void; + _OrtTrainingCreateSession( + sessionOptionsHandle: number, checkpointHandle: number, trainOffset: number, trainLength: number, + evalOffset: number, evalLength: number, optimizerOffset: number, optimizerLength: number): number; - _OrtTrainingCreateSession? - (sessionOptionsHandle: number, checkpointHandle: number, trainOffset: number, trainLength: number, - evalOffset: number, evalLength: number, optimizerOffset: number, optimizerLength: number): number; + _OrtTrainingLazyResetGrad(trainingHandle: number): number; - _OrtTrainingLazyResetGrad?(trainingHandle: number): number; + _OrtTrainingRunTrainStep( + trainingHandle: number, inputsOffset: number, inputCount: number, outputsOffset: number, outputCount: number, + runOptionsHandle: number): number; - _OrtTrainingRunTrainStep? - (trainingHandle: number, inputsOffset: number, inputCount: number, outputsOffset: number, outputCount: number, - runOptionsHandle: number): number; + _OrtTrainingOptimizerStep(trainingHandle: number, runOptionsHandle: number): number; - _OrtTrainingOptimizerStep?(trainingHandle: number, runOptionsHandle: number): number; + _OrtTrainingEvalStep( + trainingHandle: number, inputsOffset: number, inputCount: number, outputsOffset: number, outputCount: number, + runOptionsHandle: number): number; - _OrtTrainingEvalStep? - (trainingHandle: number, inputsOffset: number, inputCount: number, outputsOffset: number, outputCount: number, - runOptionsHandle: number): number; + _OrtTrainingGetParametersSize(trainingHandle: number, paramSizeT: number, trainableOnly: boolean): number; - _OrtTrainingGetParametersSize?(trainingHandle: number, paramSizeT: number, trainableOnly: boolean): number; + _OrtTrainingCopyParametersToBuffer( + trainingHandle: number, parametersBuffer: number, parameterCount: number, trainableOnly: boolean): number; - _OrtTrainingCopyParametersToBuffer? - (trainingHandle: number, parametersBuffer: number, parameterCount: number, trainableOnly: boolean): number; + _OrtTrainingCopyParametersFromBuffer( + trainingHandle: number, parametersBuffer: number, parameterCount: number, trainableOnly: boolean): number; - _OrtTrainingCopyParametersFromBuffer? - (trainingHandle: number, parametersBuffer: number, parameterCount: number, trainableOnly: boolean): number; + _OrtTrainingGetModelInputOutputCount( + trainingHandle: number, inputCount: number, outputCount: number, isEvalModel: boolean): number; + _OrtTrainingGetModelInputOutputName(trainingHandle: number, index: number, isInput: boolean, isEvalModel: boolean): + number; + + _OrtTrainingReleaseSession(trainingHandle: number): void; +} - _OrtTrainingGetModelInputOutputCount? 
- (trainingHandle: number, inputCount: number, outputCount: number, isEvalModel: boolean): number; - _OrtTrainingGetModelInputOutputName? - (trainingHandle: number, index: number, isInput: boolean, isEvalModel: boolean): number; +export interface OrtWasmModule extends EmscriptenModule, OrtInferenceAPIs, Partial, + Partial { + // #region emscripten functions + stackSave(): number; + stackRestore(stack: number): void; + stackAlloc(size: number): number; - _OrtTrainingReleaseSession?(trainingHandle: number): void; + UTF8ToString(offset: number, maxBytesToRead?: number): string; + lengthBytesUTF8(str: string): number; + stringToUTF8(str: string, offset: number, maxBytes: number): void; // #endregion // #region config numThreads?: number; mainScriptUrlOrBlob?: string|Blob; // #endregion - - // #region external data API - mountExternalData?(externalDataFilePath: string, externalDataFileData: Uint8Array): void; - unmountExternalData?(): void; - // #endregion - - // #region JSEP - /** - * This is the entry of JSEP initialization. This function is called once when initializing ONNX Runtime. - * This function initializes WebGPU backend and registers a few callbacks that will be called in C++ code. - */ - jsepInit? - (backend: JSEP.BackendType, alloc: JSEP.AllocFunction, free: JSEP.FreeFunction, upload: JSEP.UploadFunction, - download: JSEP.DownloadFunction, createKernel: JSEP.CreateKernelFunction, - releaseKernel: JSEP.ReleaseKernelFunction, run: JSEP.RunFunction, captureBegin: JSEP.CaptureBeginFunction, - captureEnd: JSEP.CaptureEndFunction, replay: JSEP.ReplayFunction): void; - - /** - * [exported from wasm] Specify a kernel's output when running OpKernel::Compute(). - * - * @param context - specify the kernel context pointer. - * @param index - specify the index of the output. - * @param data - specify the pointer to encoded data of type and dims. - */ - _JsepOutput(context: number, index: number, data: number): number; - /** - * [exported from wasm] Get name of an operator node. - * - * @param kernel - specify the kernel pointer. - * @returns the pointer to a C-style UTF8 encoded string representing the node name. - */ - _JsepGetNodeName(kernel: number): number; - - /** - * [exported from js_internal_api.js] Register a user GPU buffer for usage of a session's input or output. - * - * @param sessionId - specify the session ID. - * @param index - specify an integer to represent which input/output it is registering for. For input, it is the - * input_index corresponding to the session's inputNames. For output, it is the inputCount + output_index - * corresponding to the session's ouputNames. - * @param buffer - specify the GPU buffer to register. - * @param size - specify the original data size in byte. - * @returns the GPU data ID for the registered GPU buffer. - */ - jsepRegisterBuffer: (sessionId: number, index: number, buffer: GPUBuffer, size: number) => number; - /** - * [exported from js_internal_api.js] Get the GPU buffer by GPU data ID. - * - * @param dataId - specify the GPU data ID - * @returns the GPU buffer. - */ - jsepGetBuffer: (dataId: number) => GPUBuffer; - /** - * [exported from js_internal_api.js] Create a function to be used to create a GPU Tensor. - * - * @param gpuBuffer - specify the GPU buffer - * @param size - specify the original data size in byte. - * @param type - specify the tensor type. - * @returns the generated downloader function. 
- */ - jsepCreateDownloader: - (gpuBuffer: GPUBuffer, size: number, - type: Tensor.GpuBufferDataTypes) => () => Promise; - /** - * [exported from js_internal_api.js] Called when InferenceSession.run started. This function will be called before - * _OrtRun[WithBinding]() is called. - * @param sessionId - specify the session ID. - */ - jsepOnRunStart: (sessionId: number) => void; - /** - * [exported from js_internal_api.js] Release a session. This function will be called before _OrtReleaseSession() is - * called. - * @param sessionId - specify the session ID. - * @returns - */ - jsepOnReleaseSession: (sessionId: number) => void; - // #endregion } declare const moduleFactory: EmscriptenModuleFactory; diff --git a/js/web/lib/wasm/jsep/backend-webgpu.ts b/js/web/lib/wasm/jsep/backend-webgpu.ts index d92b8ac68dbe7..b36dc73330d46 100644 --- a/js/web/lib/wasm/jsep/backend-webgpu.ts +++ b/js/web/lib/wasm/jsep/backend-webgpu.ts @@ -252,8 +252,10 @@ export class WebGpuBackend { } }; - Object.defineProperty(this.env.webgpu, 'device', {value: this.device}); - Object.defineProperty(this.env.webgpu, 'adapter', {value: adapter}); + Object.defineProperty( + this.env.webgpu, 'device', {value: this.device, writable: false, enumerable: true, configurable: false}); + Object.defineProperty( + this.env.webgpu, 'adapter', {value: adapter, writable: false, enumerable: true, configurable: false}); // init queryType, which is necessary for InferenceSession.create this.setQueryType(); diff --git a/js/web/lib/wasm/jsep/init.ts b/js/web/lib/wasm/jsep/init.ts index 4936b94ef7a86..adcaa145cdca8 100644 --- a/js/web/lib/wasm/jsep/init.ts +++ b/js/web/lib/wasm/jsep/init.ts @@ -121,7 +121,7 @@ class ComputeContextImpl implements ComputeContext { for (let i = 0; i < dims.length; i++) { this.module.HEAPU32[offset++] = dims[i]; } - return this.module._JsepOutput(this.opKernelContext, index, data); + return this.module._JsepOutput!(this.opKernelContext, index, data); } catch (e) { throw new Error( `Failed to generate kernel's output[${index}] with dims [${dims}]. ` + @@ -136,27 +136,39 @@ class ComputeContextImpl implements ComputeContext { /** * Initialize JSEP with WebGPU backend. * - * This function will be called only once after the WebAssembly module is loaded and initialized ("_OrtInit" is called). - * This function expects: + * This function will be called after the WebAssembly module is loaded and initialized ("_OrtInit" is called), once for + * each of the following EPs if they are specified: + * - "webgpu" + * - "webnn" + * + * For WebGPU, this function expects: * - WebGPU is enabled in build (BUILD_DEFS.DISABLE_WEBGPU === false). * - WebGPU is available in current environment. (a valid GPUAdapter is passed in) + * + * For WebNN, this function expects: + * - WebNN is enabled in build (BUILD_DEFS.DISABLE_WEBGPU === false). + * - WebNN is available in current environment. (navigator.ml is not undefined) + * * If the WebAssembly module is not built with JSEP support, this function will throw an error. This will invalidate - * 'webgpu' backend. + * 'webgpu'/'webnn' backend. 
* + * @param name - the name of the EP, either "webgpu" or "webnn" * @param module - the ORT WebAssembly module * @param env - the ORT environment variable (ort.env) * @param gpuAdapter - the pre-created GPU adapter */ -export const init = async(module: OrtWasmModule, env: Env, gpuAdapter: GPUAdapter): Promise => { +export const init = + async(name: 'webgpu'|'webnn', module: OrtWasmModule, env: Env, gpuAdapter?: GPUAdapter): Promise => { const jsepInit = module.jsepInit; if (!jsepInit) { throw new Error('Failed to initialize JSEP. The WebAssembly module is not built with JSEP support.'); } - const backend = new WebGpuBackend(); - await backend.initialize(env, gpuAdapter); + if (name === 'webgpu') { + const backend = new WebGpuBackend(); + await backend.initialize(env, gpuAdapter!); - jsepInit( + jsepInit('webgpu', [ // backend backend, @@ -190,8 +202,8 @@ export const init = async(module: OrtWasmModule, env: Env, gpuAdapter: GPUAdapte }, // jsepCreateKernel - (kernelType: string, kernelId: number, attribute: unknown) => - backend.createKernel(kernelType, kernelId, attribute, module.UTF8ToString(module._JsepGetNodeName(kernelId))), + (kernelType: string, kernelId: number, attribute: unknown) => backend.createKernel( + kernelType, kernelId, attribute, module.UTF8ToString(module._JsepGetNodeName!(kernelId))), // jsepReleaseKernel (kernel: number) => backend.releaseKernel(kernel), @@ -210,5 +222,9 @@ export const init = async(module: OrtWasmModule, env: Env, gpuAdapter: GPUAdapte // jsepCaptureEnd () => backend.captureEnd(), // jsepReplay - () => backend.replay()); + () => backend.replay() + ]); + } else { + jsepInit('webnn'); + } }; diff --git a/js/web/lib/wasm/jsep/webgpu/ops/instance-norm.ts b/js/web/lib/wasm/jsep/webgpu/ops/instance-norm.ts index 2f652dbd310ab..2c72def089144 100644 --- a/js/web/lib/wasm/jsep/webgpu/ops/instance-norm.ts +++ b/js/web/lib/wasm/jsep/webgpu/ops/instance-norm.ts @@ -207,7 +207,7 @@ const computeMean = let offset = currentImageNumber * uniforms.image_size; var sum = ${fillVector('f32', components)}; var squaredSum = ${fillVector('f32', components)}; - for (var i: u32 = 0; i < ${WG}; i++) { + for (var i: u32 = 0; i < min(${WG}, uniforms.H); i++) { let value = input[offset + i + currentChannelNumber * ${WG}]; sum += value[0]; squaredSum += value[1]; diff --git a/js/web/lib/wasm/jsep/webgpu/ops/pool.ts b/js/web/lib/wasm/jsep/webgpu/ops/pool.ts index 4e933573b9137..5521650e8ded4 100644 --- a/js/web/lib/wasm/jsep/webgpu/ops/pool.ts +++ b/js/web/lib/wasm/jsep/webgpu/ops/pool.ts @@ -381,8 +381,9 @@ const createMaxPoolProgramInfo = programUniforms }), getShaderSource: shaderHelper => generatePoolingCode( - shaderHelper, x, input.dims.length, outputShape.length, adjustedAttributes, op1, op2, -1e5, uniforms, - hasPads, pwStartEndNotZero, phStartEndNotZero), + shaderHelper, x, input.dims.length, outputShape.length, adjustedAttributes, op1, op2, + (input.dataType === DataType.float16) ? 
-65504 : -1e5, uniforms, hasPads, pwStartEndNotZero, + phStartEndNotZero), }; }; diff --git a/js/web/lib/wasm/jsep/webgpu/ops/reduce-shared.ts b/js/web/lib/wasm/jsep/webgpu/ops/reduce-shared.ts index a9b28d7c034f3..210b3ee7e2fca 100644 --- a/js/web/lib/wasm/jsep/webgpu/ops/reduce-shared.ts +++ b/js/web/lib/wasm/jsep/webgpu/ops/reduce-shared.ts @@ -131,7 +131,7 @@ export const createReduceSharedProgramInfo = const workgroupSize = 32; const sharedMemorySnippet = ` - var aBestValues : array<${output.type.storage}, ${workgroupSize}>; + var aBestValues : array; `; const getShaderSource = (shaderHelper: ShaderHelper) => ` @@ -145,10 +145,10 @@ export const createReduceSharedProgramInfo = let outputIndex = global_idx / ${workgroupSize}; let offset = outputIndex * uniforms.reduceSize; - var bestValue = ${output.type.storage}(${reduceInitValues[reduceType]}); + var bestValue = f32(${reduceInitValues[reduceType]}); let Length = uniforms.reduceSize; for (var k = local_idx; k < Length; k = k + ${workgroupSize}) { - let candidate = ${output.type.storage}(${input.getByOffset('offset + k')}); + let candidate = f32(${input.getByOffset('offset + k')}); bestValue = ${reduceOps[reduceType]}; } aBestValues[local_idx] = bestValue; @@ -172,8 +172,8 @@ export const createReduceSharedProgramInfo = output.setByOffset( 'outputIndex', `${ - reduceType === 'mean' ? `bestValue / ${output.type.storage}(uniforms.reduceSize)` : - `${reduceOutputValues[reduceType]}`}`)}; + reduceType === 'mean' ? `${output.type.storage}(bestValue / f32(uniforms.reduceSize))` : + `${output.type.storage}(${reduceOutputValues[reduceType]})`}`)}; } }`; diff --git a/js/web/lib/wasm/proxy-wrapper.ts b/js/web/lib/wasm/proxy-wrapper.ts index 86017a4ec6904..6ff4e86b1235e 100644 --- a/js/web/lib/wasm/proxy-wrapper.ts +++ b/js/web/lib/wasm/proxy-wrapper.ts @@ -155,7 +155,7 @@ export const createSession = ensureWorker(); return new Promise((resolve, reject) => { enqueueCallbacks('create', [resolve, reject]); - const message: OrtWasmMessage = {type: 'create', in : {model, options}}; + const message: OrtWasmMessage = {type: 'create', in : {model, options: {...options}}}; const transferable: Transferable[] = []; if (model instanceof Uint8Array) { transferable.push(model.buffer); diff --git a/js/web/lib/wasm/wasm-core-impl.ts b/js/web/lib/wasm/wasm-core-impl.ts index afab9ba00b0c4..9b27051f1b9fe 100644 --- a/js/web/lib/wasm/wasm-core-impl.ts +++ b/js/web/lib/wasm/wasm-core-impl.ts @@ -84,35 +84,57 @@ export const initRuntime = async(env: Env): Promise => { * @param epName */ export const initEp = async(env: Env, epName: string): Promise => { - if (!BUILD_DEFS.DISABLE_WEBGPU && (epName === 'webgpu' || epName === 'webnn')) { - // perform WebGPU availability check - if (typeof navigator === 'undefined' || !navigator.gpu) { - throw new Error('WebGPU is not supported in current environment'); - } - const powerPreference = env.webgpu?.powerPreference; - if (powerPreference !== undefined && powerPreference !== 'low-power' && powerPreference !== 'high-performance') { - throw new Error(`Invalid powerPreference setting: "${powerPreference}"`); - } - const forceFallbackAdapter = env.webgpu?.forceFallbackAdapter; - if (forceFallbackAdapter !== undefined && typeof forceFallbackAdapter !== 'boolean') { - throw new Error(`Invalid forceFallbackAdapter setting: "${forceFallbackAdapter}"`); - } - const adapter = await navigator.gpu.requestAdapter({powerPreference, forceFallbackAdapter}); - if (!adapter) { - throw new Error( - 'Failed to get GPU adapter. 
You may need to enable flag "--enable-unsafe-webgpu" if you are using Chrome.'); - } + if (!BUILD_DEFS.DISABLE_WEBGPU) { + // eslint-disable-next-line @typescript-eslint/no-require-imports, @typescript-eslint/no-var-requires + const initJsep = require('./jsep/init').init; - if (!env.wasm.simd) { - throw new Error( - 'Not supported for WebGPU=ON and SIMD=OFF. Please set `env.wasm.simd` to true when using `webgpu` EP'); - } + if (epName === 'webgpu') { + // perform WebGPU availability check + if (typeof navigator === 'undefined' || !navigator.gpu) { + throw new Error('WebGPU is not supported in current environment'); + } + + let adapter = env.webgpu.adapter as GPUAdapter | null; + if (!adapter) { + // if adapter is not set, request a new adapter. + const powerPreference = env.webgpu.powerPreference; + if (powerPreference !== undefined && powerPreference !== 'low-power' && + powerPreference !== 'high-performance') { + throw new Error(`Invalid powerPreference setting: "${powerPreference}"`); + } + const forceFallbackAdapter = env.webgpu.forceFallbackAdapter; + if (forceFallbackAdapter !== undefined && typeof forceFallbackAdapter !== 'boolean') { + throw new Error(`Invalid forceFallbackAdapter setting: "${forceFallbackAdapter}"`); + } + adapter = await navigator.gpu.requestAdapter({powerPreference, forceFallbackAdapter}); + if (!adapter) { + throw new Error( + 'Failed to get GPU adapter. ' + + 'You may need to enable flag "--enable-unsafe-webgpu" if you are using Chrome.'); + } + } else { + // if adapter is set, validate it. + if (typeof adapter.limits !== 'object' || typeof adapter.features !== 'object' || + typeof adapter.requestDevice !== 'function') { + throw new Error('Invalid GPU adapter set in `env.webgpu.adapter`. It must be a GPUAdapter object.'); + } + } - // init JSEP if available + if (!env.wasm.simd) { + throw new Error( + 'Not supported for WebGPU=ON and SIMD=OFF. Please set `env.wasm.simd` to true when using `webgpu` EP'); + } - // eslint-disable-next-line @typescript-eslint/no-require-imports, @typescript-eslint/no-var-requires - const initJsep = require('./jsep/init').init; - await initJsep(getInstance(), env, adapter); + await initJsep('webgpu', getInstance(), env, adapter); + } + if (epName === 'webnn') { + // perform WebNN availability check + if (typeof navigator === 'undefined' || !(navigator as unknown as {ml: unknown}).ml) { + throw new Error('WebNN is not supported in current environment'); + } + + await initJsep('webnn', getInstance(), env); + } } }; @@ -380,7 +402,12 @@ export const prepareInputOutputTensor = const gpuBuffer = tensor[2].gpuBuffer as GPUBuffer; const elementSizeInBytes = getTensorElementSize(tensorDataTypeStringToEnum(dataType))!; dataByteLength = dims.reduce((a, b) => a * b, 1) * elementSizeInBytes; - rawData = wasm.jsepRegisterBuffer(sessionId, index, gpuBuffer, dataByteLength); + + const registerBuffer = wasm.jsepRegisterBuffer; + if (!registerBuffer) { + throw new Error('Tensor location "gpu-buffer" is not supported without using WebGPU.'); + } + rawData = registerBuffer(sessionId, index, gpuBuffer, dataByteLength); } else { const data = tensor[2]; @@ -595,7 +622,11 @@ export const run = async( // If a certain output's preferred location is GPU but the tensor is empty, we still need to create a CPU // tensor for it. There is no mapping GPU buffer for an empty tensor. 
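The reworked `initEp` above also accepts a caller-provided adapter: when `env.webgpu.adapter` is set it is validated (it must expose `limits`, `features` and `requestDevice`) and reused instead of calling `navigator.gpu.requestAdapter()`. A sketch of what this enables, assuming `env.webgpu.adapter` is assignable before the first 'webgpu' session is created (which is what the validation path expects) and using a placeholder model path:

```ts
import * as ort from 'onnxruntime-web';

async function createWebGpuSession(): Promise<ort.InferenceSession> {
  // Pre-create the adapter so the application controls power preference and
  // fallback behaviour; EP initialization validates and reuses this adapter
  // instead of requesting its own.
  const adapter = await navigator.gpu.requestAdapter({powerPreference: 'high-performance'});
  if (adapter) {
    ort.env.webgpu.adapter = adapter;
  }
  return ort.InferenceSession.create('./model.onnx', {executionProviders: ['webgpu']});
}
```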
if (preferredLocation === 'gpu-buffer' && size > 0) { - const gpuBuffer = wasm.jsepGetBuffer(dataOffset); + const getBuffer = wasm.jsepGetBuffer; + if (!getBuffer) { + throw new Error('preferredLocation "gpu-buffer" is not supported without using WebGPU.'); + } + const gpuBuffer = getBuffer(dataOffset); const elementSize = getTensorElementSize(dataType); if (elementSize === undefined || !isGpuBufferSupportedType(type)) { throw new Error(`Unsupported data type: ${type}`); @@ -607,7 +638,7 @@ export const run = async( output.push([ type, dims, { gpuBuffer, - download: wasm.jsepCreateDownloader(gpuBuffer, size * elementSize, type), + download: wasm.jsepCreateDownloader!(gpuBuffer, size * elementSize, type), dispose: () => { wasm._OrtReleaseTensor(tensor); } diff --git a/js/web/package-lock.json b/js/web/package-lock.json index 41c44aaa2679b..5c9113459ff06 100644 --- a/js/web/package-lock.json +++ b/js/web/package-lock.json @@ -52,7 +52,7 @@ "version": "1.18.0", "license": "MIT", "devDependencies": { - "typedoc": "^0.23.22" + "typedoc": "^0.25.7" } }, "node_modules/@chiragrupani/karma-chromium-edge-launcher": { @@ -1351,9 +1351,9 @@ "dev": true }, "node_modules/follow-redirects": { - "version": "1.15.4", - "resolved": "https://registry.npmjs.org/follow-redirects/-/follow-redirects-1.15.4.tgz", - "integrity": "sha512-Cr4D/5wlrb0z9dgERpUL3LrmPKVDsETIJhaCMeDfuFYcqa5bldGV6wBsAN6X/vxlXQtFBMrXdXxdL8CbDTGniw==", + "version": "1.15.6", + "resolved": "https://registry.npmjs.org/follow-redirects/-/follow-redirects-1.15.6.tgz", + "integrity": "sha512-wWN62YITEaOpSK584EZXJafH1AGpO8RVgElfkuXbTOrPX4fIfOyEpW/CsiNd8JdYrAoOvafRTOEnvsO++qCqFA==", "dev": true, "funding": [ { @@ -4595,9 +4595,9 @@ "dev": true }, "follow-redirects": { - "version": "1.15.4", - "resolved": "https://registry.npmjs.org/follow-redirects/-/follow-redirects-1.15.4.tgz", - "integrity": "sha512-Cr4D/5wlrb0z9dgERpUL3LrmPKVDsETIJhaCMeDfuFYcqa5bldGV6wBsAN6X/vxlXQtFBMrXdXxdL8CbDTGniw==", + "version": "1.15.6", + "resolved": "https://registry.npmjs.org/follow-redirects/-/follow-redirects-1.15.6.tgz", + "integrity": "sha512-wWN62YITEaOpSK584EZXJafH1AGpO8RVgElfkuXbTOrPX4fIfOyEpW/CsiNd8JdYrAoOvafRTOEnvsO++qCqFA==", "dev": true }, "from": { @@ -5503,7 +5503,7 @@ "onnxruntime-common": { "version": "file:../common", "requires": { - "typedoc": "^0.23.22" + "typedoc": "^0.25.7" } }, "p-cancelable": { diff --git a/js/web/test/data/ops/instance-norm.jsonc b/js/web/test/data/ops/instance-norm.jsonc index e89ac2da3795f..f28b016d47ab9 100644 --- a/js/web/test/data/ops/instance-norm.jsonc +++ b/js/web/test/data/ops/instance-norm.jsonc @@ -224,5 +224,85 @@ ] } ] + }, + { + "name": "Simple test with NHWC, components 1, buffer reuse", + "operator": "InstanceNormalization", + "inputShapeDefinitions": "rankOnly", + "opset": { + "domain": "", + "version": 17 + }, + "cases": [ + { + "name": "Simple test", + "inputs": [ + { + "data": [1, 2, 3, 4, 5, 6], + "dims": [2, 3, 1, 1], + "type": "float32" + }, + { + "data": [1, 2, 3], + "dims": [3], + "type": "float32" + }, + { + "data": [4, 5, 6], + "dims": [3], + "type": "float32" + } + ], + "outputs": [ + { + "data": [4, 5, 6, 4, 5, 6], + "dims": [2, 3, 1, 1], + "type": "float32" + } + ] + } + ] + }, + { + "name": "Simple test with NHWC, components 2, buffer reuse", + "operator": "InstanceNormalization", + "inputShapeDefinitions": "rankOnly", + "opset": { + "domain": "", + "version": 17 + }, + "cases": [ + { + "name": "Simple test", + "inputs": [ + { + "data": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 9, 8, 7, 6, 5, 4, 3, 2], + 
"dims": [1, 6, 1, 3], + "type": "float32" + }, + { + "data": [1, 2, 3, 4, 5, 6], + "dims": [6], + "type": "float32" + }, + { + "data": [4, 5, 6, 7, 8, 9], + "dims": [6], + "type": "float32" + } + ], + "outputs": [ + { + "data": [ + 2.775264263153076, 4, 5.224735260009766, 2.5505285263061523, 5, 7.449470520019531, 2.325794219970703, 6, + 9.674205780029297, 11.898944854736328, 7, 2.1010589599609375, 14.123676300048828, 8, 1.876321792602539, + 16.348413467407227, 9, 1.6515865325927734 + ], + "dims": [1, 6, 1, 3], + "type": "float32" + } + ] + } + ] } ] diff --git a/objectivec/ort_value.mm b/objectivec/ort_value.mm index b9dc1a9885c61..c61a7ea809237 100644 --- a/objectivec/ort_value.mm +++ b/objectivec/ort_value.mm @@ -148,6 +148,9 @@ - (nullable ORTValueTypeInfo*)typeInfoWithError:(NSError**)error { - (nullable ORTTensorTypeAndShapeInfo*)tensorTypeAndShapeInfoWithError:(NSError**)error { try { const auto tensorTypeAndShapeInfo = _typeInfo->GetTensorTypeAndShapeInfo(); + if (!tensorTypeAndShapeInfo) { + ORT_CXX_API_THROW("ORTValue is not a tensor.", ORT_RUNTIME_EXCEPTION); + } return CXXAPIToPublicTensorTypeAndShapeInfo(tensorTypeAndShapeInfo); } ORT_OBJC_API_IMPL_CATCH_RETURNING_NULLABLE(error) @@ -156,6 +159,9 @@ - (nullable ORTTensorTypeAndShapeInfo*)tensorTypeAndShapeInfoWithError:(NSError* - (nullable NSMutableData*)tensorDataWithError:(NSError**)error { try { const auto tensorTypeAndShapeInfo = _typeInfo->GetTensorTypeAndShapeInfo(); + if (!tensorTypeAndShapeInfo) { + ORT_CXX_API_THROW("ORTValue is not a tensor.", ORT_RUNTIME_EXCEPTION); + } if (tensorTypeAndShapeInfo.GetElementType() == ONNX_TENSOR_ELEMENT_DATA_TYPE_STRING) { ORT_CXX_API_THROW( "This ORTValue holds string data. Please call tensorStringDataWithError: " @@ -182,6 +188,9 @@ - (nullable NSMutableData*)tensorDataWithError:(NSError**)error { - (nullable NSArray*)tensorStringDataWithError:(NSError**)error { try { const auto tensorTypeAndShapeInfo = _typeInfo->GetTensorTypeAndShapeInfo(); + if (!tensorTypeAndShapeInfo) { + ORT_CXX_API_THROW("ORTValue is not a tensor.", ORT_RUNTIME_EXCEPTION); + } const size_t elementCount = tensorTypeAndShapeInfo.GetElementCount(); const size_t tensorStringDataLength = _value->GetStringTensorDataLength(); std::vector tensorStringData(tensorStringDataLength, '\0'); diff --git a/onnxruntime/contrib_ops/cpu/quantization/neural_speed_wrapper.h b/onnxruntime/contrib_ops/cpu/quantization/neural_speed_wrapper.h index d3902f9bd68c7..e7df50408ef09 100644 --- a/onnxruntime/contrib_ops/cpu/quantization/neural_speed_wrapper.h +++ b/onnxruntime/contrib_ops/cpu/quantization/neural_speed_wrapper.h @@ -27,6 +27,7 @@ #pragma warning(disable : 4244) #pragma warning(disable : 4267) #pragma warning(disable : 4702) +#pragma warning(disable : 4127) #endif #include "bestla/bestla_prologue_a.h" diff --git a/onnxruntime/contrib_ops/cuda/collective/sharded_moe.cc b/onnxruntime/contrib_ops/cuda/collective/sharded_moe.cc index 40a667ffd5d83..2efc37cf98010 100644 --- a/onnxruntime/contrib_ops/cuda/collective/sharded_moe.cc +++ b/onnxruntime/contrib_ops/cuda/collective/sharded_moe.cc @@ -1,6 +1,8 @@ // Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. 
+#include + #include "core/common/safeint.h" #include "core/providers/cuda/cuda_common.h" #include "contrib_ops/cuda/bert/transformer_cuda_common.h" @@ -35,6 +37,7 @@ using namespace ONNX_NAMESPACE; template ShardedMoE::ShardedMoE(const OpKernelInfo& op_kernel_info) : NcclKernel(op_kernel_info), MoEBase(op_kernel_info) { + ORT_ENFORCE(op_kernel_info.GetAttr("tensor_shards", &tensor_shards_).IsOK()); ORT_ENFORCE(op_kernel_info.GetAttr("local_experts_start_index", &local_experts_start_index_).IsOK()); rank_to_experts_start_index_.resize(nccl_->Size()); // Initialize rank_to_experts_start_index_[0] to a value to convey that it is not initialized. @@ -55,27 +58,36 @@ Status ShardedMoE::ComputeInternal(OpKernelContext* context) const { // Create a {Rank, ExpertsStartIndex} map on Host. AutoDestoryCudaEvent cuda_event; cudaEvent_t& copy_event = cuda_event.Get(); - ORT_RETURN_IF_ERROR(SynchronizeExpertsStartIndex(allocator, context, copy_event)); const Tensor* input = context->Input(0); const Tensor* router_probs = context->Input(1); const Tensor* fc1_experts_weights = context->Input(2); - const Tensor* fc2_experts_weights = context->Input(3); - const Tensor* fc1_experts_bias_optional = context->Input(4); + const Tensor* fc1_experts_bias_optional = context->Input(3); + const Tensor* fc2_experts_weights = context->Input(4); const Tensor* fc2_experts_bias_optional = context->Input(5); + const Tensor* fc3_experts_weights_optional = context->Input(6); + const Tensor* fc3_experts_bias_optional = context->Input(7); + + MoEParameters moe_params(tensor_shards_); + ORT_RETURN_IF_ERROR(CheckInputs(moe_params, input, router_probs, fc1_experts_weights, fc1_experts_bias_optional, + fc2_experts_weights, fc2_experts_bias_optional, fc3_experts_weights_optional, + fc3_experts_bias_optional)); - MoEParameters moe_params; - ORT_RETURN_IF_ERROR(CheckInputs(moe_params, input, router_probs, fc1_experts_weights, fc2_experts_weights, - fc1_experts_bias_optional, fc2_experts_bias_optional)); ORT_RETURN_IF_NOT(moe_params.num_experts % nccl_->Size() == 0, "num_experts should be divisible by world_size"); - ort_fastertransformer::CutlassMoeFCRunner moe_runner(sm); + if (moe_params.parallel_type == MoEParallelType::EP || moe_params.parallel_type == MoEParallelType::EPAndTP) { + ORT_RETURN_IF_ERROR(SynchronizeExpertsStartIndex(allocator, context, copy_event)); + } + + ort_fastertransformer::CutlassMoeFCRunner moe_runner(sm, + fc3_experts_weights_optional != nullptr, + normalize_routing_weights_); size_t ws_size = - moe_runner.getWorkspaceSize(static_cast(moe_params.num_rows), static_cast(moe_params.hidden_size), - static_cast(moe_params.inter_size), static_cast(moe_params.num_experts), - static_cast(k_)); + moe_runner.getWorkspaceSize(static_cast(moe_params.num_rows), static_cast(moe_params.hidden_size), + static_cast(moe_params.inter_size), + static_cast(moe_params.num_experts), static_cast(k_)); size_t fc2_output_size = k_ * moe_params.num_rows * moe_params.hidden_size * sizeof(CudaT); size_t expert_scales_size = k_ * moe_params.num_rows * sizeof(CudaT); @@ -93,19 +105,25 @@ Status ShardedMoE::ComputeInternal(OpKernelContext* context) const { IAllocatorUniquePtr expert_for_source_row = IAllocator::MakeUniquePtr(allocator, expert_for_source_row_size, false, stream); - // fc1_scales and fc2_scales are used in quantized MoE - const CudaT* fc1_scales_ptr = nullptr; - const CudaT* fc2_scales_ptr = nullptr; + const CudaT* fc_scales_ptr = nullptr; moe_runner.run_moe_fc(reinterpret_cast(input->template Data()), 
reinterpret_cast(router_probs->template Data()), reinterpret_cast(fc1_experts_weights->template Data()), - std::move(fc1_scales_ptr), + std::move(fc_scales_ptr), fc1_experts_bias_optional == nullptr ? nullptr : reinterpret_cast(fc1_experts_bias_optional->template Data()), - activation_type_, reinterpret_cast(fc2_experts_weights->template Data()), - std::move(fc2_scales_ptr), static_cast(moe_params.num_rows), + activation_type_, + fc3_experts_weights_optional == nullptr + ? nullptr + : reinterpret_cast(fc3_experts_weights_optional->template Data()), + std::move(fc_scales_ptr), + fc3_experts_bias_optional == nullptr + ? nullptr + : reinterpret_cast(fc3_experts_bias_optional->template Data()), + reinterpret_cast(fc2_experts_weights->template Data()), + std::move(fc_scales_ptr), static_cast(moe_params.num_rows), static_cast(moe_params.hidden_size), static_cast(moe_params.inter_size), static_cast(moe_params.num_experts), static_cast(moe_params.local_num_experts), static_cast(local_experts_start_index_), @@ -116,31 +134,54 @@ Status ShardedMoE::ComputeInternal(OpKernelContext* context) const { Tensor* output = context->Output(0, input->Shape()); - size_t stride_count = moe_params.hidden_size; - size_t stride_bytes = stride_count * sizeof(CudaT); - int64_t total_past_rows = 0; - int64_t total_covered_rows = 0; - if (copy_event != nullptr) { - CUDA_RETURN_IF_ERROR(cudaEventSynchronize(copy_event)); + if (moe_params.parallel_type == MoEParallelType::None) { + fc2_output_bc = std::move(fc2_output); } - NCCL_RETURN_IF_ERROR(ncclGroupStart()); - for (int rank = 0; rank < nccl_->Size(); ++rank) { - int64_t experts_start_index = rank_to_experts_start_index_[rank]; - moe_runner.get_total_rows_info(experts_start_index, - moe_params.local_num_experts, - total_past_rows, - total_covered_rows); - const char* src = reinterpret_cast(fc2_output.get()) + total_past_rows * stride_bytes; - char* dst = reinterpret_cast(fc2_output_bc.get()) + total_past_rows * stride_bytes; - NCCL_RETURN_IF_ERROR(ncclBroadcast(src, - dst, - total_covered_rows * stride_count, + + if (moe_params.parallel_type == MoEParallelType::EPAndTP) { + return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "Expert and Tensor Parallelism is not supported yet"); + } + + if (moe_params.parallel_type == MoEParallelType::TP) { + ORT_ENFORCE(moe_params.tensor_shards == nccl_->Size()); + NCCL_RETURN_IF_ERROR(ncclGroupStart()); + NCCL_RETURN_IF_ERROR(ncclAllReduce(reinterpret_cast(fc2_output.get()), + reinterpret_cast(fc2_output_bc.get()), + fc2_output_size / sizeof(CudaT), GetNcclDataType(input->DataType()), - rank, + ncclSum, nccl_->Comm(), Stream(context))); + NCCL_RETURN_IF_ERROR(ncclGroupEnd()); + } + + if (moe_params.parallel_type == MoEParallelType::EP) { + size_t stride_count = moe_params.hidden_size; + size_t stride_bytes = stride_count * sizeof(CudaT); + int64_t total_past_rows = 0; + int64_t total_covered_rows = 0; + if (copy_event != nullptr) { + CUDA_RETURN_IF_ERROR(cudaEventSynchronize(copy_event)); + } + NCCL_RETURN_IF_ERROR(ncclGroupStart()); + for (int rank = 0; rank < nccl_->Size(); ++rank) { + int64_t experts_start_index = rank_to_experts_start_index_[rank]; + moe_runner.get_total_rows_info(experts_start_index, + moe_params.local_num_experts, + total_past_rows, + total_covered_rows); + const char* src = reinterpret_cast(fc2_output.get()) + total_past_rows * stride_bytes; + char* dst = reinterpret_cast(fc2_output_bc.get()) + total_past_rows * stride_bytes; + NCCL_RETURN_IF_ERROR(ncclBroadcast(src, + dst, + total_covered_rows * 
stride_count, + GetNcclDataType(input->DataType()), + rank, + nccl_->Comm(), + Stream(context))); + } + NCCL_RETURN_IF_ERROR(ncclGroupEnd()); } - NCCL_RETURN_IF_ERROR(ncclGroupEnd()); ort_fastertransformer::finalize_moe_routing_kernelLauncher( reinterpret_cast(fc2_output_bc.get()), reinterpret_cast(output->template MutableData()), diff --git a/onnxruntime/contrib_ops/cuda/collective/sharded_moe.h b/onnxruntime/contrib_ops/cuda/collective/sharded_moe.h index 5ea4ae59c4020..827283a794dd6 100644 --- a/onnxruntime/contrib_ops/cuda/collective/sharded_moe.h +++ b/onnxruntime/contrib_ops/cuda/collective/sharded_moe.h @@ -26,6 +26,7 @@ class ShardedMoE final : public NcclKernel, public MoEBase { Status SynchronizeExpertsStartIndex(AllocatorPtr& alloc, OpKernelContext* ctx, cudaEvent_t& cuda_event) const; int64_t local_experts_start_index_; + int64_t tensor_shards_; std::vector rank_to_experts_start_index_; }; diff --git a/onnxruntime/contrib_ops/cuda/moe/ft_moe/epilogue_helpers.h b/onnxruntime/contrib_ops/cuda/moe/ft_moe/epilogue_helpers.h index 78d206bf1d9bc..b18a70e899d1c 100644 --- a/onnxruntime/contrib_ops/cuda/moe/ft_moe/epilogue_helpers.h +++ b/onnxruntime/contrib_ops/cuda/moe/ft_moe/epilogue_helpers.h @@ -83,10 +83,16 @@ namespace ort_fastertransformer { struct EpilogueOpBiasSilu {}; +struct EpilogueOpNoBiasSilu {}; + struct EpilogueOpBiasReLU {}; +struct EpilogueOpNoBiasReLU {}; + struct EpilogueOpBiasFtGelu {}; +struct EpilogueOpNoBiasFtGelu {}; + struct EpilogueOpBias {}; struct EpilogueOpNoBias {}; @@ -101,6 +107,13 @@ struct Epilogue; }; +template +struct Epilogue { + using Op = cutlass::epilogue::thread::LinearCombinationSilu; +}; + template struct Epilogue { using Op = cutlass::epilogue::thread::LinearCombinationRelu; }; +template +struct Epilogue { + using Op = cutlass::epilogue::thread::LinearCombinationRelu; +}; + template struct Epilogue { using Op = cutlass::epilogue::thread::LinearCombinationGeneric< @@ -116,6 +136,14 @@ struct Epilogue; }; +template +struct Epilogue { + using Op = cutlass::epilogue::thread::LinearCombinationGeneric< + cutlass::epilogue::thread::GELU_taylor, ElementType, ElementsPerVectorAccess, ElementAccumulator, + ElementAccumulator, cutlass::epilogue::thread::ScaleType::OnlyAlphaScaling, + cutlass::FloatRoundStyle::round_to_nearest, true>; +}; + template struct Epilogue { using Op = cutlass::epilogue::thread::LinearCombination struct Epilogue { using Op = - cutlass::epilogue::thread::LinearCombination; + cutlass::epilogue::thread::LinearCombination< + ElementType, ElementsPerVectorAccess, ElementAccumulator, + ElementAccumulator, cutlass::epilogue::thread::ScaleType::OnlyAlphaScaling>; }; } // namespace ort_fastertransformer diff --git a/onnxruntime/contrib_ops/cuda/moe/ft_moe/moe_gemm_kernels.h b/onnxruntime/contrib_ops/cuda/moe/ft_moe/moe_gemm_kernels.h index 60608f462fde5..e0f91ab806c85 100644 --- a/onnxruntime/contrib_ops/cuda/moe/ft_moe/moe_gemm_kernels.h +++ b/onnxruntime/contrib_ops/cuda/moe/ft_moe/moe_gemm_kernels.h @@ -42,8 +42,13 @@ class MoeGemmRunner { int64_t* total_rows_before_expert, int64_t total_rows, int64_t gemm_n, int64_t gemm_k, int num_experts, ActivationType activation_type, cudaStream_t stream); - void moe_gemm(const T* A, const WeightType* B, const T* weight_scales, T* C, int64_t* total_rows_before_expert, - int64_t total_rows, int64_t gemm_n, int64_t gemm_k, int num_experts, cudaStream_t stream); + void moe_gemm_act(const T* A, const WeightType* B, const T* weight_scales, T* C, int64_t* total_rows_before_expert, + int64_t 
total_rows, int64_t gemm_n, int64_t gemm_k, int num_experts, + ActivationType activation_type, cudaStream_t stream); + + void moe_gemm(const T* A, const WeightType* B, const T* weight_scales, const T* biases, T* C, + int64_t* total_rows_before_expert, int64_t total_rows, int64_t gemm_n, int64_t gemm_k, + int num_experts, cudaStream_t stream); private: template diff --git a/onnxruntime/contrib_ops/cuda/moe/ft_moe/moe_gemm_kernels_template.h b/onnxruntime/contrib_ops/cuda/moe/ft_moe/moe_gemm_kernels_template.h index a3dcf0da16b98..2a15fdfd1cc1a 100644 --- a/onnxruntime/contrib_ops/cuda/moe/ft_moe/moe_gemm_kernels_template.h +++ b/onnxruntime/contrib_ops/cuda/moe/ft_moe/moe_gemm_kernels_template.h @@ -311,8 +311,8 @@ void dispatch_moe_gemm_to_cutlass(const T* A, const WeightType* B, const T* weig template ::value>::type* = nullptr> void dispatch_moe_gemm_to_cutlass(const T* A, const WeightType* B, const T* weight_scales, const T* biases, T* C, - int64_t* total_rows_before_expert, int64_t /*total_rows*/, int64_t gemm_n, int64_t gemm_k, - int num_experts, CutlassGemmConfig gemm_config, int /*sm_version*/, + int64_t* total_rows_before_expert, int64_t /*total_rows*/, int64_t gemm_n, + int64_t gemm_k, int num_experts, CutlassGemmConfig gemm_config, int /*sm_version*/, int multi_processor_count, cudaStream_t stream, int* occupancy = nullptr) { switch (gemm_config.tile_config) { case CutlassTileConfig::CtaShape128x128x8_WarpShape64x64x8: @@ -429,11 +429,47 @@ void MoeGemmRunner::moe_gemm_bias_act(const T* A, const WeightTyp } template -void MoeGemmRunner::moe_gemm(const T* A, const WeightType* B, const T* weight_scales, T* C, - int64_t* total_rows_before_expert, int64_t total_rows, int64_t gemm_n, - int64_t gemm_k, int num_experts, cudaStream_t stream) { - run_gemm(A, B, weight_scales, nullptr, C, total_rows_before_expert, total_rows, gemm_n, gemm_k, +void MoeGemmRunner::moe_gemm_act(const T* A, const WeightType* B, const T* weight_scales, + T* C, int64_t* total_rows_before_expert, int64_t total_rows, + int64_t gemm_n, int64_t gemm_k, int num_experts, + ActivationType activation_type, cudaStream_t stream) { + switch (activation_type) { + case ActivationType::Relu: + run_gemm(A, B, weight_scales, nullptr, C, total_rows_before_expert, total_rows, gemm_n, + gemm_k, num_experts, stream); + break; + case ActivationType::Gelu: + run_gemm(A, B, weight_scales, nullptr, C, total_rows_before_expert, total_rows, gemm_n, + gemm_k, num_experts, stream); + break; + case ActivationType::Silu: + run_gemm(A, B, weight_scales, nullptr, C, total_rows_before_expert, total_rows, gemm_n, + gemm_k, num_experts, stream); + break; + case ActivationType::Identity: + run_gemm(A, B, weight_scales, nullptr, C, total_rows_before_expert, total_rows, gemm_n, gemm_k, + num_experts, stream); + break; + case ActivationType::InvalidType: + ORT_THROW("[FT Error][MoE Runner] Invalid activation type for MoE GEMM"); + break; + default: { + ORT_THROW("[FT Error][MoE Runner] Invalid activation type for MoE GEMM"); + } + } +} + +template +void MoeGemmRunner::moe_gemm(const T* A, const WeightType* B, const T* weight_scales, const T* biases, + T* C, int64_t* total_rows_before_expert, int64_t total_rows, + int64_t gemm_n, int64_t gemm_k, int num_experts, cudaStream_t stream) { + if (biases != nullptr) { + run_gemm(A, B, weight_scales, biases, C, total_rows_before_expert, total_rows, gemm_n, gemm_k, num_experts, stream); + } else { + run_gemm(A, B, weight_scales, nullptr, C, total_rows_before_expert, total_rows, gemm_n, gemm_k, + num_experts, 
stream); + } } } // namespace ort_fastertransformer diff --git a/onnxruntime/contrib_ops/cuda/moe/ft_moe/moe_kernel.cu b/onnxruntime/contrib_ops/cuda/moe/ft_moe/moe_kernel.cu index a5b47bcddefbc..5e6e484567988 100644 --- a/onnxruntime/contrib_ops/cuda/moe/ft_moe/moe_kernel.cu +++ b/onnxruntime/contrib_ops/cuda/moe/ft_moe/moe_kernel.cu @@ -30,7 +30,6 @@ #include "cutlass/array.h" #include "cutlass/numeric_conversion.h" -#include "cutlass/numeric_types.h" #ifdef __GNUC__ #pragma GCC diagnostic pop @@ -49,15 +48,14 @@ #endif namespace ort_fastertransformer { - static constexpr int WARP_SIZE = 32; // ====================== Softmax things =============================== // We have our own implementation of softmax here so we can support transposing the output // in the softmax kernel when we extend this module to support expert-choice routing. template -__launch_bounds__(TPB) __global__ - void moe_softmax(const T* input, const bool* finished, T* output, const int num_cols) { +__launch_bounds__(TPB) __global__ void moe_softmax(const T* input, const bool* finished, T* output, + const int num_cols) { using BlockReduce = cub::BlockReduce; __shared__ typename BlockReduce::TempStorage tmpStorage; @@ -108,14 +106,15 @@ __launch_bounds__(TPB) __global__ #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 530 template -__launch_bounds__(TPB) __global__ void moe_top_k(const T*, const bool*, T*, int*, int*, int, const int) { +__launch_bounds__(TPB) __global__ void moe_top_k(const T*, const bool*, T*, int*, int*, int, int, bool) { // Does not support pre-Kepler architectures ; } #else template __launch_bounds__(TPB) __global__ void moe_top_k(const T* inputs_after_softmax, const bool* finished, T* output, - int* indices, int* source_rows, int num_experts, int k) { + int* indices, int* source_rows, int num_experts, int k, + bool normalize_routing_weights) { using cub_kvp = cub::KeyValuePair; using BlockReduce = cub::BlockReduce; __shared__ typename BlockReduce::TempStorage tmpStorage; @@ -128,6 +127,7 @@ __launch_bounds__(TPB) __global__ void moe_top_k(const T* inputs_after_softmax, const bool should_process_row = finished ? !finished[block_row] : true; const int thread_read_offset = blockIdx.x * num_experts; + float output_row_sum = 0.f; for (int k_idx = 0; k_idx < k; ++k_idx) { thread_kvp.key = 0; thread_kvp.value = T(-1.f); // This is OK because inputs are probabilities @@ -155,6 +155,13 @@ __launch_bounds__(TPB) __global__ void moe_top_k(const T* inputs_after_softmax, output[idx] = result_kvp.value; indices[idx] = should_process_row ? result_kvp.key : num_experts; source_rows[idx] = k_idx * num_rows + block_row; + + if (normalize_routing_weights && k_idx == k - 1) { +#pragma unroll + for (int ki = 0; ki < k; ++ki) { + output[idx - ki] = T(static_cast(output[idx - ki]) / output_row_sum); + } + } } __syncthreads(); } @@ -178,7 +185,7 @@ __launch_bounds__(TPB) __global__ void moe_top_k(const T* inputs_after_softmax, template __launch_bounds__(WARPS_PER_CTA* WARP_SIZE) __global__ void topk_gating_softmax(const T* input, const bool* finished, T* output, int num_rows, int* indices, - int* source_rows, int k) { + int* source_rows, int k, bool normalize_routing_weights) { // We begin by enforcing compile time assertions and setting up compile time constants. 
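For clarity, the `normalize_routing_weights` flag threaded through `moe_top_k` and `topk_gating_softmax` rescales each row's k selected routing weights so they sum to 1 after top-k selection. The following is a plain per-row TypeScript sketch of that gating logic, not the warp-level CUDA implementation, and the function and parameter names are illustrative only:

```ts
// Softmax over expert logits, pick top-k, optionally renormalize the k weights.
function topKGating(logits: number[], k: number, normalizeRoutingWeights: boolean):
    {weights: number[]; experts: number[]} {
  const maxLogit = Math.max(...logits);
  const exps = logits.map(v => Math.exp(v - maxLogit));
  const denom = exps.reduce((a, b) => a + b, 0);
  const probs = exps.map(v => v / denom);

  // Indices of the k largest probabilities, in descending order.
  const order = probs.map((_, i) => i).sort((a, b) => probs[b] - probs[a]);
  const experts = order.slice(0, k);
  let weights = experts.map(i => probs[i]);

  if (normalizeRoutingWeights) {
    const sum = weights.reduce((a, b) => a + b, 0);
    weights = weights.map(w => w / sum);
  }
  return {weights, experts};
}
```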
static_assert(VPT == (VPT & -VPT), "VPT must be power of 2"); static_assert(NUM_EXPERTS == (NUM_EXPERTS & -NUM_EXPERTS), "NUM_EXPERTS must be power of 2"); @@ -296,6 +303,7 @@ __launch_bounds__(WARPS_PER_CTA* WARP_SIZE) __global__ int start_col = first_elt_read_by_thread; static constexpr int COLS_PER_GROUP_LDG = ELTS_PER_LDG * THREADS_PER_ROW; + float output_row_sum = 0.f; for (int k_idx = 0; k_idx < k; ++k_idx) { // First, each thread does the local argmax float max_val = row_chunk[0]; @@ -336,8 +344,16 @@ __launch_bounds__(WARPS_PER_CTA* WARP_SIZE) __global__ // single) thread per row of the input/output matrices. const int idx = k * thread_row + k_idx; output[idx] = T(max_val); + output_row_sum = output_row_sum + static_cast(max_val); indices[idx] = should_process_row ? expert : NUM_EXPERTS; source_rows[idx] = k_idx * num_rows + thread_row; + + if (normalize_routing_weights && k_idx == k - 1) { +#pragma unroll + for (int ki = 0; ki < k; ++ki) { + output[idx - ki] = T(static_cast(output[idx - ki]) / output_row_sum); + } + } } // Finally, we clear the value in the thread with the current max if there is another iteration to run. @@ -370,7 +386,8 @@ struct TopkConstants { template void topk_gating_softmax_launcher_helper(const T* input, const bool* finished, T* output, int* indices, int* source_row, - int num_rows, int /*num_experts*/, int k, cudaStream_t stream) { + int num_rows, int /*num_experts*/, int k, bool normalize_routing_weights, + cudaStream_t stream) { static constexpr unsigned long MAX_BYTES_PER_LDG = 16; static constexpr int BYTES_PER_LDG = std::min((int)MAX_BYTES_PER_LDG, (int)sizeof(T) * EXPERTS); @@ -382,61 +399,63 @@ void topk_gating_softmax_launcher_helper(const T* input, const bool* finished, T dim3 block_dim(WARP_SIZE, WARPS_PER_TB); topk_gating_softmax - <<>>(input, finished, output, num_rows, indices, source_row, k); + <<>>(input, finished, output, num_rows, indices, source_row, k, + normalize_routing_weights); } template void topk_gating_softmax_kernelLauncher(const T* input, const bool* finished, T* output, T* softmax_temp_output, int* indices, int* source_row, int num_rows, int num_experts, - int k, cudaStream_t stream) { + int k, bool normalize_routing_weights, cudaStream_t stream) { static constexpr int WARPS_PER_TB = 4; switch (num_experts) { case 2: { topk_gating_softmax_launcher_helper(input, finished, output, indices, source_row, num_rows, - num_experts, k, stream); + num_experts, k, normalize_routing_weights, stream); break; } case 4: { topk_gating_softmax_launcher_helper(input, finished, output, indices, source_row, num_rows, - num_experts, k, stream); + num_experts, k, normalize_routing_weights, stream); break; } case 8: { topk_gating_softmax_launcher_helper(input, finished, output, indices, source_row, num_rows, - num_experts, k, stream); + num_experts, k, normalize_routing_weights, stream); break; } case 16: { topk_gating_softmax_launcher_helper(input, finished, output, indices, source_row, num_rows, - num_experts, k, stream); + num_experts, k, normalize_routing_weights, stream); break; } case 32: { topk_gating_softmax_launcher_helper(input, finished, output, indices, source_row, num_rows, - num_experts, k, stream); + num_experts, k, normalize_routing_weights, stream); break; } case 64: { topk_gating_softmax_launcher_helper(input, finished, output, indices, source_row, num_rows, - num_experts, k, stream); + num_experts, k, normalize_routing_weights, stream); break; } case 128: { topk_gating_softmax_launcher_helper(input, finished, output, indices, 
source_row, num_rows, - num_experts, k, stream); + num_experts, k, normalize_routing_weights, stream); break; } case 256: { topk_gating_softmax_launcher_helper(input, finished, output, indices, source_row, num_rows, - num_experts, k, stream); + num_experts, k, normalize_routing_weights, stream); break; } default: { static constexpr int TPB = 256; moe_softmax<<>>(input, finished, softmax_temp_output, num_experts); moe_top_k - <<>>(softmax_temp_output, finished, output, indices, source_row, num_experts, k); + <<>>(softmax_temp_output, finished, output, indices, source_row, num_experts, k, + normalize_routing_weights); } } } @@ -521,25 +540,31 @@ __global__ void dispatch_activations_kernel(int64_t* total_rows_before_expert, i } template -CutlassMoeFCRunner::CutlassMoeFCRunner(int sm_version) { - total_past_rows_ = 0; - total_covered_rows_ = 0; +CutlassMoeFCRunner::CutlassMoeFCRunner(int sm_version, + bool has_fc3, + bool normalize_routing_weights) + : has_fc3_(has_fc3), + total_past_rows_(0), + total_covered_rows_(0), + normalize_routing_weights_(normalize_routing_weights) { moe_gemm_runner_.initialize(sm_version); } template -size_t CutlassMoeFCRunner::getWorkspaceSize(int num_rows, const int hidden_size, - const int inter_size, int num_experts, - int k) { - const int buf_size = static_cast(pad_to_multiple_of_16(k * num_rows * hidden_size)); - const int interbuf_size = static_cast(pad_to_multiple_of_16(k * num_rows * inter_size)); - const int padded_experts = static_cast(pad_to_multiple_of_16(num_experts)); - const int num_moe_inputs = static_cast(pad_to_multiple_of_16(k * num_rows)); - int num_softmax_outs = 0; +size_t CutlassMoeFCRunner::getWorkspaceSize(size_t num_rows, const size_t hidden_size, + const size_t inter_size, size_t num_experts, + size_t k) { + total_covered_rows_ = k * num_rows; + + const size_t buf_size = pad_to_multiple_of_16(k * num_rows * hidden_size); + const size_t interbuf_size = pad_to_multiple_of_16(k * num_rows * inter_size); + const size_t padded_experts = pad_to_multiple_of_16(num_experts); + const size_t num_moe_inputs = pad_to_multiple_of_16(k * num_rows); + size_t num_softmax_outs = 0; const bool is_pow_2 = (num_experts != 0) && ((num_experts & (num_experts - 1)) == 0); if (!is_pow_2 || num_experts > 256) { - num_softmax_outs = static_cast(pad_to_multiple_of_16(num_rows * num_experts)); + num_softmax_outs = pad_to_multiple_of_16(num_rows * num_experts); } // softmax output, permuted_rows and permuted_experts have moved to outside of moe kernel, allocate them @@ -548,13 +573,13 @@ size_t CutlassMoeFCRunner::getWorkspaceSize(int num_rows, total_ws_bytes += buf_size * sizeof(T); // permuted_data total_ws_bytes += padded_experts * sizeof(int64_t); // Hold total_rows_before_expert_ total_ws_bytes += num_softmax_outs * sizeof(T); - const int bytes_for_fc1_result = interbuf_size * sizeof(T); - const int sorter_ws_size_bytes = static_cast(pad_to_multiple_of_16(sorter_.getWorkspaceSize(num_rows))); - sorter_.update_num_experts(num_experts); + const size_t bytes_for_fc1_result = has_fc3_ ? 
2 * interbuf_size * sizeof(T) : interbuf_size * sizeof(T); + const size_t sorter_ws_size_bytes = pad_to_multiple_of_16(sorter_.getWorkspaceSize(num_rows)); + sorter_.update_num_experts(static_cast(num_experts)); - int bytes_for_intermediate_and_sorting = bytes_for_fc1_result; + size_t bytes_for_intermediate_and_sorting = bytes_for_fc1_result; if (sorter_ws_size_bytes > bytes_for_fc1_result) { - int remaining_bytes = static_cast(pad_to_multiple_of_16(sorter_ws_size_bytes - bytes_for_fc1_result)); + size_t remaining_bytes = pad_to_multiple_of_16(sorter_ws_size_bytes - bytes_for_fc1_result); bytes_for_intermediate_and_sorting += remaining_bytes; } @@ -563,13 +588,13 @@ size_t CutlassMoeFCRunner::getWorkspaceSize(int num_rows, } template -void CutlassMoeFCRunner::configure_ws_ptrs(char* ws_ptr, int num_rows, - const int hidden_size, const int inter_size, - int num_experts, int k) { - const int buf_size = static_cast(pad_to_multiple_of_16(k * num_rows * hidden_size)); - const int interbuf_size = static_cast(pad_to_multiple_of_16(k * num_rows * inter_size)); - const int padded_experts = static_cast(pad_to_multiple_of_16(num_experts)); - const int num_moe_inputs = static_cast(pad_to_multiple_of_16(k * num_rows)); +void CutlassMoeFCRunner::configure_ws_ptrs(char* ws_ptr, size_t num_rows, + const size_t hidden_size, const size_t inter_size, + size_t num_experts, size_t k) { + const size_t buf_size = pad_to_multiple_of_16(k * num_rows * hidden_size); + const size_t interbuf_size = pad_to_multiple_of_16(k * num_rows * inter_size); + const size_t padded_experts = pad_to_multiple_of_16(num_experts); + const size_t num_moe_inputs = pad_to_multiple_of_16(k * num_rows); source_rows_ = (int*)ws_ptr; permuted_rows_ = source_rows_ + num_moe_inputs; @@ -578,28 +603,130 @@ void CutlassMoeFCRunner::configure_ws_ptrs(char* ws_ptr, total_rows_before_expert_ = (int64_t*)(permuted_data_ + buf_size); - fc1_result_ = (T*)(total_rows_before_expert_ + padded_experts); + if (has_fc3_) { + fc3_result_ = reinterpret_cast(total_rows_before_expert_ + padded_experts); + fc1_result_ = reinterpret_cast(fc3_result_ + interbuf_size); + } else { + fc1_result_ = reinterpret_cast(total_rows_before_expert_ + padded_experts); + } const bool is_pow_2 = (num_experts != 0) && ((num_experts & (num_experts - 1)) == 0); if (!is_pow_2 || num_experts > 256) { - softmax_out_ = (T*)(fc1_result_ + interbuf_size); + softmax_out_ = reinterpret_cast(fc1_result_ + interbuf_size); } else { softmax_out_ = nullptr; } } +namespace { + +struct __align__(8) Half4 { + half2 x; + half2 y; +}; + +// TODO(wy): move to common header +template +struct T4; +template <> +struct T4 { + using Type = float4; +}; +template <> +struct T4 { + using Type = Half4; +}; + +template +struct T2; +template <> +struct T2 { + using Type = float2; +}; +template <> +struct T2 { + using Type = half2; +}; + +inline __device__ float2 operator*(const float2 a, const float2 b) { + return make_float2(a.x * b.x, a.y * b.y); +} + +inline __device__ float4 operator*(const float4 a, const float4 b) { + return make_float4(a.x * b.x, a.y * b.y, a.z * b.z, a.w * b.w); +} + +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 530 +inline __device__ half operator*(const half a, const half b) { + return __float2half(__half2float(a) * __half2float(b)); +} + +inline __device__ half2 operator*(const half2 a, const half2 b) { + return make_half2(a.x * b.x, a.y * b.y); +} +#endif + +inline __device__ Half4 operator*(const Half4 a, const Half4 b) { +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 530 + Half4 
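getWorkspaceSize and configure_ws_ptrs now do their arithmetic in size_t and lay the workspace out as index scratch, permuted activations, per-expert row counts, and then either one or two FC intermediate buffers of k * num_rows * inter_size elements each, depending on has_fc3 (FC3 needs its own scratch before it is multiplied into FC1's result). A rough host-side sketch of that byte accounting under those assumptions; moe_workspace_bytes and extra_bytes are illustrative names, and the sorter and non-power-of-two softmax scratch handled by the real runner are folded into extra_bytes here.

#include <cstddef>
#include <cstdint>

inline size_t pad_to_multiple_of_16(size_t x) { return (x + 15) & ~static_cast<size_t>(15); }

// Illustrative sketch of the workspace accounting; the exact int-buffer count and the
// extra sorter/softmax scratch are assumptions folded into the last two terms.
size_t moe_workspace_bytes(size_t num_rows, size_t hidden_size, size_t inter_size,
                           size_t num_experts, size_t k, bool has_fc3,
                           size_t elem_size, size_t extra_bytes = 0) {
  const size_t buf_size = pad_to_multiple_of_16(k * num_rows * hidden_size);
  const size_t interbuf_size = pad_to_multiple_of_16(k * num_rows * inter_size);
  const size_t padded_experts = pad_to_multiple_of_16(num_experts);
  const size_t num_moe_inputs = pad_to_multiple_of_16(k * num_rows);

  size_t bytes = 0;
  bytes += 3 * num_moe_inputs * sizeof(int);              // source/permuted rows, expert ids (assumed)
  bytes += buf_size * elem_size;                          // permuted_data_
  bytes += padded_experts * sizeof(int64_t);              // total_rows_before_expert_
  bytes += (has_fc3 ? 2 : 1) * interbuf_size * elem_size; // fc1_result_ (+ fc3_result_)
  return bytes + extra_bytes;
}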
result; + result.x = a.x * b.x; + result.y = a.y * b.y; + return result; +#else + return Half4{__hmul2(a.x, b.x), __hmul2(a.y, b.y)}; +#endif +} + +} // anonymous namespace + +template +__global__ void elementWiseMulKernel(T* output, T const* input, size_t inter_size) { + int const tid = threadIdx.x; + int const token = blockIdx.x; + + output = output + token * inter_size; + input = input + token * inter_size; + for (int i = tid; i < inter_size; i += blockDim.x) { + T fc1_value = input[i]; + output[i] = fc1_value * output[i]; + } +} + +template +void elementWiseMul(T* output, T const* input, int inter_size, int num_tokens, cudaStream_t stream) { + int const blocks = num_tokens; + + if (inter_size & 3 == 0) { + using vec_type = typename T4::Type; + int const threads = std::min(inter_size / 4, 1024); + elementWiseMulKernel<<>>(reinterpret_cast(output), + reinterpret_cast(input), + inter_size / 4); + } else if (inter_size & 1 == 0) { + using vec_type = typename T2::Type; + int const threads = std::min(inter_size / 2, 1024); + elementWiseMulKernel<<>>(reinterpret_cast(output), + reinterpret_cast(input), + inter_size / 2); + } else { + int const threads = std::min(inter_size, 1024); + elementWiseMulKernel<<>>(output, input, inter_size); + } +} + template void CutlassMoeFCRunner::run_moe_fc( const T* input_activations, const T* gating_output, const WeightType* fc1_expert_weights, const T* fc1_scales, - const T* fc1_expert_biases, ActivationType fc1_activation_type, const WeightType* fc2_expert_weights, - const T* fc2_scales, int num_rows, const int hidden_size, const int inter_size, int num_experts, - int local_num_experts, int local_experts_start_index, int k, char* workspace_ptr, T* fc2_result, - const bool* finished, int active_rows, T* expert_scales, int* expanded_source_row_to_expanded_dest_row, - int* expert_for_source_row, cudaStream_t stream) { + const T* fc1_expert_biases, ActivationType fc1_activation_type, const WeightType* fc3_expert_weights, + const T* fc3_scales, const T* fc3_expert_biases, const WeightType* fc2_expert_weights, const T* fc2_scales, + int num_rows, const int hidden_size, const int inter_size, int num_experts, int local_num_experts, + int local_experts_start_index, int k, char* workspace_ptr, T* fc2_result, const bool* finished, int active_rows, + T* expert_scales, int* expanded_source_row_to_expanded_dest_row, int* expert_for_source_row, + cudaStream_t stream) { static constexpr bool scales_required = std::is_same::value || std::is_same::value; - if constexpr (scales_required) { + if (scales_required) { if (fc1_scales == nullptr) { ORT_THROW("[FT Error][Run MoE FC] Scales expected but scale for first matmul is a null pointer"); } else if (fc2_scales == nullptr) { @@ -613,9 +740,10 @@ void CutlassMoeFCRunner::run_moe_fc( } } - configure_ws_ptrs(workspace_ptr, num_rows, hidden_size, inter_size, num_experts, k); + configure_ws_ptrs(workspace_ptr, static_cast(num_rows), static_cast(hidden_size), + static_cast(inter_size), static_cast(num_experts), static_cast(k)); topk_gating_softmax_kernelLauncher(gating_output, finished, expert_scales, softmax_out_, expert_for_source_row, - source_rows_, num_rows, num_experts, k, stream); + source_rows_, num_rows, num_experts, k, normalize_routing_weights_, stream); const int sorter_ws_size_bytes = static_cast(pad_to_multiple_of_16(sorter_.getWorkspaceSize(k * num_rows))); sorter_.run((void*)fc1_result_, sorter_ws_size_bytes, expert_for_source_row, permuted_experts_, source_rows_, @@ -634,15 +762,48 @@ void 
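elementWiseMul launches one block per token and multiplies FC3's output into FC1's activated output in place, choosing a 4-wide, 2-wide, or scalar path from the divisibility of inter_size. A host-side sketch of that dispatch plus a scalar reference of the per-token work; the helper names are illustrative, and the divisibility checks are written with explicit parentheses, i.e. (inter_size & 3) == 0 and (inter_size & 1) == 0.

#include <algorithm>

// Sketch of the width selection for the vectorized elementwise product: prefer
// 4-wide loads (float4/Half4), then 2-wide (float2/half2), else scalar.
struct LaunchPlan {
  int vector_width;
  int threads_per_block;
};

LaunchPlan pick_elementwise_mul_plan(int inter_size) {
  if ((inter_size & 3) == 0) return {4, std::min(inter_size / 4, 1024)};
  if ((inter_size & 1) == 0) return {2, std::min(inter_size / 2, 1024)};
  return {1, std::min(inter_size, 1024)};
}

// Scalar reference of what each block computes for one token: output *= input.
void elementwise_mul_reference(float* output, const float* input, int inter_size) {
  for (int i = 0; i < inter_size; ++i) output[i] *= input[i];
}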
CutlassMoeFCRunner::run_moe_fc( } // expanded_active_expert_rows is not used - moe_gemm_runner_.moe_gemm_bias_act(permuted_data_ + total_past_rows_ * hidden_size, - fc1_expert_weights, fc1_scales, fc1_expert_biases, - fc1_result_ + total_past_rows_ * inter_size, - total_rows_before_expert_ + local_experts_start_index, - expanded_active_expert_rows, inter_size, hidden_size, - local_num_experts, fc1_activation_type, stream); + if (fc1_expert_biases != nullptr) { + moe_gemm_runner_.moe_gemm_bias_act(permuted_data_ + total_past_rows_ * hidden_size, + fc1_expert_weights, fc1_scales, fc1_expert_biases, + fc1_result_ + total_past_rows_ * inter_size, + total_rows_before_expert_ + local_experts_start_index, + expanded_active_expert_rows, inter_size, hidden_size, + local_num_experts, fc1_activation_type, stream); + } else { + moe_gemm_runner_.moe_gemm_act(permuted_data_ + total_past_rows_ * hidden_size, + fc1_expert_weights, fc1_scales, + fc1_result_ + total_past_rows_ * inter_size, + total_rows_before_expert_ + local_experts_start_index, + expanded_active_expert_rows, inter_size, hidden_size, + local_num_experts, fc1_activation_type, stream); + } + + if (has_fc3_) { + if (scales_required) { + if (fc3_scales == nullptr) { + ORT_THROW("[FT Error][Run MoE FC] Scales expected but scale for third matmul is a null pointer"); + } + } else { + if (fc3_scales != nullptr) { + ORT_THROW("[FT Error][Run MoE FC] Scales are ignored for fp32/fp16/bf16 but received scale for FC3"); + } + } + if (fc3_expert_weights == nullptr) { + ORT_THROW("[FT Error][Run MoE FC] FC3 weights are null"); + } + moe_gemm_runner_.moe_gemm(permuted_data_ + total_past_rows_ * hidden_size, + fc3_expert_weights, fc3_scales, fc3_expert_biases, + fc3_result_ + total_past_rows_ * inter_size, + total_rows_before_expert_ + local_experts_start_index, + expanded_active_expert_rows, inter_size, hidden_size, + local_num_experts, stream); + + elementWiseMul(fc1_result_ + total_past_rows_ * inter_size, fc3_result_ + total_past_rows_ * inter_size, + static_cast(inter_size), static_cast(total_covered_rows_), stream); + } moe_gemm_runner_.moe_gemm(fc1_result_ + total_past_rows_ * inter_size, - fc2_expert_weights, fc2_scales, + fc2_expert_weights, fc2_scales, nullptr, fc2_result + total_past_rows_ * hidden_size, total_rows_before_expert_ + local_experts_start_index, expanded_active_expert_rows, hidden_size, inter_size, local_num_experts, stream); @@ -651,14 +812,16 @@ void CutlassMoeFCRunner::run_moe_fc( template void CutlassMoeFCRunner::run_moe_fc( const T* input_activations, const T* gating_output, const WeightType* fc1_expert_weights, const T* fc1_scales, - const T* fc1_expert_biases, ActivationType fc1_activation_type, const WeightType* fc2_expert_weights, - const T* fc2_scales, int num_rows, const int hidden_size, const int inter_size, int num_experts, - int local_num_experts, int local_experts_start_index, int k, char* workspace_ptr, T* fc2_result, T* expert_scales, + const T* fc1_expert_biases, ActivationType fc1_activation_type, const WeightType* fc3_expert_weights, + const T* fc3_scales, const T* fc3_expert_biases, const WeightType* fc2_expert_weights, const T* fc2_scales, + int num_rows, const int hidden_size, const int inter_size, int num_experts, int local_num_experts, + int local_experts_start_index, int k, char* workspace_ptr, T* fc2_result, T* expert_scales, int* expanded_source_row_to_expanded_dest_row, int* expert_for_source_row, cudaStream_t stream) { run_moe_fc(input_activations, gating_output, fc1_expert_weights, fc1_scales, 
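With has_fc3_ set, each expert is a gated FFN: FC1 (optionally biased, with the activation) and FC3 run on the same permuted rows, their outputs are multiplied elementwise, and FC2 projects back to hidden_size, which is the Mixtral-style block referenced in the op documentation. A single-token, single-expert CPU reference of that data flow; the helper names are illustrative and silu is only an assumed choice of activation.

#include <cmath>
#include <vector>

// Reference-only math for one token and one expert of the FC1/FC3/FC2 path:
//   h = act(x * W1 + b1)   (moe_gemm_bias_act / moe_gemm_act)
//   g = x * W3 + b3        (the FC3 moe_gemm)
//   y = (h * g, elementwise) * W2   (elementWiseMul, then the FC2 moe_gemm)
// Weights are stored row-major as [in_dim][out_dim]; silu is an assumed activation.
static float silu(float v) { return v / (1.f + std::exp(-v)); }

std::vector<float> matvec(const std::vector<float>& x,
                          const std::vector<std::vector<float>>& W,
                          const std::vector<float>* bias) {
  std::vector<float> y(W[0].size(), 0.f);
  for (size_t i = 0; i < x.size(); ++i)
    for (size_t j = 0; j < y.size(); ++j) y[j] += x[i] * W[i][j];
  if (bias)
    for (size_t j = 0; j < y.size(); ++j) y[j] += (*bias)[j];
  return y;
}

std::vector<float> gated_expert_ffn(const std::vector<float>& x,
                                    const std::vector<std::vector<float>>& W1,
                                    const std::vector<float>* b1,
                                    const std::vector<std::vector<float>>& W3,
                                    const std::vector<float>* b3,
                                    const std::vector<std::vector<float>>& W2) {
  std::vector<float> h = matvec(x, W1, b1);
  for (float& v : h) v = silu(v);
  std::vector<float> g = matvec(x, W3, b3);
  for (size_t i = 0; i < h.size(); ++i) h[i] *= g[i];
  return matvec(h, W2, nullptr);
}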
fc1_expert_biases, fc1_activation_type, - fc2_expert_weights, fc2_scales, num_rows, hidden_size, inter_size, num_experts, local_num_experts, - local_experts_start_index, k, workspace_ptr, fc2_result, nullptr, num_rows, expert_scales, - expanded_source_row_to_expanded_dest_row, expert_for_source_row, stream); + fc3_expert_weights, fc3_scales, fc3_expert_biases, fc2_expert_weights, fc2_scales, num_rows, hidden_size, + inter_size, num_experts, local_num_experts, local_experts_start_index, k, workspace_ptr, fc2_result, + nullptr, num_rows, expert_scales, expanded_source_row_to_expanded_dest_row, expert_for_source_row, + stream); } template @@ -811,9 +974,10 @@ __global__ void finalize_moe_routing_kernel(const T* expanded_permuted_rows, T* const T* expanded_permuted_rows_row_ptr = expanded_permuted_rows + expanded_permuted_row * cols; const int expert_idx = expert_for_source_row[k_offset]; - const T* bias_ptr = bias + expert_idx * cols; + const T* bias_ptr = bias ? bias + expert_idx * cols : nullptr; - thread_output = thread_output + row_scale * (expanded_permuted_rows_row_ptr[tid] + bias_ptr[tid]); + thread_output = thread_output + row_scale * (expanded_permuted_rows_row_ptr[tid] + + (bias_ptr ? bias_ptr[tid] : T(0))); } reduced_row_ptr[tid] = thread_output; } @@ -866,9 +1030,9 @@ void finalize_moe_routing_kernelLauncher(const T* expanded_permuted_rows, T* red // ========================= TopK Softmax specializations =========================== template void topk_gating_softmax_kernelLauncher(const float*, const bool*, float*, float*, int*, int*, int, - int, int, cudaStream_t); + int, int, bool, cudaStream_t); template void topk_gating_softmax_kernelLauncher(const half*, const bool*, half*, half*, int*, int*, int, - int, int, cudaStream_t); + int, int, bool, cudaStream_t); // ==================== Variable batched GEMM specializations ================================== template class CutlassMoeFCRunner; diff --git a/onnxruntime/contrib_ops/cuda/moe/ft_moe/moe_kernel.h b/onnxruntime/contrib_ops/cuda/moe/ft_moe/moe_kernel.h index 5cc2a3f79f003..5eef6f95f4820 100644 --- a/onnxruntime/contrib_ops/cuda/moe/ft_moe/moe_kernel.h +++ b/onnxruntime/contrib_ops/cuda/moe/ft_moe/moe_kernel.h @@ -24,6 +24,8 @@ #include "core/common/common.h" #include "contrib_ops/cuda/bert/transformer_cuda_common.h" +#include "cutlass/numeric_types.h" + using namespace onnxruntime; namespace ort_fastertransformer { @@ -107,12 +109,13 @@ template class CutlassMoeFCRunner { public: - CutlassMoeFCRunner(int sm_version); + CutlassMoeFCRunner(int sm_version, bool has_fc3, bool normalize_routing_weights); - size_t getWorkspaceSize(int num_rows, int hidden_size, int inter_size, int num_experts, int k); + size_t getWorkspaceSize(size_t num_rows, size_t hidden_size, size_t inter_size, size_t num_experts, size_t k); void run_moe_fc(const T* input_activations, const T* gating_output, const WeightType* fc1_expert_weights, const T* fc1_scales, const T* fc1_expert_biases, ActivationType fc1_activation_type, + const WeightType* fc3_expert_weights, const T* fc3_scales, const T* fc3_expert_biases, const WeightType* fc2_expert_weights, const T* fc2_scales, int num_rows, int hidden_size, int inter_size, int num_experts, int local_num_experts, int local_experts_start_index, int k, char* workspace_ptr, T* fc2_result, T* expert_scales, int* expanded_source_row_to_expanded_dest_row, @@ -120,6 +123,7 @@ class CutlassMoeFCRunner { void run_moe_fc(const T* input_activations, const T* gating_output, const WeightType* fc1_expert_weights, const T* 
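finalize_moe_routing_kernel above now treats the FC2 bias as optional: each destination row is the sum, over its k selected experts, of the routing weight times (expert output plus bias when a bias tensor exists). A scalar reference of that reduction; finalize_row and its parameter names are illustrative.

#include <vector>

// Reference-only sketch of the finalize/unpermute step for one output row:
// reduced[c] = sum over k of scale[k] * (expanded_rows[k][c] + bias[expert_k][c]),
// with the bias term dropped when no bias tensor was provided.
void finalize_row(const std::vector<std::vector<float>>& expanded_rows,  // k rows of size cols
                  const std::vector<float>& scales,                      // k routing weights
                  const std::vector<int>& experts,                       // k expert ids
                  const std::vector<std::vector<float>>* fc2_bias,       // per-expert bias or nullptr
                  std::vector<float>& reduced) {
  const size_t cols = expanded_rows[0].size();
  reduced.assign(cols, 0.f);
  for (size_t k = 0; k < expanded_rows.size(); ++k) {
    for (size_t c = 0; c < cols; ++c) {
      const float b = fc2_bias ? (*fc2_bias)[experts[k]][c] : 0.f;
      reduced[c] += scales[k] * (expanded_rows[k][c] + b);
    }
  }
}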
fc1_scales, const T* fc1_expert_biases, ActivationType fc1_activation_type, + const WeightType* fc3_expert_weights, const T* fc3_scales, const T* fc3_expert_biases, const WeightType* fc2_expert_weights, const T* fc2_scales, int num_rows, int hidden_size, int inter_size, int num_experts, int local_num_experts, int local_experts_start_index, int k, char* workspace_ptr, T* fc2_result, const bool* finished, int active_rows, T* expert_scales, @@ -135,7 +139,8 @@ class CutlassMoeFCRunner { int64_t& total_covered_rows); private: - void configure_ws_ptrs(char* ws_ptr, int num_rows, int hidden_size, int inter_size, int num_experts, int k); + void configure_ws_ptrs(char* ws_ptr, size_t num_rows, size_t hidden_size, size_t inter_size, size_t num_experts, + size_t k); private: CubKeyValueSorter sorter_; @@ -152,12 +157,17 @@ class CutlassMoeFCRunner { int64_t* total_rows_before_expert_; T* fc1_result_; + T* fc3_result_; + + bool has_fc3_; + bool normalize_routing_weights_; // Cuda events contrib::cuda::AutoDestoryCudaEvent cuda_event_; int64_t total_past_rows_; int64_t total_covered_rows_; + // TODO: use pinned memory std::vector total_rows_before_expert_host_; }; @@ -165,11 +175,11 @@ class CutlassMoeFCRunner { template class CutlassMoeFCRunner::value>> { public: - CutlassMoeFCRunner(int sm_version); + CutlassMoeFCRunner(int sm_version, bool has_fc3, bool normalize_routing_weights); - size_t getWorkspaceSize(int num_rows, int hidden_size, int inter_size, int num_experts, int k) { + size_t getWorkspaceSize(size_t num_rows, size_t hidden_size, size_t inter_size, size_t num_experts, size_t k) { return 0; } }; -} // namespace ort_fastertransformer \ No newline at end of file +} // namespace ort_fastertransformer diff --git a/onnxruntime/contrib_ops/cuda/moe/moe.cc b/onnxruntime/contrib_ops/cuda/moe/moe.cc index 3f26a274109ad..b13aab959fc48 100644 --- a/onnxruntime/contrib_ops/cuda/moe/moe.cc +++ b/onnxruntime/contrib_ops/cuda/moe/moe.cc @@ -39,13 +39,16 @@ Status MoE::ComputeInternal(OpKernelContext* context) const { const Tensor* input = context->Input(0); const Tensor* router_probs = context->Input(1); const Tensor* fc1_experts_weights = context->Input(2); - const Tensor* fc2_experts_weights = context->Input(3); - const Tensor* fc1_experts_bias_optional = context->Input(4); + const Tensor* fc1_experts_bias_optional = context->Input(3); + const Tensor* fc2_experts_weights = context->Input(4); const Tensor* fc2_experts_bias_optional = context->Input(5); + const Tensor* fc3_experts_weights_optional = context->Input(6); + const Tensor* fc3_experts_bias_optional = context->Input(7); MoEParameters moe_params; - ORT_RETURN_IF_ERROR(CheckInputs(moe_params, input, router_probs, fc1_experts_weights, fc2_experts_weights, - fc1_experts_bias_optional, fc2_experts_bias_optional)); + ORT_RETURN_IF_ERROR(CheckInputs(moe_params, input, router_probs, fc1_experts_weights, fc1_experts_bias_optional, + fc2_experts_weights, fc2_experts_bias_optional, fc3_experts_weights_optional, + fc3_experts_bias_optional)); typedef typename ToCudaType::MappedType CudaT; auto stream = context->GetComputeStream(); @@ -53,12 +56,14 @@ Status MoE::ComputeInternal(OpKernelContext* context) const { auto& device_prop = GetDeviceProp(); const int sm = device_prop.major * 10 + device_prop.minor; - ort_fastertransformer::CutlassMoeFCRunner moe_runner(sm); + ort_fastertransformer::CutlassMoeFCRunner moe_runner(sm, + fc3_experts_weights_optional != nullptr, + normalize_routing_weights_); size_t ws_size = - 
moe_runner.getWorkspaceSize(static_cast(moe_params.num_rows), static_cast(moe_params.hidden_size), - static_cast(moe_params.inter_size), static_cast(moe_params.num_experts), - static_cast(k_)); + moe_runner.getWorkspaceSize(static_cast(moe_params.num_rows), static_cast(moe_params.hidden_size), + static_cast(moe_params.inter_size), + static_cast(moe_params.num_experts), static_cast(k_)); size_t fc2_output_size = k_ * moe_params.num_rows * moe_params.hidden_size * sizeof(CudaT); size_t expert_scales_size = k_ * moe_params.num_rows * sizeof(CudaT); size_t expanded_source_row_to_expanded_dest_row_size = k_ * moe_params.num_rows * sizeof(int); @@ -77,26 +82,37 @@ Status MoE::ComputeInternal(OpKernelContext* context) const { IAllocatorUniquePtr expert_for_source_row = IAllocator::MakeUniquePtr(allocator, expert_for_source_row_size, false, stream); - // fc1_scales and fc2_scales are used in quantized MoE - const CudaT* fc1_scales_ptr = nullptr; - const CudaT* fc2_scales_ptr = nullptr; - + const CudaT* fc_scales_ptr = nullptr; moe_runner.run_moe_fc(reinterpret_cast(input->template Data()), reinterpret_cast(router_probs->template Data()), - reinterpret_cast(fc1_experts_weights->template Data()), - std::move(fc1_scales_ptr), + reinterpret_cast(fc1_experts_weights->DataRaw()), + fc_scales_ptr, fc1_experts_bias_optional == nullptr ? nullptr : reinterpret_cast(fc1_experts_bias_optional->template Data()), - activation_type_, reinterpret_cast(fc2_experts_weights->template Data()), - std::move(fc2_scales_ptr), static_cast(moe_params.num_rows), - static_cast(moe_params.hidden_size), static_cast(moe_params.inter_size), - static_cast(moe_params.num_experts), static_cast(moe_params.local_num_experts), - 0 /*local_experts_start_index_ used in sharded MoE*/, static_cast(k_), - reinterpret_cast(work_space.get()), reinterpret_cast(fc2_output.get()), + activation_type_, + fc3_experts_weights_optional == nullptr + ? nullptr + : reinterpret_cast(fc3_experts_weights_optional->DataRaw()), + fc_scales_ptr, + fc3_experts_bias_optional == nullptr + ? 
nullptr + : reinterpret_cast(fc3_experts_bias_optional->template Data()), + reinterpret_cast(fc2_experts_weights->DataRaw()), + fc_scales_ptr, + static_cast(moe_params.num_rows), + static_cast(moe_params.hidden_size), + static_cast(moe_params.inter_size), + static_cast(moe_params.num_experts), + static_cast(moe_params.local_num_experts), + 0 /*local_experts_start_index_ used in sharded MoE*/, + static_cast(k_), + reinterpret_cast(work_space.get()), + reinterpret_cast(fc2_output.get()), reinterpret_cast(expert_scales.get()), reinterpret_cast(expanded_source_row_to_expanded_dest_row.get()), - reinterpret_cast(expert_for_source_row.get()), Stream(context)); + reinterpret_cast(expert_for_source_row.get()), + Stream(context)); Tensor* output = context->Output(0, input->Shape()); diff --git a/onnxruntime/contrib_ops/cuda/moe/moe_base.h b/onnxruntime/contrib_ops/cuda/moe/moe_base.h index f55a7cde2e208..84a5e8c7c120d 100644 --- a/onnxruntime/contrib_ops/cuda/moe/moe_base.h +++ b/onnxruntime/contrib_ops/cuda/moe/moe_base.h @@ -13,16 +13,22 @@ namespace cuda { enum class MoEParallelType { None = 0, - ExpertSlicing = 1, + EP = 1, + TP = 2, + EPAndTP = 3, }; struct MoEParameters { + MoEParameters() {} + explicit MoEParameters(int64_t tensor_shards) : tensor_shards(tensor_shards) {} int64_t num_rows; int64_t num_experts; int64_t local_num_experts; int64_t hidden_size; int64_t inter_size; + MoEParallelType parallel_type; + int64_t tensor_shards{1}; }; class MoEBase { @@ -31,9 +37,11 @@ class MoEBase { const Tensor* input, const Tensor* router_probs, const Tensor* fc1_experts_weights, - const Tensor* fc2_experts_weights, const Tensor* fc1_experts_bias_optional, - const Tensor* fc2_experts_bias_optional) const { + const Tensor* fc2_experts_weights, + const Tensor* fc2_experts_bias_optional, + const Tensor* fc3_experts_weights_optional, + const Tensor* fc3_experts_bias_optional) const { const auto& input_dims = input->Shape().GetDims(); const auto& router_probs_dims = router_probs->Shape().GetDims(); const auto& fc1_experts_weights_dims = fc1_experts_weights->Shape().GetDims(); @@ -83,12 +91,6 @@ class MoEBase { return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "router_probs_dims[0] must be equal to num_rows, got ", router_probs_dims[0], " and ", num_rows); } - if (fc1_experts_bias_optional != nullptr && fc2_experts_bias_optional == nullptr) { - return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "fc1_experts_bias is set but fc2_experts_bias is not set"); - } - if (fc1_experts_bias_optional == nullptr && fc2_experts_bias_optional != nullptr) { - return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "fc1_experts_bias is not set but fc2_experts_bias is set"); - } if (fc1_experts_bias_optional != nullptr && fc2_experts_bias_optional != nullptr) { const auto& fc1_experts_bias_dims = fc1_experts_bias_optional->Shape().GetDims(); const auto& fc2_experts_bias_dims = fc2_experts_bias_optional->Shape().GetDims(); @@ -126,15 +128,38 @@ class MoEBase { } } + if (fc3_experts_weights_optional != nullptr && + fc3_experts_weights_optional->Shape().GetDims() != fc1_experts_weights_dims) { + return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, + "fc3_experts_weights_dims must be equal to fc1_experts_weights_dims, got ", + fc3_experts_weights_optional->Shape().GetDims(), " and ", fc1_experts_weights_dims); + } + + if (fc3_experts_bias_optional != nullptr && fc1_experts_bias_optional != nullptr && + fc3_experts_bias_optional->Shape().GetDims() != fc1_experts_bias_optional->Shape().GetDims()) { + return 
ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, + "fc3_experts_bias_dims must be equal to fc1_experts_bias_dims, got ", + fc3_experts_bias_optional->Shape().GetDims(), " and ", + fc1_experts_bias_optional->Shape().GetDims()); + } + parameters.num_rows = num_rows; parameters.num_experts = num_experts; parameters.local_num_experts = local_num_experts; parameters.hidden_size = hidden_size; parameters.inter_size = inter_size; if (num_experts == local_num_experts) { - parameters.parallel_type = MoEParallelType::None; + if (parameters.tensor_shards == 1) { + parameters.parallel_type = MoEParallelType::None; + } else { + parameters.parallel_type = MoEParallelType::TP; + } } else if (num_experts > local_num_experts) { - parameters.parallel_type = MoEParallelType::ExpertSlicing; + if (parameters.tensor_shards == 1) { + parameters.parallel_type = MoEParallelType::EP; + } else { + parameters.parallel_type = MoEParallelType::EPAndTP; + } } else { return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "num_experts must be greater than or equal to local_num_experts, got ", @@ -161,8 +186,11 @@ class MoEBase { } else { ORT_THROW("Unsupported MoE activation type: ", activation_type_str); } + + normalize_routing_weights_ = op_kernel_info.GetAttrOrDefault("normalize_routing_weights", 0) == 1; } + bool normalize_routing_weights_; int64_t k_; ort_fastertransformer::ActivationType activation_type_; }; diff --git a/onnxruntime/contrib_ops/rocm/rocm_contrib_kernels.cc b/onnxruntime/contrib_ops/rocm/rocm_contrib_kernels.cc index 382a3951f3a83..e19a976f3141c 100644 --- a/onnxruntime/contrib_ops/rocm/rocm_contrib_kernels.cc +++ b/onnxruntime/contrib_ops/rocm/rocm_contrib_kernels.cc @@ -151,7 +151,7 @@ class ONNX_OPERATOR_KERNEL_CLASS_NAME(kRocmExecutionProvider, kPytorchAtenDomain class ONNX_OPERATOR_KERNEL_CLASS_NAME(kRocmExecutionProvider, kMSDomain, 1, ShrunkenGather); #endif -#if defined(USE_MPI) && defined(ORT_USE_NCCL) +#ifdef ORT_USE_NCCL class ONNX_OPERATOR_KERNEL_CLASS_NAME(kRocmExecutionProvider, kMSDomain, 1, AllReduce); class ONNX_OPERATOR_KERNEL_CLASS_NAME(kRocmExecutionProvider, kMSDomain, 1, AllGather); class ONNX_OPERATOR_KERNEL_CLASS_NAME(kRocmExecutionProvider, kMSDomain, 1, AllToAll); @@ -311,7 +311,7 @@ Status RegisterRocmContribKernels(KernelRegistry& kernel_registry) { BuildKernelCreateInfo, #endif -#if defined(USE_MPI) && defined(ORT_USE_NCCL) +#ifdef ORT_USE_NCCL BuildKernelCreateInfo, BuildKernelCreateInfo, BuildKernelCreateInfo, diff --git a/onnxruntime/core/graph/contrib_ops/collective_defs.cc b/onnxruntime/core/graph/contrib_ops/collective_defs.cc index 4aa43f5de1cd5..a0ca2e45f153a 100644 --- a/onnxruntime/core/graph/contrib_ops/collective_defs.cc +++ b/onnxruntime/core/graph/contrib_ops/collective_defs.cc @@ -91,10 +91,18 @@ void RegisterCollectiveOps() { "Number of top experts to select from expert pool", AttributeProto::INT, static_cast(1)) + .Attr("normalize_routing_weights", + "Whether to normalize routing weights", + AttributeProto::INT, + static_cast(0)) .Attr("local_experts_start_index", "The start index of local experts", AttributeProto::INT, - static_cast(-1)) + static_cast(0)) + .Attr("tensor_shards", + "Tensor parallelism config. 
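CheckInputs now classifies the parallel setup from two facts: whether all experts are local (num_experts == local_num_experts) and whether each expert's weights are split across tensor_shards, giving None, TP, EP, or EPAndTP. The same decision table as a standalone sketch; classify_parallelism is an illustrative name.

#include <cstdint>
#include <stdexcept>

enum class MoEParallelType { None = 0, EP = 1, TP = 2, EPAndTP = 3 };

// Sketch of the classification in MoEBase::CheckInputs: expert parallelism (EP)
// when only a subset of experts is local, tensor parallelism (TP) when each
// expert's weights are sharded, and EPAndTP when both hold.
MoEParallelType classify_parallelism(int64_t num_experts, int64_t local_num_experts,
                                     int64_t tensor_shards) {
  if (num_experts < local_num_experts)
    throw std::invalid_argument("num_experts must be >= local_num_experts");
  const bool expert_parallel = num_experts > local_num_experts;
  const bool tensor_parallel = tensor_shards > 1;
  if (expert_parallel && tensor_parallel) return MoEParallelType::EPAndTP;
  if (expert_parallel) return MoEParallelType::EP;
  if (tensor_parallel) return MoEParallelType::TP;
  return MoEParallelType::None;
}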
The number of shards for each expert weight and bias", + AttributeProto::INT, + static_cast(1)) .Input(0, "input", "2D input tensor with shape (num_rows, hidden_size) or " @@ -106,22 +114,32 @@ void RegisterCollectiveOps() { "T") .Input(2, "fc1_experts_weights", - "3D input tensor with shape (local_num_experts, hidden_size, inter_size)", + "3D input tensor with shape (local_num_experts, hidden_size, local_inter_size)", "T") .Input(3, - "fc2_experts_weights", - "3D input tensor with shape (local_num_experts, inter_size, hidden_size)", - "T") - .Input(4, "fc1_experts_bias", - "2D optional input tensor with shape (local_num_experts, inter_size)", + "2D optional input tensor with shape (local_num_experts, local_inter_size)", "T", OpSchema::Optional) + .Input(4, + "fc2_experts_weights", + "3D input tensor with shape (local_num_experts, local_inter_size, hidden_size)", + "T") .Input(5, "fc2_experts_bias", "2D optional input tensor with shape (num_experts, hidden_size)", "T", OpSchema::Optional) + .Input(6, + "fc3_experts_weights", + "3D optional input tensor with shape (local_num_experts, hidden_size, local_inter_size)", + "T", + OpSchema::Optional) + .Input(7, + "fc3_experts_bias", + "2D optional input tensor with shape (local_num_experts, local_inter_size)", + "T", + OpSchema::Optional) .Output(0, "output", "2D input tensor with shape (num_rows, hidden_size) or " diff --git a/onnxruntime/core/graph/contrib_ops/contrib_defs.cc b/onnxruntime/core/graph/contrib_ops/contrib_defs.cc index bba1a073953e7..5fadce341b843 100644 --- a/onnxruntime/core/graph/contrib_ops/contrib_defs.cc +++ b/onnxruntime/core/graph/contrib_ops/contrib_defs.cc @@ -1385,8 +1385,8 @@ ONNX_MS_OPERATOR_SET_SCHEMA(Sampling, 1, constexpr const char* MoE_ver1_doc = R"DOC( Mixture of experts. Examples: Switch transformer(https://arxiv.org/pdf/2101.03961.pdf) use top 1, - GLaM(https://arxiv.org/abs/2112.06905) activates top 2 FFN, and Vision MOE(https://arxiv.org/pdf/2106.05974.pdf) - usually uses top 32 experts. + GLaM(https://arxiv.org/abs/2112.06905) activates top 2 FFN, Vision MOE(https://arxiv.org/pdf/2106.05974.pdf) + usually uses top 32 experts and Mixtral(https://huggingface.co/blog/mixtral). )DOC"; ONNX_MS_OPERATOR_SET_SCHEMA(MoE, 1, @@ -1394,12 +1394,15 @@ ONNX_MS_OPERATOR_SET_SCHEMA(MoE, 1, .SetDoc(MoE_ver1_doc) .Attr("activation_type", "Activation function to use. Choose from relu, gelu, silu and identity. 
Default is relu", AttributeProto::STRING, std::string("relu")) .Attr("k", "Number of top experts to select from expert pool", AttributeProto::INT, static_cast(1)) + .Attr("normalize_routing_weights", "Whether to normalize routing weights", AttributeProto::INT, static_cast(0)) .Input(0, "input", "2D input tensor with shape (num_rows, hidden_size) or 3D input tensor with shape (batch_size, sequence_length, hidden_size)", "T") .Input(1, "router_probs", "2D input tensor with shape (num_rows, num_experts)", "T") .Input(2, "fc1_experts_weights", "3D input tensor with shape (num_experts, hidden_size, inter_size)", "T") - .Input(3, "fc2_experts_weights", "3D input tensor with shape (num_experts, inter_size, hidden_size)", "T") - .Input(4, "fc1_experts_bias", "2D optional input tensor with shape (num_experts, inter_size)", "T", OpSchema::Optional) + .Input(3, "fc1_experts_bias", "2D optional input tensor with shape (num_experts, inter_size)", "T", OpSchema::Optional) + .Input(4, "fc2_experts_weights", "3D input tensor with shape (num_experts, inter_size, hidden_size)", "T") .Input(5, "fc2_experts_bias", "2D optional input tensor with shape (num_experts, hidden_size)", "T", OpSchema::Optional) + .Input(6, "fc3_experts_weights", "3D optional input tensor with shape (num_experts, hidden_size, inter_size)", "T", OpSchema::Optional) + .Input(7, "fc3_experts_bias", "2D optional input tensor with shape (num_experts, inter_size)", "T", OpSchema::Optional) .Output(0, "output", "2D input tensor with shape (num_rows, hidden_size) or 3D input tensor with shape (batch_size, sequence_length, hidden_size)", "T") .TypeConstraint("T", {"tensor(float)", "tensor(float16)"}, "Constrain input and output types to float or float16 tensors.") .TypeAndShapeInferenceFunction(ONNX_NAMESPACE::propagateShapeAndTypeFromFirstInput)); diff --git a/onnxruntime/core/providers/cpu/sequence/sequence_ops.cc b/onnxruntime/core/providers/cpu/sequence/sequence_ops.cc index 8064bc0a58cb1..2913f4ac32b6e 100644 --- a/onnxruntime/core/providers/cpu/sequence/sequence_ops.cc +++ b/onnxruntime/core/providers/cpu/sequence/sequence_ops.cc @@ -453,7 +453,7 @@ Status SplitToSequence::ComputeImpl(OpKernelContext& context, const Tensor& inpu int num_remaining_splits = 0; InlinedVector split_sizes; const bool is_string_type = input.IsDataTypeString(); - const size_t element_size = (is_string_type) ? 0U : input.DataType()->Size(); + const size_t element_size = input.DataType()->Size(); // figure out split_scalar or split_sizes if (p_split_input) { diff --git a/onnxruntime/core/providers/cuda/cu_inc/common.cuh b/onnxruntime/core/providers/cuda/cu_inc/common.cuh index bed2f677166d6..052dd05574ab1 100644 --- a/onnxruntime/core/providers/cuda/cu_inc/common.cuh +++ b/onnxruntime/core/providers/cuda/cu_inc/common.cuh @@ -5,7 +5,9 @@ #include #include #include +#include #include +#include #include #include #include "core/providers/cuda/cuda_common.h" @@ -345,9 +347,29 @@ __device__ __inline__ half _Pow(half a, half b) { return half(powf((float)a, (fl template __device__ __inline__ T _Min(T a, T b) { return a < b ? a : b; } +template <> +__device__ __inline__ float _Min(float a, float b) { + return (isnan(a) || isnan(b)) ? std::numeric_limits::quiet_NaN() : ( a < b ? a : b ); +} + +template <> +__device__ __inline__ double _Min(double a, double b) { + return (isnan(a) || isnan(b)) ? std::numeric_limits::quiet_NaN() : ( a < b ? a : b ); +} + template __device__ __inline__ T _Max(T a, T b) { return a > b ? 
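The _Min specializations added here (and the matching _Max ones just below) make the element-wise min/max propagate NaN: if either operand is NaN the result is NaN, rather than whichever side the bare comparison happens to pick. A host-side equivalent for reference, using illustrative names.

#include <cmath>
#include <limits>

// Host-side reference of the NaN-propagating semantics: a plain (a < b ? a : b)
// with a NaN operand silently picks one side depending on argument order.
inline float nan_propagating_min(float a, float b) {
  return (std::isnan(a) || std::isnan(b)) ? std::numeric_limits<float>::quiet_NaN()
                                          : (a < b ? a : b);
}

inline float nan_propagating_max(float a, float b) {
  return (std::isnan(a) || std::isnan(b)) ? std::numeric_limits<float>::quiet_NaN()
                                          : (a > b ? a : b);
}

// Example: nan_propagating_min(NAN, 1.f) and nan_propagating_min(1.f, NAN) both
// return NaN, while (a < b ? a : b) would return 1.f or NaN depending on order.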
a : b; } +template <> +__device__ __inline__ float _Max(float a, float b) { + return (isnan(a) || isnan(b)) ? std::numeric_limits::quiet_NaN() : ( a > b ? a : b ); +} + +template <> +__device__ __inline__ double _Max(double a, double b) { + return (isnan(a) || isnan(b)) ? std::numeric_limits::quiet_NaN() : ( a > b ? a : b ); +} + template __device__ __inline__ T _Abs(T a) { return a > (T)0 ? a : -a; } @@ -543,7 +565,7 @@ struct _IsNan { template <> struct _IsNan { __device__ __inline__ bool operator()(half a) const { - return static_cast(*reinterpret_cast(&a) & ~MLFloat16::kSignMask) + return static_cast(*reinterpret_cast(&a) & ~MLFloat16::kSignMask) > MLFloat16::kPositiveInfinityBits; } }; @@ -551,7 +573,7 @@ struct _IsNan { template <> struct _IsNan { __device__ __inline__ bool operator()(BFloat16 a) const { - return static_cast(*reinterpret_cast(&a) & ~BFloat16::kSignMask) + return static_cast(*reinterpret_cast(&a) & ~BFloat16::kSignMask) > BFloat16::kPositiveInfinityBits; } }; diff --git a/onnxruntime/core/providers/cuda/cuda_nhwc_kernels.cc b/onnxruntime/core/providers/cuda/cuda_nhwc_kernels.cc index 8fdcaacdb0f29..7afd2d430ec46 100644 --- a/onnxruntime/core/providers/cuda/cuda_nhwc_kernels.cc +++ b/onnxruntime/core/providers/cuda/cuda_nhwc_kernels.cc @@ -74,6 +74,8 @@ class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kM MaxPool); class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSInternalNHWCDomain, 12, float, MaxPool); class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSInternalNHWCDomain, 12, MLFloat16, MaxPool); +class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSInternalNHWCDomain, 12, int8_t, MaxPool); +class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSInternalNHWCDomain, 12, uint8_t, MaxPool); class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSInternalNHWCDomain, 14, 14, float, BatchNormalization); class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSInternalNHWCDomain, 14, 14, double, @@ -165,6 +167,7 @@ Status RegisterCudaNhwcKernels(KernelRegistry& kernel_registry) { kCudaExecutionProvider, kMSInternalNHWCDomain, 10, 10, float, MaxPool)>, BuildKernelCreateInfo, + BuildKernelCreateInfo, BuildKernelCreateInfo, BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, BuildKernelCreateInfo, BuildKernelCreateInfo input_dims, cudnnDataType_t dat TensorPitches pitches(input_dims); InlinedVector dims(rank); InlinedVector strides(rank); - for (int i = 0; i < rank; i++) { - dims[i] = gsl::narrow_cast(input_dims[i]); - strides[i] = gsl::narrow_cast(pitches[i]); - } - if (is_nhwc) { - std::swap(dims[1], dims[rank - 1]); - std::swap(strides[1], strides[rank - 1]); + + if (!is_nhwc) { + for (int i = 0; i < rank; i++) { + dims[i] = gsl::narrow_cast(input_dims[i]); + strides[i] = gsl::narrow_cast(pitches[i]); + } + } else { + // NHWDC <-> NCHWD + + // N + dims[0] = gsl::narrow_cast(input_dims[0]); + strides[0] = gsl::narrow_cast(pitches[0]); + + // HWD + for (int i = 1; i < rank - 1; i++) { + dims[i + 1] = gsl::narrow_cast(input_dims[i]); + strides[i + 1] = gsl::narrow_cast(pitches[i]); + } + + // C + dims[1] = gsl::narrow_cast(input_dims[rank - 1]); + strides[1] = gsl::narrow_cast(pitches[rank - 1]); } CUDNN_RETURN_IF_ERROR(cudnnSetTensorNdDescriptor(tensor_, dataType, static_cast(rank), dims.data(), strides.data())); return Status::OK(); diff --git a/onnxruntime/core/providers/cuda/nn/max_pool_with_index.cu 
b/onnxruntime/core/providers/cuda/nn/max_pool_with_index.cu index ef1155af127d1..9311f044f4ec5 100644 --- a/onnxruntime/core/providers/cuda/nn/max_pool_with_index.cu +++ b/onnxruntime/core/providers/cuda/nn/max_pool_with_index.cu @@ -7,10 +7,11 @@ #include "core/providers/cuda/cu_inc/common.cuh" #include "core/providers/cuda/shared_inc/fast_divmod.h" +#include "core/providers/cuda/shared_inc/cuda_utils.h" namespace onnxruntime { namespace cuda { -template +template __global__ void MaxPoolWithIndexKernel( int64_t batch, int64_t channels, @@ -44,11 +45,27 @@ __global__ void MaxPoolWithIndexKernel( int id = blockIdx.x * blockDim.x + threadIdx.x; if (id >= output_size) return; + auto compute_offset = + [height, width, depth, channels](int n_index, int c_index, int h_index, int w_index, int d_index) -> int64_t { + if constexpr (Layout == LAYOUT_NCHW) { + return (((n_index * channels + c_index) * height + h_index) * width + w_index) * depth + d_index; + } else if constexpr (Layout == LAYOUT_NHWC) { + return (((n_index * height + h_index) * width + w_index) * depth + d_index) * channels + c_index; + } + }; + int d_index, w_index, h_index, c_index, n_index, id_tmp; - fdm_d.divmod(id, id_tmp, d_index); - fdm_w.divmod(id_tmp, id_tmp, w_index); - fdm_h.divmod(id_tmp, id_tmp, h_index); - fdm_c.divmod(id_tmp, n_index, c_index); + if constexpr (Layout == LAYOUT_NCHW) { + fdm_d.divmod(id, id_tmp, d_index); + fdm_w.divmod(id_tmp, id_tmp, w_index); + fdm_h.divmod(id_tmp, id_tmp, h_index); + fdm_c.divmod(id_tmp, n_index, c_index); + } else if constexpr (Layout == LAYOUT_NHWC) { + fdm_c.divmod(id, id_tmp, c_index); + fdm_d.divmod(id_tmp, id_tmp, d_index); + fdm_w.divmod(id_tmp, id_tmp, w_index); + fdm_h.divmod(id_tmp, n_index, h_index); + } int64_t d_start = d_index * stride_d - pad_d; int64_t w_start = w_index * stride_w - pad_w; @@ -64,29 +81,45 @@ __global__ void MaxPoolWithIndexKernel( int64_t d_index_max = -1; int64_t w_index_max = -1; int64_t h_index_max = -1; - int64_t offset = (n_index * channels + c_index) * height * width * depth; + int64_t offset = compute_offset(n_index, c_index, 0, 0, 0); const T* p_slice = p_input + offset; - T maxval = p_slice[h_start * width * depth + w_start * depth + d_start] - (T)1; + T maxval = p_slice[compute_offset(0, 0, h_start, w_start, d_start)] - (T)1; for (int64_t d = d_start; d < d_end; d += dilation_d) { for (int64_t w = w_start; w < w_end; w += dilation_w) { for (int64_t h = h_start; h < h_end; h += dilation_h) { - if (p_slice[h * width * depth + w * depth + d] > maxval) { + auto pool_offset = compute_offset(0, 0, h, w, d); + if (p_slice[pool_offset] > maxval) { h_index_max = h; w_index_max = w; d_index_max = d; - maxval = static_cast(p_slice[h * width * depth + w * depth + d]); + maxval = static_cast(p_slice[pool_offset]); } } } } - p_output[id] = p_input[offset + h_index_max * width * depth + w_index_max * depth + d_index_max]; + p_output[id] = p_input[offset + compute_offset(0, 0, h_index_max, w_index_max, d_index_max)]; + if (p_indices) { - p_indices[id] = storage_order == 0 ? offset + h_index_max * width * depth + w_index_max * depth + d_index_max - : offset + h_index_max + w_index_max * height + d_index_max * width * height; + if constexpr (Layout == LAYOUT_NCHW) { + p_indices[id] = storage_order == 0 ? 
offset + h_index_max * width * depth + w_index_max * depth + d_index_max + : offset + h_index_max + w_index_max * height + d_index_max * width * height; + } else if constexpr (Layout == LAYOUT_NHWC) { + // The tests currently have to be provided in NHWC layout so that tests do not fail. When converting between + // layouts, does it make sense to do an index conversion as well? + // Storing indices in NHWC layout isn't critical as they are supposed to be used by Unpooling operations + // which currently assume that indices reference to Tensors in NHWC layout. + int64_t id_nchw = + (((n_index * channels + c_index) * pooled_height + h_index) * pooled_width + w_index) * pooled_depth + d_index; + int64_t offset_nchw = (n_index * channels + c_index) * width * height * depth; + + p_indices[id_nchw] = (storage_order == 0) + ? offset_nchw + h_index_max * width * depth + w_index_max * depth + d_index_max + : offset_nchw + h_index_max + w_index_max * height + d_index_max * width * height; + } } } -template +template void MaxPoolWithIndex( cudaStream_t stream, const TensorShape& input_shape, @@ -99,14 +132,29 @@ void MaxPoolWithIndex( const T* p_input, T* p_output, int64_t* p_indices) { - int64_t batchs = input_shape[0]; - int64_t channels = input_shape[1]; - int64_t height = input_shape[2]; - int64_t width = kernel_shape.size() > 1 ? input_shape[3] : 1; - int64_t depth = kernel_shape.size() > 2 ? input_shape[4] : 1; - int64_t pooled_height = output_shape[2]; - int64_t pooled_width = kernel_shape.size() > 1 ? output_shape[3] : 1; - int64_t pooled_depth = kernel_shape.size() > 2 ? output_shape[4] : 1; + int64_t batchs, channels, height, width, depth; + int64_t pooled_height, pooled_width, pooled_depth; + if constexpr (Layout == LAYOUT_NCHW) { + batchs = input_shape[0]; + channels = input_shape[1]; + height = input_shape[2]; + width = kernel_shape.size() > 1 ? input_shape[3] : 1; + depth = kernel_shape.size() > 2 ? input_shape[4] : 1; + + pooled_height = output_shape[2]; + pooled_width = kernel_shape.size() > 1 ? output_shape[3] : 1; + pooled_depth = kernel_shape.size() > 2 ? output_shape[4] : 1; + } else if constexpr (Layout == LAYOUT_NHWC) { + batchs = input_shape[0]; + height = input_shape[1]; + width = kernel_shape.size() > 1 ? input_shape[2] : 1; + depth = kernel_shape.size() > 2 ? input_shape[3] : 1; + channels = input_shape[input_shape.NumDimensions() - 1]; + + pooled_height = output_shape[1]; + pooled_width = kernel_shape.size() > 1 ? output_shape[2] : 1; + pooled_depth = kernel_shape.size() > 2 ? output_shape[3] : 1; + } int64_t kernel_h = kernel_shape[0]; int64_t kernel_w = kernel_shape.size() > 1 ? kernel_shape[1] : 1; int64_t kernel_d = kernel_shape.size() > 2 ? 
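MaxPoolWithIndexKernel is now templated on Layout, and every flat offset comes from one of two linearizations: NCHW keeps the channel ahead of the spatial dims, NHWC keeps it last; the compute_offset lambda and the NCHW-style index written out for NHWC inputs both reduce to these formulas. The two linearizations in isolation; flat_offset is an illustrative name.

#include <cstdint>

enum Layout { LAYOUT_NCHW = 0, LAYOUT_NHWC = 1 };

// Flat offset of element (n, c, h, w, d) in a 5-D pooling tensor, matching the
// compute_offset lambda in the kernel above (reference only).
template <Layout L>
int64_t flat_offset(int64_t n, int64_t c, int64_t h, int64_t w, int64_t d,
                    int64_t channels, int64_t height, int64_t width, int64_t depth) {
  if constexpr (L == LAYOUT_NCHW) {
    return (((n * channels + c) * height + h) * width + w) * depth + d;
  } else {  // LAYOUT_NHWC: channel is the fastest-varying dimension
    return (((n * height + h) * width + w) * depth + d) * channels + c;
  }
}

// Usage: the same logical element lands at two different flat positions, e.g.
// flat_offset<LAYOUT_NCHW>(0, 2, 1, 0, 0, C, H, W, D) vs
// flat_offset<LAYOUT_NHWC>(0, 2, 1, 0, 0, C, H, W, D).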
kernel_shape[2] : 1; @@ -130,7 +178,7 @@ void MaxPoolWithIndex( fast_divmod fdm_d(static_cast(pooled_depth)); int blocksPerGrid = (int)((output_size + GridDim::maxThreadsPerBlock - 1) / GridDim::maxThreadsPerBlock); - MaxPoolWithIndexKernel<<>>( + MaxPoolWithIndexKernel<<>>( batchs, channels, height, @@ -162,8 +210,8 @@ void MaxPoolWithIndex( p_indices); } -#define INSTANTIATEMAXPOOLWITHINDEX(T) \ - template void MaxPoolWithIndex( \ +#define INSTANTIATEMAXPOOLWITHINDEX(T, Layout) \ + template void MaxPoolWithIndex( \ cudaStream_t stream, \ const TensorShape& input_shape, \ const TensorShape& output_shape, \ @@ -176,11 +224,19 @@ void MaxPoolWithIndex( T* p_output, \ int64_t* p_indices); -INSTANTIATEMAXPOOLWITHINDEX(float) -INSTANTIATEMAXPOOLWITHINDEX(double) -INSTANTIATEMAXPOOLWITHINDEX(half) -INSTANTIATEMAXPOOLWITHINDEX(int8_t) -INSTANTIATEMAXPOOLWITHINDEX(uint8_t) +INSTANTIATEMAXPOOLWITHINDEX(float, LAYOUT_NCHW) +INSTANTIATEMAXPOOLWITHINDEX(double, LAYOUT_NCHW) +INSTANTIATEMAXPOOLWITHINDEX(half, LAYOUT_NCHW) +INSTANTIATEMAXPOOLWITHINDEX(int8_t, LAYOUT_NCHW) +INSTANTIATEMAXPOOLWITHINDEX(uint8_t, LAYOUT_NCHW) + +#ifdef ENABLE_CUDA_NHWC_OPS +INSTANTIATEMAXPOOLWITHINDEX(float, LAYOUT_NHWC) +INSTANTIATEMAXPOOLWITHINDEX(double, LAYOUT_NHWC) +INSTANTIATEMAXPOOLWITHINDEX(half, LAYOUT_NHWC) +INSTANTIATEMAXPOOLWITHINDEX(int8_t, LAYOUT_NHWC) +INSTANTIATEMAXPOOLWITHINDEX(uint8_t, LAYOUT_NHWC) +#endif } // namespace cuda } // namespace onnxruntime diff --git a/onnxruntime/core/providers/cuda/nn/max_pool_with_index.h b/onnxruntime/core/providers/cuda/nn/max_pool_with_index.h index 27f5b241cc785..98f14c3f6a626 100644 --- a/onnxruntime/core/providers/cuda/nn/max_pool_with_index.h +++ b/onnxruntime/core/providers/cuda/nn/max_pool_with_index.h @@ -7,7 +7,7 @@ namespace onnxruntime { namespace cuda { -template +template void MaxPoolWithIndex( cudaStream_t stream, const TensorShape& input_shape, diff --git a/onnxruntime/core/providers/cuda/nn/pool.cc b/onnxruntime/core/providers/cuda/nn/pool.cc index 8bc96958693bc..4acdcfcf35491 100644 --- a/onnxruntime/core/providers/cuda/nn/pool.cc +++ b/onnxruntime/core/providers/cuda/nn/pool.cc @@ -87,6 +87,8 @@ POOLING_KERNEL_VERSIONED_WITH_INDICES(MaxPool, float, MaxPool<8>, 11, 11, kMSInt POOLING_KERNEL_VERSIONED_WITH_INDICES(MaxPool, MLFloat16, MaxPool<8>, 11, 11, kMSInternalNHWCDomain, true) POOLING_KERNEL_WITH_INDICES(MaxPool, float, MaxPool<8>, 12, kMSInternalNHWCDomain, true) POOLING_KERNEL_WITH_INDICES(MaxPool, MLFloat16, MaxPool<8>, 12, kMSInternalNHWCDomain, true) +POOLING_KERNEL_WITH_INDICES(MaxPool, int8_t, MaxPool<8>, 12, kMSInternalNHWCDomain, true) +POOLING_KERNEL_WITH_INDICES(MaxPool, uint8_t, MaxPool<8>, 12, kMSInternalNHWCDomain, true) POOLING_KERNEL(GlobalMaxPool, float, MaxPool<1>, 1, kMSInternalNHWCDomain, true) POOLING_KERNEL(GlobalMaxPool, MLFloat16, MaxPool<1>, 1, kMSInternalNHWCDomain, true) @@ -145,8 +147,8 @@ class CudnnPoolingDescriptor final { cudnnPoolingDescriptor_t desc_; }; -template -Status Pool::ComputeInternal(OpKernelContext* context) const { +template +Status Pool::ComputeInternal(OpKernelContext* context) const { typedef typename ToCudaType::MappedType CudaT; const Tensor* X = context->Input(0); const TensorShape& x_shape = X->Shape(); @@ -157,16 +159,21 @@ Status Pool::ComputeInternal(OpKernelContext* context) const } auto kernel_shape = pool_attrs_.kernel_shape; - auto pads = pool_attrs_.pads; auto strides = pool_attrs_.strides; + TensorShapeVector pads = pool_attrs_.pads; if (pool_attrs_.global_pooling) { - 
kernel_shape.assign(x_dims.begin() + 2, x_dims.end()); - pads.assign(kernel_shape.size(), 0); + if constexpr (Layout == LAYOUT_NCHW) { + kernel_shape.assign(x_dims.begin() + 2, x_dims.end()); + } else if constexpr (Layout == LAYOUT_NHWC) { + kernel_shape.assign(x_dims.begin() + 1, x_dims.end() - 1); + } + pads.assign(2 * kernel_shape.size(), 0); strides.assign(kernel_shape.size(), 1); } - auto out_channel = NHWC ? x_shape[3] : x_shape[1]; - auto y_dims = pool_attrs_.SetOutputSize(x_shape, out_channel, &pads, NHWC); + auto out_channel = (Layout == LAYOUT_NHWC) ? x_shape[x_dims.size() - 1] : x_shape[1]; + + auto y_dims = pool_attrs_.SetOutputSize(x_shape, out_channel, &pads, Layout == LAYOUT_NHWC); TensorShape y_shape(y_dims); Tensor* Y = context->Output(0, y_shape); // special case when there is a dim value of 0 in the shape. @@ -178,20 +185,22 @@ Status Pool::ComputeInternal(OpKernelContext* context) const TensorShapeVector x_dims_cudnn(x_dims.begin(), x_dims.end()); TensorShapeVector y_dims_cudnn(y_dims); if (kernel_shape.size() < 2) { - // cudnn only takes 4D or 5D input, so pad dimensions if needed - if (NHWC) { - x_dims_cudnn.insert(x_dims_cudnn.begin() + 1, 1); - y_dims_cudnn.insert(y_dims_cudnn.begin() + 1, 1); - kernel_shape.insert(kernel_shape.begin() + 1, 1); - strides.insert(strides.begin() + 1, 1); - } else { - x_dims_cudnn.push_back(1); - y_dims_cudnn.push_back(1); - kernel_shape.push_back(1); - strides.push_back(1); + // cuDNN only takes 4D or 5D input, so pad dimensions if needed + if constexpr (Layout == LAYOUT_NHWC) { + x_dims_cudnn.insert(x_dims_cudnn.end() - 1, 1); + y_dims_cudnn.insert(y_dims_cudnn.end() - 1, 1); + pads.insert(pads.begin() + pads.size() / 2, 0); + pads.insert(pads.end(), 0); + kernel_shape.insert(kernel_shape.end(), 1); + strides.insert(strides.end(), 1); + } else { // Layout == LAYOUT_NCHW + x_dims_cudnn.insert(x_dims_cudnn.end(), 1); + y_dims_cudnn.insert(y_dims_cudnn.end(), 1); + pads.insert(pads.begin() + pads.size() / 2, 0); + pads.insert(pads.end(), 0); + kernel_shape.insert(kernel_shape.end(), 1); + strides.insert(strides.end(), 1); } - pads.insert(pads.begin() + kernel_shape.size(), 0); - pads.insert(pads.end(), 0); } cudnnPoolingMode_t mode = CUDNN_POOLING_MAX; @@ -208,8 +217,8 @@ Status Pool::ComputeInternal(OpKernelContext* context) const const auto beta = Consts::Zero; CudnnTensor x_tensor; CudnnTensor y_tensor; - ORT_RETURN_IF_ERROR(x_tensor.Set(x_dims_cudnn, CudnnTensor::GetDataType(), NHWC)); - ORT_RETURN_IF_ERROR(y_tensor.Set(y_dims_cudnn, CudnnTensor::GetDataType(), NHWC)); + ORT_RETURN_IF_ERROR(x_tensor.Set(x_dims_cudnn, CudnnTensor::GetDataType(), Layout == LAYOUT_NHWC)); + ORT_RETURN_IF_ERROR(y_tensor.Set(y_dims_cudnn, CudnnTensor::GetDataType(), Layout == LAYOUT_NHWC)); const auto input_count = x_shape.Size(); const auto output_count = y_shape.Size(); @@ -225,8 +234,8 @@ Status Pool::ComputeInternal(OpKernelContext* context) const const auto beta = Consts::Zero; CudnnTensor x_tensor; CudnnTensor y_tensor; - ORT_RETURN_IF_ERROR(x_tensor.Set(x_dims_cudnn, CudnnTensor::GetDataType(), NHWC)); - ORT_RETURN_IF_ERROR(y_tensor.Set(y_dims_cudnn, CudnnTensor::GetDataType(), NHWC)); + ORT_RETURN_IF_ERROR(x_tensor.Set(x_dims_cudnn, CudnnTensor::GetDataType(), Layout == LAYOUT_NHWC)); + ORT_RETURN_IF_ERROR(y_tensor.Set(y_dims_cudnn, CudnnTensor::GetDataType(), Layout == LAYOUT_NHWC)); CUDNN_RETURN_IF_ERROR( PoolingForwardHelper(GetCudnnHandle(context), pooling_desc, &alpha, x_tensor, x_data, &beta, y_tensor, y_data)); @@ -235,8 +244,8 @@ Status 
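cuDNN pooling only accepts 4-D or 5-D tensors, so a 1-D pool gets a unit spatial dimension added; for channels-last data the unit dim has to go in front of the channel dim so the layout stays NHWC, and the pads vector grows by one begin/end pair either way. A small sketch of that shape fix-up; pad_pooling_shapes is illustrative, channels_last plays the role of the Layout template parameter, and the matching adjustment of the output dims is omitted.

#include <cstdint>
#include <vector>

// Sketch of padding a 1-D pooling problem up to the 4-D shape cuDNN expects,
// mirroring the dimension-insertion logic above.
void pad_pooling_shapes(std::vector<int64_t>& x_dims, std::vector<int64_t>& kernel_shape,
                        std::vector<int64_t>& strides, std::vector<int64_t>& pads,
                        bool channels_last) {
  if (kernel_shape.size() >= 2) return;  // already 2-D or 3-D pooling
  if (channels_last) {
    x_dims.insert(x_dims.end() - 1, 1);  // keep C as the last dimension
  } else {
    x_dims.push_back(1);
  }
  kernel_shape.push_back(1);
  strides.push_back(1);
  // pads holds all begins, then all ends: add a zero to each half.
  pads.insert(pads.begin() + static_cast<std::ptrdiff_t>(pads.size() / 2), 0);
  pads.push_back(0);
}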
Pool::ComputeInternal(OpKernelContext* context) const return Status::OK(); } -template -Status Pool, NHWC>::ComputeInternal(OpKernelContext* context) const { +template +Status Pool, Layout>::ComputeInternal(OpKernelContext* context) const { typedef typename ToCudaType::MappedType CudaT; const Tensor* X = context->Input(0); const TensorShape& x_shape = X->Shape(); @@ -251,12 +260,16 @@ Status Pool, NHWC>::ComputeInternal(OpKernelContext* context) cons auto strides = this->pool_attrs_.strides; if (this->pool_attrs_.global_pooling) { - kernel_shape.assign(x_dims.begin() + 2, x_dims.end()); - pads.assign(kernel_shape.size(), 0); + if constexpr (Layout == LAYOUT_NCHW) { + kernel_shape.assign(x_dims.begin() + 2, x_dims.end()); + } else if constexpr (Layout == LAYOUT_NHWC) { + kernel_shape.assign(x_dims.begin() + 1, x_dims.end() - 1); + } + pads.assign(2 * kernel_shape.size(), 0); // x{i}_begin + x{i}_end strides.assign(kernel_shape.size(), 1); } - auto out_channel = NHWC ? x_shape[3] : x_shape[1]; - auto y_dims = this->pool_attrs_.SetOutputSize(x_shape, out_channel, &pads, NHWC); + auto out_channel = Layout == LAYOUT_NHWC ? x_shape[x_shape.NumDimensions() - 1] : x_shape[1]; + auto y_dims = this->pool_attrs_.SetOutputSize(x_shape, out_channel, &pads, Layout == LAYOUT_NHWC); Tensor* Y = context->Output(0, TensorShape(y_dims)); // special case when there is a dim value of 0 in the shape. @@ -265,13 +278,22 @@ Status Pool, NHWC>::ComputeInternal(OpKernelContext* context) cons auto x_data = reinterpret_cast(X->Data()); auto y_data = reinterpret_cast(Y->MutableData()); - Tensor* I = context->Output(1, TensorShape(y_dims)); + // I is in NCHW format and the contained indices use NCHW math to compute the index + auto i_dims = y_dims; + if constexpr (Layout == LAYOUT_NHWC) { + // y_dims in NHWDC format, i_dims has to be in NCHWD format. + i_dims.insert(i_dims.begin() + 1, i_dims.back()); // N*C*HWDC + i_dims.pop_back(); // NCHW + } + + Tensor* I = context->Output(1, TensorShape(i_dims)); if (nullptr != I || !this->pool_attrs_.default_dilations) { auto i_data = nullptr == I ? 
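The MaxPool indices output keeps NCHW semantics even when the data tensor is NHWC, so before allocating it the output dims are reshuffled by moving the trailing channel dim to position 1. The reshuffle on its own; nhwc_dims_to_nchw is an illustrative name.

#include <cstdint>
#include <vector>

// Sketch: convert channels-last output dims (N, H, W, ..., C) to the
// channels-first dims (N, C, H, W, ...) used for the indices tensor.
std::vector<int64_t> nhwc_dims_to_nchw(std::vector<int64_t> dims) {
  if (dims.size() < 3) return dims;  // nothing to move
  const int64_t channels = dims.back();
  dims.pop_back();
  dims.insert(dims.begin() + 1, channels);
  return dims;
}

// e.g. {1, 8, 8, 16} -> {1, 16, 8, 8}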
nullptr : I->MutableData(); - MaxPoolWithIndex(this->Stream(context), x_shape, TensorShape(y_dims), kernel_shape, strides, pads, - this->pool_attrs_.dilations, this->pool_attrs_.storage_order, x_data, y_data, i_data); + MaxPoolWithIndex(this->Stream(context), x_shape, TensorShape(y_dims), kernel_shape, + strides, pads, this->pool_attrs_.dilations, + this->pool_attrs_.storage_order, x_data, y_data, i_data); } else { - ORT_RETURN_IF_ERROR((Pool, NHWC>::ComputeInternal(context))); + ORT_RETURN_IF_ERROR((Pool, Layout == LAYOUT_NHWC>::ComputeInternal(context))); } return Status::OK(); } diff --git a/onnxruntime/core/providers/cuda/nn/pool.h b/onnxruntime/core/providers/cuda/nn/pool.h index 8b5152a1565a9..97f7c8b8762d5 100644 --- a/onnxruntime/core/providers/cuda/nn/pool.h +++ b/onnxruntime/core/providers/cuda/nn/pool.h @@ -19,10 +19,10 @@ class Pool : public CudaKernel, public PoolBase { Status ComputeInternal(OpKernelContext* context) const override; }; -template -class Pool, NHWC> final : public Pool, NHWC> { +template +class Pool, Layout> final : public Pool, Layout> { public: - explicit Pool(const OpKernelInfo& info) : Pool, NHWC>(info) {} + explicit Pool(const OpKernelInfo& info) : Pool, Layout>(info) {} Status ComputeInternal(OpKernelContext* context) const override; }; diff --git a/onnxruntime/core/providers/js/operators/where.cc b/onnxruntime/core/providers/js/operators/where.cc index 2f8f5e275aa98..dcdf9bee2f783 100644 --- a/onnxruntime/core/providers/js/operators/where.cc +++ b/onnxruntime/core/providers/js/operators/where.cc @@ -6,18 +6,19 @@ namespace onnxruntime { namespace js { -#define REG_ELEMENTWISE_KERNEL(OP_TYPE, VERSION, KERNEL_CLASS) \ - ONNX_OPERATOR_KERNEL_EX( \ - OP_TYPE, \ - kOnnxDomain, \ - VERSION, \ - kJsExecutionProvider, \ - KernelDefBuilder() \ - .TypeConstraint("T", \ - {DataTypeImpl::GetTensorType(), \ - DataTypeImpl::GetTensorType(), \ - DataTypeImpl::GetTensorType(), \ - DataTypeImpl::GetTensorType()}), \ +#define REG_ELEMENTWISE_KERNEL(OP_TYPE, VERSION, KERNEL_CLASS) \ + ONNX_OPERATOR_KERNEL_EX( \ + OP_TYPE, \ + kOnnxDomain, \ + VERSION, \ + kJsExecutionProvider, \ + KernelDefBuilder() \ + .TypeConstraint("T", \ + {DataTypeImpl::GetTensorType(), \ + DataTypeImpl::GetTensorType(), \ + DataTypeImpl::GetTensorType(), \ + DataTypeImpl::GetTensorType(), \ + DataTypeImpl::GetTensorType()}), \ KERNEL_CLASS); #define REG_ELEMENTWISE_VERSIONED_KERNEL(OP_TYPE, VERSION_FROM, VERSION_TO, KERNEL_CLASS) \ @@ -29,6 +30,7 @@ namespace js { KernelDefBuilder() \ .TypeConstraint("T", \ {DataTypeImpl::GetTensorType(), \ + DataTypeImpl::GetTensorType(), \ DataTypeImpl::GetTensorType(), \ DataTypeImpl::GetTensorType(), \ DataTypeImpl::GetTensorType()}), \ diff --git a/onnxruntime/core/providers/openvino/backend_manager.cc b/onnxruntime/core/providers/openvino/backend_manager.cc index 330b464ffd1bb..3252603e33389 100644 --- a/onnxruntime/core/providers/openvino/backend_manager.cc +++ b/onnxruntime/core/providers/openvino/backend_manager.cc @@ -1,8 +1,9 @@ -// Copyright (C) 2019-2022 Intel Corporation +// Copyright (C) Intel Corporation // Licensed under the MIT License #include #include +#include #include "core/providers/shared_library/provider_api.h" #include "contexts.h" @@ -24,15 +25,6 @@ BackendManager::BackendManager(const GlobalContext& global_context, global_context_ = global_context; auto prec_str = GetGlobalContext().precision_str; - if (prec_str == "FP32") { - subgraph_context_.precision = "FP32"; - } else if (prec_str == "FP16") { - subgraph_context_.precision = "FP16"; 
- } else if (prec_str == "U8") { - subgraph_context_.precision = "U8"; - } else { - throw std::string("Invalid OpenVINO Precision type: " + prec_str); - } // Save the indexes of graph inputs among fused_node's inputDefs // (which also contains initializers). @@ -47,7 +39,7 @@ BackendManager::BackendManager(const GlobalContext& global_context, for (auto input : graph_inputs) { auto it = subgraph_context_.input_names.find(input->Name()); if (it == subgraph_context_.input_names.end()) { - throw std::string("Input not found in the input defs list"); + ORT_THROW("Input not found in the input defs list"); } int index = it->second; subgraph_context_.input_indexes.push_back(index); @@ -61,6 +53,7 @@ BackendManager::BackendManager(const GlobalContext& global_context, } subgraph_context_.subgraph_name = fused_node.Name(); model_proto_ = GetModelProtoFromFusedNode(fused_node, subgraph, logger); + std::string device_type = openvino_ep::BackendManager::GetGlobalContext().device_type; if (ModelHasSymbolicInputDims(subgraph)) { subgraph_context_.has_dynamic_input_shape = true; @@ -75,7 +68,7 @@ BackendManager::BackendManager(const GlobalContext& global_context, GetGlobalContext(), subgraph_context_); } catch (std::string const& msg) { - throw msg; + ORT_THROW(msg); } LOGS_DEFAULT(INFO) << "[OpenVINO-EP] " << "Backend created for graph " << subgraph_context_.subgraph_name; @@ -87,12 +80,29 @@ BackendManager::BackendManager(const GlobalContext& global_context, << subgraph_context_.subgraph_name; subgraph_context_.has_dynamic_input_shape = false; + + // OV NPU plugin is supported with fallback to OV CPU upon compilation failures. try { concrete_backend_ = BackendFactory::MakeBackend(*model_proto_, GetGlobalContext(), subgraph_context_); - } catch (std::string const& msg) { - throw msg; + } catch (const OnnxRuntimeException& ex) { + if (device_type.find("NPU") != std::string::npos) { + LOGS_DEFAULT(WARNING) << ex.what(); + LOGS_DEFAULT(WARNING) << "Model compilation failed at OV NPU." + << "Falling back to OV CPU for execution"; + GetGlobalContext().device_type = "CPU"; + GetGlobalContext().precision_str = "FP32"; + try { + concrete_backend_ = BackendFactory::MakeBackend(*model_proto_, + GetGlobalContext(), + subgraph_context_); + } catch (std::string const& msg) { + ORT_THROW(msg); + } + } else { + ORT_THROW(ex.what()); + } } } } @@ -254,8 +264,13 @@ void BackendManager::Compute(OrtKernelContext* context) { LOGS_DEFAULT(INFO) << "Start Compute"; } #endif + // OV NPU doesn't support dynamic-shaped model inference. + // If disable_dynamic_shapes is set to true, a dynamic model is executed + // by rewriting it to a static-shaped model at runtime based on the input shape. + // disable_dynamic_shapes is always set to true for the OV NPU plugin.
bool use_dynamic_backend = true; - if (!GetGlobalContext().disable_dynamic_shapes && subgraph_context_.has_dynamic_input_shape && + if (subgraph_context_.has_dynamic_input_shape && + !GetGlobalContext().disable_dynamic_shapes && (GetGlobalContext().device_type.find("CPU") != std::string::npos || GetGlobalContext().device_type.find("GPU") != std::string::npos)) { concrete_backend_->Infer(context); @@ -263,12 +278,11 @@ void BackendManager::Compute(OrtKernelContext* context) { } else if (use_dynamic_backend && subgraph_context_.has_dynamic_input_shape) { std::vector> tensor_shapes = GetInputTensorShapes(ctx); auto key = MakeMapKeyString(tensor_shapes, GetGlobalContext().device_type); - std::shared_ptr dynamic_backend; auto search = backend_map_.find(key); if (search == backend_map_.end()) { LOGS_DEFAULT(INFO) << "[OpenVINO-EP] " - << "Creating concrete backend for key: " << key; + << "Creating dynamic backend for key: " << key; LOGS_DEFAULT(INFO) << "[OpenVINO-EP] " << "Backend created for graph " << subgraph_context_.subgraph_name; auto modelproto_with_concrete_shapes = ReWriteInputShapeInfo(*model_proto_, tensor_shapes); @@ -276,8 +290,22 @@ void BackendManager::Compute(OrtKernelContext* context) { dynamic_backend = BackendFactory::MakeBackend(*modelproto_with_concrete_shapes, GetGlobalContext(), subgraph_context_); - } catch (std::string const& msg) { - throw msg; + } catch (const OnnxRuntimeException& ex) { + if (GetGlobalContext().device_type.find("NPU") != std::string::npos) { + LOGS_DEFAULT(WARNING) << ex.what(); + LOGS_DEFAULT(WARNING) << "Model compilation failed at OV NPU." + << "Falling back to OV CPU for execution"; + GetGlobalContext().device_type = "CPU"; + GetGlobalContext().precision_str = "FP32"; + key = MakeMapKeyString(tensor_shapes, GetGlobalContext().device_type); + try { + dynamic_backend = BackendFactory::MakeBackend(*modelproto_with_concrete_shapes, + GetGlobalContext(), + subgraph_context_); + } catch (std::string const& msg) { + ORT_THROW(msg); + } + } } backend_map_.insert({key, dynamic_backend}); } else { diff --git a/onnxruntime/core/providers/openvino/backend_manager.h b/onnxruntime/core/providers/openvino/backend_manager.h index 59bda7ca640ee..376ebea225a2b 100644 --- a/onnxruntime/core/providers/openvino/backend_manager.h +++ b/onnxruntime/core/providers/openvino/backend_manager.h @@ -1,4 +1,4 @@ -// Copyright (C) 2019-2022 Intel Corporation +// Copyright (C) Intel Corporation // Licensed under the MIT License #pragma once diff --git a/onnxruntime/core/providers/openvino/backend_utils.cc b/onnxruntime/core/providers/openvino/backend_utils.cc index 50c839017df2a..32b5ad7d5b66d 100644 --- a/onnxruntime/core/providers/openvino/backend_utils.cc +++ b/onnxruntime/core/providers/openvino/backend_utils.cc @@ -1,4 +1,4 @@ -// Copyright (C) 2019-2022 Intel Corporation +// Copyright (C) Intel Corporation // Licensed under the MIT License #include @@ -11,12 +11,7 @@ #include "core/providers/shared_library/provider_api.h" #include "backend_utils.h" -#if defined(OV_API_20) using Exception = ov::Exception; -#else -using Exception = InferenceEngine::details::InferenceEngineException; -using WaitMode = InferenceEngine::IInferRequest::WaitMode; -#endif namespace onnxruntime { namespace openvino_ep { @@ -47,7 +42,6 @@ struct static_cast_int64 { std::shared_ptr CreateOVModel(const ONNX_NAMESPACE::ModelProto& model_proto, const GlobalContext& global_context, - const SubGraphContext& subgraph_context, std::map>& const_outputs_map) { if (IsCILogEnabled()) { std::cout << 
"CreateNgraphFunc" << std::endl; @@ -55,28 +49,6 @@ CreateOVModel(const ONNX_NAMESPACE::ModelProto& model_proto, const GlobalContext const std::string model = model_proto.SerializeAsString(); try { auto cnn_network = global_context.ie_core.ReadModel(model, global_context.onnx_model_path_name); - if ((subgraph_context.precision == "FP16") && - (global_context.device_type.find("NPU") == std::string::npos)) { - // FP16 transformations - ov::pass::ConvertFP32ToFP16 pass_obj; - pass_obj.run_on_model(cnn_network); - cnn_network->validate_nodes_and_infer_types(); - - auto proc = ov::preprocess::PrePostProcessor(cnn_network); - for (size_t i = 0; i < cnn_network->inputs().size(); i++) { - if (cnn_network->inputs()[i].get_element_type() == ov::element::f16) { - proc.input(i).tensor().set_element_type(ov::element::f32); - proc.input(i).preprocess().convert_element_type(ov::element::f16); - } - } - - for (size_t i = 0; i < cnn_network->outputs().size(); i++) { - if (cnn_network->outputs()[i].get_element_type() == ov::element::f16) { - proc.output(i).postprocess().convert_element_type(ov::element::f32); - } - } - cnn_network = proc.build(); - } // Check for Constant Folding if (!global_context.is_wholly_supported_graph) { @@ -103,7 +75,7 @@ CreateOVModel(const ONNX_NAMESPACE::ModelProto& model_proto, const GlobalContext #endif return cnn_network; } catch (std::string const& msg) { - throw msg; + ORT_THROW(msg); } } @@ -127,7 +99,7 @@ GetOutputTensor(Ort::KernelContext& context, size_t batch_size, } auto it = output_names.find(output_name); if (it == output_names.end()) { - throw std::string(log_tag + "Output names mismatch between OpenVINO and ONNX"); + ORT_THROW(log_tag + "Output names mismatch between OpenVINO and ONNX"); } int index = it->second; return context.GetOutput(index, output_shape.get(), num_dims); @@ -145,7 +117,7 @@ GetOutputTensor(Ort::KernelContext& context, auto it = output_names.find(output_name); if (it == output_names.end()) { - throw std::string(log_tag + "Output names mismatch between OpenVINO and ONNX"); + ORT_THROW(log_tag + "Output names mismatch between OpenVINO and ONNX"); } int index = it->second; auto shape = node->get_shape(); @@ -204,7 +176,7 @@ void FillOutputsWithConstantData(std::shared_ptr node, Ort::UnownedVal break; } default: - throw std::string(log_tag + "Unsupported output data type"); + ORT_THROW(log_tag + "Unsupported output data type"); } } @@ -232,7 +204,7 @@ void FillInputBlob(OVTensorPtr inputBlob, size_t batch_slice_idx, auto tensor = context.GetInput(subgraph_context.input_names.at(input_name)); auto mem_info = tensor.GetTensorMemoryInfo(); if (mem_info.GetAllocatorName() == OpenVINO_GPU) { - throw std::string(log_tag + "IO Buffering is not enabled, Please enable Input on CPU"); + ORT_THROW(log_tag + "IO Buffering is not enabled, Please enable Input on CPU"); } // Copy input data into OpenVINO's input buffer const char* tensor_data = tensor.GetTensorData(); diff --git a/onnxruntime/core/providers/openvino/backend_utils.h b/onnxruntime/core/providers/openvino/backend_utils.h index 82b0351e87da5..93fa874774469 100644 --- a/onnxruntime/core/providers/openvino/backend_utils.h +++ b/onnxruntime/core/providers/openvino/backend_utils.h @@ -1,4 +1,4 @@ -// Copyright (C) 2019-2022 Intel Corporation +// Copyright (C) Intel Corporation // Licensed under the MIT License #pragma once @@ -65,7 +65,6 @@ void FillOutputBlob(OVTensorPtr outputBlob, Ort::UnownedValue& output_tensor, std::shared_ptr CreateOVModel(const ONNX_NAMESPACE::ModelProto& model_proto, const 
GlobalContext& global_context, - const SubGraphContext& subgraph_context, std::map>& const_outputs_map); void printPerformanceCounts(const std::vector& performanceMap, diff --git a/onnxruntime/core/providers/openvino/backends/backend_factory.cc b/onnxruntime/core/providers/openvino/backends/backend_factory.cc index c586dd8b38af9..a0f4ce8f843b0 100644 --- a/onnxruntime/core/providers/openvino/backends/backend_factory.cc +++ b/onnxruntime/core/providers/openvino/backends/backend_factory.cc @@ -1,4 +1,4 @@ -// Copyright (C) 2019-2022 Intel Corporation +// Copyright (C) Intel Corporation // Licensed under the MIT License #include @@ -24,11 +24,11 @@ BackendFactory::MakeBackend(const ONNX_NAMESPACE::ModelProto& model_proto, try { concrete_backend_ = std::make_shared(model_proto, global_context, subgraph_context); } catch (std::string const& msg) { - throw msg; + ORT_THROW(msg); } return concrete_backend_; } else { - throw std::string("[OpenVINO-EP] Backend factory error: Unknown backend type: " + type); + ORT_THROW("[OpenVINO-EP] Backend factory error: Unknown backend type: " + type); } } } // namespace openvino_ep diff --git a/onnxruntime/core/providers/openvino/backends/basic_backend.cc b/onnxruntime/core/providers/openvino/backends/basic_backend.cc index 0779940983aea..69d234a7c55ef 100644 --- a/onnxruntime/core/providers/openvino/backends/basic_backend.cc +++ b/onnxruntime/core/providers/openvino/backends/basic_backend.cc @@ -1,4 +1,4 @@ -// Copyright (C) 2019-2022 Intel Corporation +// Copyright (C) Intel Corporation // Licensed under the MIT License #include @@ -79,20 +79,20 @@ BasicBackend::BasicBackend(const ONNX_NAMESPACE::ModelProto& model_proto, subgraph_context_.subgraph_name); LOGS_DEFAULT(INFO) << log_tag << "Loaded model to the plugin"; } else { - ie_cnn_network_ = CreateOVModel(model_proto, global_context_, subgraph_context_, const_outputs_map_); + ie_cnn_network_ = CreateOVModel(model_proto, global_context_, const_outputs_map_); exe_network_ = global_context_.ie_core.LoadNetwork( ie_cnn_network_, hw_target, device_config, subgraph_context_.subgraph_name); LOGS_DEFAULT(INFO) << log_tag << "Loaded model to the plugin"; } #endif } else { - ie_cnn_network_ = CreateOVModel(model_proto, global_context_, subgraph_context_, const_outputs_map_); + ie_cnn_network_ = CreateOVModel(model_proto, global_context_, const_outputs_map_); exe_network_ = global_context_.ie_core.LoadNetwork( ie_cnn_network_, hw_target, device_config, subgraph_context_.subgraph_name); LOGS_DEFAULT(INFO) << log_tag << "Loaded model to the plugin"; } } catch (const char* msg) { - throw(msg); + ORT_THROW(msg); } inferRequestsQueue_ = std::unique_ptr(new InferRequestsQueue(exe_network_, 1)); @@ -125,21 +125,17 @@ void BasicBackend::PopulateConfigValue(ov::AnyMap& device_config) { if (global_context_.device_type.find("NPU") != std::string::npos) { std::pair device_property; device_property = std::make_pair("NPU_COMPILER_TYPE", "DRIVER"); + + const std::string env_npu_compiler_type = onnxruntime::GetEnvironmentVar("ORT_OPENVINO_NPU_COMPILER_TYPE"); + if (!env_npu_compiler_type.empty()) { + device_property = std::make_pair("NPU_COMPILER_TYPE", env_npu_compiler_type); + } device_config.emplace(ov::device::properties("NPU", device_property)); } } void BasicBackend::EnableCaching() { if (!global_context_.cache_dir.empty()) { - if (global_context_.is_wholly_supported_graph) { -#if defined(OPENVINO_2022_3) -#if defined(_WIN32) || defined(WIN32) || defined(__CYGWIN__) || defined(__MINGW32__) || defined(__BORLANDC__) - 
_putenv_s("OV_GPU_CACHE_MODEL", "1"); -#else - setenv("OV_GPU_CACHE_MODEL", "1", 1); -#endif -#endif - } LOGS_DEFAULT(INFO) << log_tag << "Enables Caching"; global_context_.ie_core.SetCache(global_context_.cache_dir); } @@ -162,7 +158,7 @@ void BasicBackend::EnableStreams() { (global_context_.device_type.find("HETERO") != std::string::npos) || (global_context_.device_type.find("AUTO") != std::string::npos)) { if (global_context_.num_streams != 1) { - throw(log_tag + "Cannot set NUM_STREAMS to " + std::to_string(global_context_.num_streams) + " for device " + global_context_.device_type); + ORT_THROW(log_tag + "Cannot set NUM_STREAMS to " + std::to_string(global_context_.num_streams) + " for device " + global_context_.device_type); } // Do nothing } else { @@ -198,9 +194,9 @@ void BasicBackend::StartAsyncInference(Ort::KernelContext& context, OVInferReque if (input_names.find(onnx_input_name) != input_names.end()) { input_name = onnx_input_name; } else { - throw(log_tag + - "Input names mismatch between OpenVINO and ONNX. " + onnx_input_name + - " doesn't exist in the list of OpenVINO input tensor names"); + ORT_THROW(log_tag + + "Input names mismatch between OpenVINO and ONNX. " + onnx_input_name + + " doesn't exist in the list of OpenVINO input tensor names"); } size_t batch_slice_idx = 0; if (subgraph_context_.has_dynamic_input_shape && @@ -232,14 +228,14 @@ void BasicBackend::StartAsyncInference(Ort::KernelContext& context, OVInferReque try { infer_request->SetTensor(input_name, tensor_ptr); } catch (const char* msg) { - throw(msg); + ORT_THROW(msg); } } else { OVTensorPtr graph_input_blob; try { graph_input_blob = infer_request->GetTensor(input_name); } catch (const char* msg) { - throw(msg); + ORT_THROW(msg); } FillInputBlob(graph_input_blob, batch_slice_idx, input_name, context, subgraph_context_); } @@ -248,7 +244,7 @@ void BasicBackend::StartAsyncInference(Ort::KernelContext& context, OVInferReque // Start Async inference infer_request->StartAsync(); } catch (const char* msg) { - throw(msg); + ORT_THROW(msg); } } @@ -274,10 +270,10 @@ void BasicBackend::StartRemoteAsyncInference(Ort::KernelContext& context, OVInfe if (input_names.find(onnx_input_name) != input_names.end()) { input_name = onnx_input_name; } else { - throw(log_tag + - "Input names mismatch between OpenVINO and ONNX. " + - onnx_input_name + - " doesn't exist in the list of OpenVINO input tensor names"); + ORT_THROW(log_tag + + "Input names mismatch between OpenVINO and ONNX. " + + onnx_input_name + + " doesn't exist in the list of OpenVINO input tensor names"); } input_idx++; // Kernel Context Input Buffer @@ -322,7 +318,7 @@ void BasicBackend::StartRemoteAsyncInference(Ort::KernelContext& context, OVInfe } } if (!output_name_found) { - throw std::string( + ORT_THROW( log_tag + "Output names mismatch between OpenVINO and ONNX. 
[ONNX Output: ] " + onnx_output_name + " doesn't exist in the list of OpenVINO output tensor names"); @@ -344,7 +340,7 @@ void BasicBackend::StartRemoteAsyncInference(Ort::KernelContext& context, OVInfe try { infer_request->SetTensor(output_name, tensor_ptr); } catch (const char* msg) { - throw(msg); + ORT_THROW(msg); } } } @@ -352,7 +348,7 @@ void BasicBackend::StartRemoteAsyncInference(Ort::KernelContext& context, OVInfe // Start Async inference infer_request->StartAsync(); } catch (const char* msg) { - throw(msg); + ORT_THROW(msg); } } #endif @@ -382,17 +378,18 @@ void BasicBackend::CompleteAsyncInference(Ort::KernelContext& context, OVInferRe } } if (!output_name_found) { - throw(log_tag + - "Output names mismatch between OpenVINO and ONNX. " - "[ONNX Output: ] " + - onnx_output_name + - " doesn't exist in the " - "list of OpenVINO output tensor names"); + ORT_THROW( + log_tag + + "Output names mismatch between OpenVINO and ONNX. " + "[ONNX Output: ] " + + onnx_output_name + + " doesn't exist in the " + "list of OpenVINO output tensor names"); } try { graph_output_blob = infer_request->GetTensor(output_name); } catch (const char* msg) { - throw(msg); + ORT_THROW(msg); } size_t batch_size = 1; auto output_tensor = @@ -413,14 +410,14 @@ void BasicBackend::CompleteAsyncInference(Ort::KernelContext& context, OVInferRe auto output_tensor = GetOutputTensor(context, out_name, subgraph_context_.output_names, node); auto mem_info = output_tensor.GetTensorMemoryInfo(); if (mem_info.GetAllocatorName() == OpenVINO_GPU) { - throw(log_tag + "IO Buffering is not supported for constant subgraphs"); + ORT_THROW(log_tag + "IO Buffering is not supported for constant subgraphs"); } else { FillOutputsWithConstantData(node, output_tensor); } } } } catch (const char* msg) { - throw(msg); + ORT_THROW(msg); } } @@ -440,7 +437,7 @@ void BasicBackend::Infer(OrtKernelContext* ctx) { auto output_tensor = GetOutputTensor(context, out_name, subgraph_context_.output_names, node); FillOutputsWithConstantData(node, output_tensor); } catch (std::string const& msg) { - throw msg; + ORT_THROW(msg); } } // Get Output tensors @@ -461,26 +458,26 @@ void BasicBackend::Infer(OrtKernelContext* ctx) { try { StartRemoteAsyncInference(context, infer_request); } catch (std::string const& msg) { - throw msg; + ORT_THROW(msg); } } else { try { StartAsyncInference(context, infer_request); } catch (std::string const& msg) { - throw msg; + ORT_THROW(msg); } } #else try { StartAsyncInference(context, infer_request); - } catch (std::string const& msg) { - throw msg; + } catch (const std::runtime_error& e) { + ORT_THROW(log_tag + " Exception at StartAsyncInference: " + e.what()); } #endif try { CompleteAsyncInference(context, infer_request); - } catch (std::string const& msg) { - throw msg; + } catch (const std::runtime_error& e) { + ORT_THROW(log_tag + " Exception at CompleteAsyncInference: " + e.what()); } // Get Output tensors diff --git a/onnxruntime/core/providers/openvino/backends/basic_backend.h b/onnxruntime/core/providers/openvino/backends/basic_backend.h index aa96dadbf0e2d..3502f660bbb20 100644 --- a/onnxruntime/core/providers/openvino/backends/basic_backend.h +++ b/onnxruntime/core/providers/openvino/backends/basic_backend.h @@ -1,4 +1,4 @@ -// Copyright (C) 2019-2022 Intel Corporation +// Copyright (C) Intel Corporation // Licensed under the MIT License #pragma once diff --git a/onnxruntime/core/providers/openvino/contexts.h b/onnxruntime/core/providers/openvino/contexts.h index 5f19c71683f24..8701d9f676ffd 100644 --- 
a/onnxruntime/core/providers/openvino/contexts.h +++ b/onnxruntime/core/providers/openvino/contexts.h @@ -1,4 +1,4 @@ -// Copyright (C) 2019-2022 Intel Corporation +// Copyright (C) Intel Corporation // Licensed under the MIT License #pragma once @@ -31,6 +31,7 @@ struct GlobalContext { int onnx_opset_version; void* context = 0; bool use_api_2; + std::vector OpenVINO_Version = {}; // Ov Major and OV minor version from OV headers }; // Holds context specific to subgraph. @@ -44,7 +45,6 @@ struct SubGraphContext { std::vector input_indexes; std::unordered_map input_names; std::unordered_map output_names; - std::string precision; }; } // namespace openvino_ep diff --git a/onnxruntime/core/providers/openvino/ibackend.h b/onnxruntime/core/providers/openvino/ibackend.h index 8aacce19c14d5..ece855c6167c6 100644 --- a/onnxruntime/core/providers/openvino/ibackend.h +++ b/onnxruntime/core/providers/openvino/ibackend.h @@ -1,4 +1,4 @@ -// Copyright (C) 2019-2022 Intel Corporation +// Copyright (C) Intel Corporation // Licensed under the MIT License #pragma once diff --git a/onnxruntime/core/providers/openvino/openvino_execution_provider.cc b/onnxruntime/core/providers/openvino/openvino_execution_provider.cc index e3948cc94b348..913440d2fb6ea 100644 --- a/onnxruntime/core/providers/openvino/openvino_execution_provider.cc +++ b/onnxruntime/core/providers/openvino/openvino_execution_provider.cc @@ -1,4 +1,4 @@ -// Copyright (C) 2019-2022 Intel Corporation +// Copyright (C) Intel Corporation // Licensed under the MIT License #include "core/providers/shared_library/provider_api.h" @@ -6,6 +6,7 @@ #include "contexts.h" #include "backend_manager.h" #include "ov_versions/capability.h" +#include "openvino/core/version.hpp" #define MEMCPY_S(dest, src, destsz, srcsz) memcpy(dest, src, std::min(destsz, srcsz)) @@ -25,6 +26,7 @@ OpenVINOExecutionProvider::OpenVINOExecutionProvider(const OpenVINOExecutionProv global_context_->enable_opencl_throttling = info.enable_opencl_throttling_; global_context_->disable_dynamic_shapes = info.disable_dynamic_shapes_; global_context_->num_of_threads = info.num_of_threads_; + global_context_->OpenVINO_Version = {OPENVINO_VERSION_MAJOR, OPENVINO_VERSION_MINOR}; // to check if target device is available // using ie_core capability GetAvailableDevices to fetch list of devices plugged in @@ -50,8 +52,7 @@ OpenVINOExecutionProvider::OpenVINOExecutionProvider(const OpenVINOExecutionProv device_found = true; break; } - if ((info.device_type_.find("NPU") != std::string::npos) && - (info.precision_ == "FP16" || info.precision_ == "U8")) { + if (info.device_type_.find("NPU") != std::string::npos) { device_found = true; break; } @@ -113,27 +114,10 @@ OpenVINOExecutionProvider::GetCapability(const GraphViewer& graph_viewer, global_context_->onnx_opset_version = graph_viewer.DomainToVersionMap().at(kOnnxDomain); -#if defined(OPENVINO_2023_0) openvino_ep::GetCapability obj(graph_viewer, global_context_->device_type, - global_context_->precision_str, "V_2023_0"); + global_context_->precision_str); result = obj.Execute(); -#elif defined(OPENVINO_2023_1) - openvino_ep::GetCapability obj(graph_viewer, - global_context_->device_type, - global_context_->precision_str, "V_2023_1"); - result = obj.Execute(); -#elif defined(OPENVINO_2023_2) - openvino_ep::GetCapability obj(graph_viewer, - global_context_->device_type, - global_context_->precision_str, "V_2023_2"); - result = obj.Execute(); -#elif defined(OPENVINO_2023_3) - openvino_ep::GetCapability obj(graph_viewer, - global_context_->device_type, - 
global_context_->precision_str, "V_2023_3"); - result = obj.Execute(); -#endif global_context_->is_wholly_supported_graph = obj.IsWhollySupportedGraph(); diff --git a/onnxruntime/core/providers/openvino/openvino_execution_provider.h b/onnxruntime/core/providers/openvino/openvino_execution_provider.h index b0c92828d8a38..b0dc881c36f33 100644 --- a/onnxruntime/core/providers/openvino/openvino_execution_provider.h +++ b/onnxruntime/core/providers/openvino/openvino_execution_provider.h @@ -1,4 +1,4 @@ -// Copyright (C) 2019-2022 Intel Corporation +// Copyright (C) Intel Corporation // Licensed under the MIT License #pragma once @@ -20,7 +20,7 @@ static void print_build_options() { << "you want to build" << std::endl; std::cout << "The different hardware devices that can be added with HETERO/MULTI/AUTO build " - << "are ['CPU','GPU']" + << "are ['CPU','GPU','NPU']" << std::endl; std::cout << "An example of how to specify the HETERO or MULTI or AUTO build type. " << "Ex: HETERO:GPU,CPU Ex: MULTI:GPU,CPU Ex: AUTO:GPU,CPU" @@ -48,7 +48,7 @@ static std::vector parseDevices(const std::string& device_string) { print_build_options(); ORT_THROW("Invalid device string: " + device_string); } - std::vector dev_options = {"CPU", "GPU"}; + std::vector dev_options = {"CPU", "GPU", "NPU"}; for (std::string dev : devices) { if (!std::count(dev_options.begin(), dev_options.end(), dev)) { print_build_options(); @@ -98,12 +98,9 @@ struct OpenVINOExecutionProviderInfo { #elif defined OPENVINO_CONFIG_GPU_FP16 device_type_ = "GPU"; precision_ = "FP16"; -#elif defined OPENVINO_CONFIG_NPU_FP16 +#elif defined OPENVINO_CONFIG_NPU device_type_ = "NPU"; - precision_ = "FP16"; -#elif defined OPENVINO_CONFIG_NPU_U8 - device_type_ = "NPU"; - precision_ = "U8"; + precision_ = ""; #elif defined OPENVINO_CONFIG_HETERO || defined OPENVINO_CONFIG_MULTI || defined OPENVINO_CONFIG_AUTO #ifdef DEVICE_NAME #define DEVICE DEVICE_NAME @@ -142,12 +139,9 @@ struct OpenVINOExecutionProviderInfo { } else if (dev_type == "GPU.1_FP16") { device_type_ = "GPU.1"; precision_ = "FP16"; - } else if (dev_type == "NPU_FP16") { - device_type_ = "NPU"; - precision_ = "FP16"; - } else if (dev_type == "NPU_U8") { + } else if (dev_type == "NPU") { device_type_ = "NPU"; - precision_ = "U8"; + precision_ = ""; } else if (dev_type.find("HETERO") == 0 || dev_type.find("MULTI") == 0) { std::vector devices = parseDevices(dev_type); precision_ = "FP16"; diff --git a/onnxruntime/core/providers/openvino/openvino_provider_factory.cc b/onnxruntime/core/providers/openvino/openvino_provider_factory.cc index 068456777bece..17511c54aab86 100644 --- a/onnxruntime/core/providers/openvino/openvino_provider_factory.cc +++ b/onnxruntime/core/providers/openvino/openvino_provider_factory.cc @@ -1,4 +1,4 @@ -// Copyright (C) 2019-2022 Intel Corporation +// Copyright (C) Intel Corporation // Licensed under the MIT License #include "core/providers/shared_library/provider_api.h" @@ -78,7 +78,6 @@ struct OpenVINO_Provider : Provider { // with this value at runtime. bool enable_opencl_throttling = false; // [enable_opencl_throttling]: Enables OpenCL queue throttling for GPU // device (Reduces CPU Utilization when using GPU) - bool disable_dynamic_shapes = false; // [disable_dynamic_shapes]: Execute model with default static shape for optimal performance. 
void* context = nullptr; if (provider_options_map.find("device_type") != provider_options_map.end()) { @@ -86,7 +85,7 @@ struct OpenVINO_Provider : Provider { std::set ov_supported_device_types = {"CPU_FP32", "CPU_FP16", "GPU_FP32", "GPU.0_FP32", "GPU.1_FP32", "GPU_FP16", - "GPU.0_FP16", "GPU.1_FP16"}; + "GPU.0_FP16", "GPU.1_FP16", "NPU"}; if (!((ov_supported_device_types.find(device_type) != ov_supported_device_types.end()) || (device_type.find("HETERO:") == 0) || (device_type.find("MULTI:") == 0) || @@ -94,7 +93,7 @@ struct OpenVINO_Provider : Provider { ORT_THROW( "[ERROR] [OpenVINO] You have selcted wrong configuration value for the key 'device_type'. " "Select from 'CPU_FP32', 'CPU_FP16', 'GPU_FP32', 'GPU.0_FP32', 'GPU.1_FP32', 'GPU_FP16', " - "'GPU.0_FP16', 'GPU.1_FP16' or from" + "'GPU.0_FP16', 'GPU.1_FP16', 'NPU' or from" " HETERO/MULTI/AUTO options available. \n"); } } @@ -147,12 +146,24 @@ struct OpenVINO_Provider : Provider { bool_flag = ""; } + // [disable_dynamic_shapes]: Rewrite dynamic shaped models to static shape at runtime and execute. + // Always true for NPU plugin. + bool disable_dynamic_shapes = false; + if (device_type.find("NPU") != std::string::npos) { + disable_dynamic_shapes = true; + } if (provider_options_map.find("disable_dynamic_shapes") != provider_options_map.end()) { bool_flag = provider_options_map.at("disable_dynamic_shapes"); if (bool_flag == "true" || bool_flag == "True") disable_dynamic_shapes = true; - else if (bool_flag == "false" || bool_flag == "False") - disable_dynamic_shapes = false; + else if (bool_flag == "false" || bool_flag == "False") { + if (device_type.find("NPU") != std::string::npos) { + disable_dynamic_shapes = true; + LOGS_DEFAULT(INFO) << "[OpenVINO-EP] The value for the key 'disable_dynamic_shapes' will be set to TRUE for NPU backend.\n "; + } else { + disable_dynamic_shapes = false; + } + } } return std::make_shared(const_cast(device_type.c_str()), enable_npu_fast_compile, diff --git a/onnxruntime/core/providers/openvino/ov_interface.cc b/onnxruntime/core/providers/openvino/ov_interface.cc index ea481791111fc..d7c6654c90f81 100644 --- a/onnxruntime/core/providers/openvino/ov_interface.cc +++ b/onnxruntime/core/providers/openvino/ov_interface.cc @@ -1,4 +1,4 @@ -// Copyright (C) 2019-2022 Intel Corporation +// Copyright (C) Intel Corporation // Licensed under the MIT License #include "ov_interface.h" @@ -8,12 +8,7 @@ #include "core/providers/shared_library/provider_api.h" #include "backend_utils.h" -#if defined(OV_API_20) using Exception = ov::Exception; -#else -using Exception = InferenceEngine::details::InferenceEngineException; -using WaitMode = InferenceEngine::IInferRequest::WaitMode; -#endif namespace onnxruntime { namespace openvino_ep { @@ -36,9 +31,9 @@ std::shared_ptr OVCore::ReadModel(const std::string& model, const std } return FE->convert(inputModel); } catch (const Exception& e) { - throw std::string(log_tag + "[OpenVINO-EP] Exception while Reading network: " + std::string(e.what())); + ORT_THROW(log_tag + "[OpenVINO-EP] Exception while Reading network: " + std::string(e.what())); } catch (...) 
{ - throw std::string(log_tag + "[OpenVINO-EP] Unknown exception while Reading network"); + ORT_THROW(log_tag + "[OpenVINO-EP] Unknown exception while Reading network"); } } @@ -81,9 +76,9 @@ OVExeNetwork OVCore::LoadNetwork(std::shared_ptr& ie_cnn_network, OVExeNetwork exe(obj); return exe; } catch (const Exception& e) { - throw std::string(log_tag + " Exception while Loading Network for graph: " + name + e.what()); + ORT_THROW(log_tag + " Exception while Loading Network for graph: " + name + e.what()); } catch (...) { - throw std::string(log_tag + " Exception while Loading Network for graph " + name); + ORT_THROW(log_tag + " Exception while Loading Network for graph " + name); } } @@ -113,9 +108,9 @@ OVExeNetwork OVCore::LoadNetwork(std::shared_ptr& model, OVRemoteCont auto obj = oe.compile_model(model, *context); return OVExeNetwork(obj); } catch (const Exception& e) { - throw std::string(log_tag + " Exception while Loading Network for graph: " + name + e.what()); + ORT_THROW(log_tag + " Exception while Loading Network for graph: " + name + e.what()); } catch (...) { - throw std::string(log_tag + " Exception while Loading Network for graph " + name); + ORT_THROW(log_tag + " Exception while Loading Network for graph " + name); } } #endif @@ -135,9 +130,9 @@ OVInferRequest OVExeNetwork::CreateInferRequest() { OVInferRequest inf_obj(infReq); return inf_obj; } catch (const Exception& e) { - throw std::string(log_tag + "Exception while creating InferRequest object: " + e.what()); + ORT_THROW(log_tag + "Exception while creating InferRequest object: " + e.what()); } catch (...) { - throw std::string(log_tag + "Exception while creating InferRequest object."); + ORT_THROW(log_tag + "Exception while creating InferRequest object."); } } @@ -147,9 +142,9 @@ OVTensorPtr OVInferRequest::GetTensor(const std::string& input_name) { OVTensorPtr blob = std::make_shared(tobj); return blob; } catch (const Exception& e) { - throw std::string(log_tag + " Cannot access IE Blob for input: " + input_name + e.what()); + ORT_THROW(log_tag + " Cannot access IE Blob for input: " + input_name + e.what()); } catch (...) { - throw std::string(log_tag + " Cannot access IE Blob for input: " + input_name); + ORT_THROW(log_tag + " Cannot access IE Blob for input: " + input_name); } } @@ -157,9 +152,9 @@ void OVInferRequest::SetTensor(const std::string& name, OVTensorPtr& blob) { try { ovInfReq.set_tensor(name, *(blob.get())); } catch (const Exception& e) { - throw std::string(log_tag + " Cannot set Remote Blob for output: " + name + e.what()); + ORT_THROW(log_tag + " Cannot set Remote Blob for output: " + name + e.what()); } catch (...) { - throw std::string(log_tag + " Cannot set Remote Blob for output: " + name); + ORT_THROW(log_tag + " Cannot set Remote Blob for output: " + name); } } @@ -167,9 +162,9 @@ void OVInferRequest::StartAsync() { try { ovInfReq.start_async(); } catch (const Exception& e) { - throw std::string(log_tag + " Couldn't start Inference: " + e.what()); + ORT_THROW(log_tag + " Couldn't start Inference: " + e.what()); } catch (...) { - throw std::string(log_tag + " In Error Couldn't start Inference"); + ORT_THROW(log_tag + " In Error Couldn't start Inference"); } } @@ -177,9 +172,9 @@ void OVInferRequest::Infer() { try { ovInfReq.infer(); } catch (const Exception& e) { - throw std::string(log_tag + " Couldn't start Inference: " + e.what()); + ORT_THROW(log_tag + " Couldn't start Inference: " + e.what()); } catch (...) 
{ - throw std::string(log_tag + " In Error Couldn't start Inference"); + ORT_THROW(log_tag + " In Error Couldn't start Inference"); } } @@ -187,9 +182,9 @@ void OVInferRequest::WaitRequest() { try { ovInfReq.wait(); } catch (const Exception& e) { - throw std::string(log_tag + " Wait Model Failed: " + e.what()); + ORT_THROW(log_tag + " Wait Model Failed: " + e.what()); } catch (...) { - throw std::string(log_tag + " Wait Mode Failed"); + ORT_THROW(log_tag + " Wait Mode Failed"); } } diff --git a/onnxruntime/core/providers/openvino/ov_interface.h b/onnxruntime/core/providers/openvino/ov_interface.h index cf4d867d4df55..2a13fafb99fd3 100644 --- a/onnxruntime/core/providers/openvino/ov_interface.h +++ b/onnxruntime/core/providers/openvino/ov_interface.h @@ -1,4 +1,4 @@ -// Copyright (C) 2019-2022 Intel Corporation +// Copyright (C) Intel Corporation // Licensed under the MIT License #pragma once @@ -6,14 +6,11 @@ #include #include -#define OV_API_20 #include "openvino/openvino.hpp" #include "openvino/pass/convert_fp32_to_fp16.hpp" #include "openvino/frontend/manager.hpp" #ifdef IO_BUFFER_ENABLED -#include -#include #include #endif diff --git a/onnxruntime/core/providers/openvino/ov_versions/capability.cc b/onnxruntime/core/providers/openvino/ov_versions/capability.cc index 11c8a1629b073..3970bf6ff68a7 100644 --- a/onnxruntime/core/providers/openvino/ov_versions/capability.cc +++ b/onnxruntime/core/providers/openvino/ov_versions/capability.cc @@ -1,4 +1,4 @@ -// Copyright (C) 2019-2022 Intel Corporation +// Copyright (C) 2019- Intel Corporation // Licensed under the MIT License #include "core/providers/shared_library/provider_api.h" @@ -6,6 +6,7 @@ #include "../backend_manager.h" #include "capability.h" #include "utils.h" +#include "openvino/core/version.hpp" #if defined(_MSC_VER) #pragma warning(disable : 4244 4245 5208) @@ -25,20 +26,22 @@ namespace openvino_ep { // Constructor GetCapability::GetCapability(const GraphViewer& graph_viewer_param, const std::string device_type_param, - const std::string device_precision, - const std::string version_param) + const std::string device_precision) : graph_viewer_(graph_viewer_param), device_type_(device_type_param), device_precision_(device_precision) { - if (version_param == "V_2023_0") { - data_ops_ = new DataOps(graph_viewer_, V_2023_0, device_type_, device_precision_); - } else if (version_param == "V_2023_1") { - data_ops_ = new DataOps(graph_viewer_, V_2023_1, device_type_, device_precision_); - } else if (version_param == "V_2023_2") { - data_ops_ = new DataOps(graph_viewer_, V_2023_2, device_type_, device_precision_); - } else if (version_param == "V_2023_3") { - data_ops_ = new DataOps(graph_viewer_, V_2023_3, device_type_, device_precision_); - } else { - data_ops_ = new DataOps(graph_viewer_, V_2023_3, device_type_, device_precision_); + if (device_type_.find("NPU") != std::string::npos) { + device_type_ = "CPU_FP32"; } +#if OPENVINO_VERSION_MAJOR == 2023 && OPENVINO_VERSION_MINOR == 1 + data_ops_ = new DataOps(graph_viewer_, V_2023_1, device_type_, device_precision_); +#elif OPENVINO_VERSION_MAJOR == 2023 && OPENVINO_VERSION_MINOR == 2 + data_ops_ = new DataOps(graph_viewer_, V_2023_2, device_type_, device_precision_); +#elif OPENVINO_VERSION_MAJOR == 2023 && OPENVINO_VERSION_MINOR == 3 + data_ops_ = new DataOps(graph_viewer_, V_2023_3, device_type_, device_precision_); +#elif OPENVINO_VERSION_MAJOR == 2024 && OPENVINO_VERSION_MINOR == 0 + data_ops_ = new DataOps(graph_viewer_, V_2024_0, device_type_, device_precision_); +#else + 
data_ops_ = new DataOps(graph_viewer_, V_2024_0, device_type_, device_precision_); +#endif } std::vector> GetCapability::Execute() { diff --git a/onnxruntime/core/providers/openvino/ov_versions/capability.h b/onnxruntime/core/providers/openvino/ov_versions/capability.h index 2040634cc45d9..d9fe5a95ef833 100644 --- a/onnxruntime/core/providers/openvino/ov_versions/capability.h +++ b/onnxruntime/core/providers/openvino/ov_versions/capability.h @@ -1,4 +1,4 @@ -// Copyright (C) 2019-2022 Intel Corporation +// Copyright (C) Intel Corporation // Licensed under the MIT License #pragma once @@ -21,8 +21,7 @@ class GetCapability { public: GetCapability(const GraphViewer& graph_viewer_param, const std::string device_type_param, - const std::string precision, - const std::string version_param); + const std::string precision); virtual std::vector> Execute(); bool IsWhollySupportedGraph() { return is_wholly_supported_graph_; diff --git a/onnxruntime/core/providers/openvino/ov_versions/data_ops.cc b/onnxruntime/core/providers/openvino/ov_versions/data_ops.cc index e829bf377b195..c7c3e93595719 100644 --- a/onnxruntime/core/providers/openvino/ov_versions/data_ops.cc +++ b/onnxruntime/core/providers/openvino/ov_versions/data_ops.cc @@ -1,4 +1,4 @@ -// Copyright (C) 2019-2022 Intel Corporation +// Copyright (C) Intel Corporation // Licensed under the MIT License #include @@ -14,6 +14,7 @@ #include "data_ops.h" #include "capability.h" #include "utils.h" +#include "../ov_interface.h" #if defined(_MSC_VER) #pragma warning(disable : 4244 4245 5208) @@ -36,6 +37,7 @@ namespace openvino_ep { std::set ops_supported_only_in_model = { "Add", "Cast", + "Celu", "Concat", "ConstantOfShape", "DequantizeLinear", @@ -46,6 +48,7 @@ std::set ops_supported_only_in_model = { "EyeLike", "GatherElements", "GatherND", + "GridSample", "Identity", "LayerNormalization", "Loop", @@ -72,293 +75,171 @@ std::set ops_supported_only_in_model = { std::set ops_supported_as_function = { "LessOrEqual", "GreaterOrEqual", - "LayerNormalization"}; + "LayerNormalization", + "Celu"}; std::vector supported_op_mode = { {"Abs", V_2020_4, {"CPU", "GPU"}}, - {"Abs", V_2023_0, {"NPU"}}, {"Acos", V_2020_4, {"CPU"}}, {"Acos", V_2022_1, {"GPU"}}, - {"Acos", V_2023_1, {"NPU"}}, {"Acosh", V_2020_4, {"CPU"}}, {"Acosh", V_2022_1, {"GPU"}}, - {"Acosh", V_2023_1, {"NPU"}}, {"Add", V_2020_4, {"CPU", "GPU"}}, - {"Add", V_2023_0, {"NPU"}}, {"And", V_2020_4, {"CPU", "GPU"}}, - {"And", V_2023_1, {"NPU"}}, {"ArgMax", V_2020_4, {"CPU"}}, {"ArgMax", V_2021_1, {"GPU"}}, {"ArgMin", V_2020_4, {"CPU"}}, {"ArgMin", V_2022_1, {"GPU"}}, {"Asin", V_2020_4, {"CPU", "GPU"}}, - {"Asin", V_2023_1, {"NPU"}}, {"Asinh", V_2020_4, {"CPU", "GPU"}}, - {"Asinh", V_2023_1, {"NPU"}}, {"Atan", V_2020_4, {"CPU", "GPU"}}, - {"Atan", V_2023_1, {"NPU"}}, {"Atanh", V_2020_4, {"CPU"}}, {"Atanh", V_2022_1, {"GPU"}}, - {"Atanh", V_2023_1, {"NPU"}}, {"AveragePool", V_2020_4, {"CPU", "GPU"}}, - {"AveragePool", V_2023_0, {"NPU"}}, {"BatchNormalization", V_2020_4, {"CPU", "GPU"}}, - {"BatchNormalization", V_2023_0, {"NPU"}}, {"BitShift", V_2022_1, {"CPU"}}, - {"BitShift", V_2023_1, {"NPU"}}, {"Cast", V_2020_4, {"CPU", "GPU"}}, - {"Cast", V_2023_0, {"NPU"}}, - {"CastLike", V_2023_1, {"CPU", "GPU", "NPU"}}, + {"CastLike", V_2023_1, {"CPU", "GPU"}}, {"Ceil", V_2020_4, {"GPU"}}, {"Ceil", V_2021_4, {"CPU"}}, - {"Ceil", V_2023_1, {"NPU"}}, {"Celu", V_2022_1, {"CPU", "GPU"}}, {"Clip", V_2020_4, {"CPU", "GPU"}}, - {"Clip", V_2023_0, {"NPU"}}, {"Compress", V_2023_1, {"CPU", "GPU"}}, {"Concat", V_2020_4, 
{"CPU", "GPU"}}, - {"Concat", V_2023_0, {"NPU"}}, {"Constant", V_2020_4, {"CPU", "GPU"}}, - {"Constant", V_2023_0, {"NPU"}}, {"ConstantOfShape", V_2020_4, {"CPU", "GPU"}}, - {"ConstantOfShape", V_2023_0, {"NPU"}}, // Gets mapped to broadcast op in the plugin. {"Conv", V_2020_4, {"CPU", "GPU"}}, - {"Conv", V_2023_0, {"NPU"}}, {"ConvInteger", V_2022_1, {"CPU", "GPU"}}, - {"ConvInteger", V_2023_1, {"NPU"}}, {"ConvTranspose", V_2020_4, {"CPU", "GPU"}}, - {"ConvTranspose", V_2023_1, {"NPU"}}, {"Cos", V_2020_4, {"CPU"}}, {"Cos", V_2022_1, {"GPU"}}, - {"Cos", V_2023_0, {"NPU"}}, {"Cosh", V_2020_4, {"CPU"}}, {"Cosh", V_2022_1, {"GPU"}}, - {"Cosh", V_2023_1, {"NPU"}}, {"CumSum", V_2022_1, {"CPU", "GPU"}}, - {"CumSum", V_2023_0, {"NPU"}}, {"DepthToSpace", V_2020_4, {"CPU", "GPU"}}, - {"DepthToSpace", V_2023_0, {"NPU"}}, {"DequantizeLinear", V_2021_4, {"CPU", "GPU"}}, - {"DequantizeLinear", V_2023_0, {"NPU"}}, {"Div", V_2020_4, {"CPU", "GPU"}}, - {"Div", V_2023_0, {"NPU"}}, {"Dropout", V_2020_4, {"CPU", "GPU"}}, - {"Dropout", V_2023_0, {"NPU"}}, {"Elu", V_2020_4, {"CPU", "GPU"}}, - {"Elu", V_2023_0, {"NPU"}}, {"Einsum", V_2023_1, {"CPU", "GPU"}}, {"Equal", V_2020_4, {"CPU", "GPU"}}, - {"Equal", V_2023_0, {"NPU"}}, // Added for whisper decoder model. {"Erf", V_2020_4, {"CPU", "GPU"}}, - {"Erf", V_2023_0, {"NPU"}}, {"Exp", V_2020_4, {"CPU", "GPU"}}, - {"Exp", V_2023_0, {"NPU"}}, {"Expand", V_2022_1, {"CPU", "GPU"}}, - {"Expand", V_2023_0, {"NPU"}}, // Gets mapped to broadcast op and multiply op in the plugin. {"EyeLike", V_2022_1, {"CPU"}}, - {"EyeLike", V_2023_0, {"NPU"}}, // NoOP {"Flatten", V_2020_4, {"CPU", "GPU"}}, - {"Flatten", V_2023_0, {"NPU"}}, {"Floor", V_2020_4, {"CPU", "GPU"}}, - {"Floor", V_2023_1, {"NPU"}}, {"Gather", V_2020_4, {"CPU", "GPU"}}, - {"Gather", V_2023_0, {"NPU"}}, {"GatherElements", V_2022_2, {"CPU", "GPU"}}, - {"GatherElements", V_2023_1, {"NPU"}}, {"GatherND", V_2021_4, {"CPU", "GPU"}}, - {"GatherND", V_2023_1, {"NPU"}}, + {"Gelu", V_2023_1, {"CPU", "GPU"}}, {"Gemm", V_2020_4, {"CPU", "GPU"}}, - {"Gemm", V_2023_0, {"NPU"}}, {"GlobalAveragePool", V_2020_4, {"CPU", "GPU"}}, - {"GlobalAveragePool", V_2023_0, {"NPU"}}, {"GlobalLpPool", V_2020_4, {"CPU", "GPU"}}, - {"GlobalLpPool", V_2023_1, {"NPU"}}, {"GlobalMaxPool", V_2022_1, {"CPU", "GPU"}}, - {"GlobalMaxPool", V_2023_1, {"NPU"}}, {"Greater", V_2020_4, {"CPU", "GPU"}}, - {"Greater", V_2023_0, {"NPU"}}, {"GreaterOrEqual", V_2022_1, {"CPU", "GPU"}}, - {"GreaterOrEqual", V_2023_0, {"NPU"}}, {"GridSample", V_2022_3, {"CPU"}}, {"GridSample", V_2023_0, {"GPU"}}, - {"GridSample", V_2023_1, {"NPU"}}, - {"HardMax", V_2023_1, {"CPU", "GPU", "NPU"}}, + {"HardMax", V_2023_1, {"CPU", "GPU"}}, {"Identity", V_2020_4, {"CPU", "GPU"}}, - {"Identity", V_2023_0, {"NPU"}}, // NoOP {"If", V_2022_3, {"CPU", "GPU"}}, - {"If", V_2023_1, {"NPU"}}, {"ImageScaler", V_2022_1, {"CPU", "GPU"}}, - {"ImageScaler", V_2023_0, {"NPU"}}, {"InstanceNormalization", V_2020_4, {"CPU", "GPU"}}, - {"InstanceNormalization", V_2023_0, {"NPU"}}, {"HardSigmoid", V_2020_4, {"CPU", "GPU"}}, - {"HardSigmoid", V_2023_1, {"NPU"}}, {"HardMax", V_2022_1, {"CPU", "GPU"}}, + {"LayerNormalization", V_2023_0, {"CPU", "GPU"}}, {"LeakyRelu", V_2020_4, {"CPU", "GPU"}}, - {"LeakyRelu", V_2023_0, {"NPU"}}, {"Less", V_2020_4, {"CPU", "GPU"}}, - {"Less", V_2023_0, {"NPU"}}, // Added for whisper decoder model. 
{"LessOrEqual", V_2022_1, {"CPU", "GPU"}}, - {"LessOrEqual", V_2023_0, {"NPU"}}, {"Log", V_2020_4, {"CPU", "GPU"}}, - {"Log", V_2023_0, {"NPU"}}, {"LogSoftMax", V_2022_1, {"CPU", "GPU"}}, {"Loop", V_2021_4, {"CPU", "GPU"}}, - {"LpNormalization", V_2023_1, {"CPU", "GPU", "NPU"}}, - {"LpPool", V_2023_1, {"CPU", "GPU", "NPU"}}, + {"LpNormalization", V_2023_1, {"CPU", "GPU"}}, {"LRN", V_2020_4, {"CPU", "GPU"}}, - {"LRN", V_2023_0, {"NPU"}}, {"LSTM", V_2020_4, {"CPU", "GPU"}}, - {"LSTM", V_2023_1, {"NPU"}}, {"MatMul", V_2020_4, {"CPU", "GPU"}}, - {"MatMul", V_2023_0, {"NPU"}}, {"MatMulInteger", V_2022_1, {"CPU"}}, - {"MatMulInteger", V_2023_1, {"NPU"}}, {"Max", V_2020_4, {"CPU", "GPU"}}, - {"Max", V_2023_0, {"NPU"}}, {"MaxPool", V_2020_4, {"CPU", "GPU"}}, - {"MaxPool", V_2023_0, {"NPU"}}, {"Mean", V_2020_4, {"CPU", "GPU"}}, - {"Mean", V_2023_0, {"NPU"}}, {"MeanVarianceNormalization", V_2022_1, {"CPU", "GPU"}}, - {"MeanVarianceNormalization", V_2023_1, {"NPU"}}, {"Min", V_2020_4, {"CPU", "GPU"}}, - {"Min", V_2023_0, {"NPU"}}, {"Mod", V_2022_1, {"CPU", "GPU"}}, {"Mul", V_2020_4, {"CPU", "GPU"}}, - {"Mul", V_2023_0, {"NPU"}}, {"Neg", V_2020_4, {"CPU", "GPU"}}, - {"Neg", V_2023_0, {"NPU"}}, {"NonMaxSuppression", V_2021_1, {"CPU", "GPU"}}, - {"NonMaxSuppression", V_2023_1, {"NPU"}}, {"NonZero", V_2021_1, {"CPU"}}, {"NonZero", V_2023_0, {"GPU"}}, {"Not", V_2021_1, {"CPU", "GPU"}}, {"Not", V_2020_4, {"CPU", "GPU"}}, - {"Not", V_2023_1, {"NPU"}}, {"OneHot", V_2020_4, {"CPU", "GPU"}}, - {"OneHot", V_2023_1, {"NPU"}}, {"Or", V_2022_1, {"CPU", "GPU"}}, - {"Or", V_2023_1, {"NPU"}}, {"Pad", V_2020_4, {"CPU", "GPU"}}, - {"Pad", V_2023_0, {"NPU"}}, {"Pow", V_2020_4, {"CPU", "GPU"}}, - {"Pow", V_2023_0, {"NPU"}}, {"PRelu", V_2020_4, {"CPU", "GPU"}}, - {"PRelu", V_2023_0, {"NPU"}}, {"QLinearMatMul", V_2022_3, {"CPU"}}, - // {"QLinearMatMul", V_2023_1, {"NPU"}}, {"QuantizeLinear", V_2021_4, {"CPU", "GPU"}}, - {"QuantizeLinear", V_2023_0, {"NPU"}}, {"RNN", V_2023_1, {"CPU", "GPU"}}, {"RandomNormalLike", V_2023_0, {"CPU", "GPU"}}, {"RandomNormalLike", V_2023_0, {"CPU", "GPU"}}, - {"RandomNormalLike", V_2023_1, {"NPU"}}, {"RandomNormal", V_2023_0, {"CPU", "GPU"}}, - {"RandomNormal", V_2023_1, {"NPU"}}, {"Range", V_2022_1, {"CPU", "GPU"}}, - {"Range", V_2023_0, {"NPU"}}, {"Reciprocal", V_2020_4, {"CPU", "GPU"}}, - {"Reciprocal", V_2023_0, {"NPU"}}, {"ReduceL1", V_2022_1, {"CPU", "GPU"}}, - {"ReduceL1", V_2023_1, {"NPU"}}, {"ReduceL2", V_2022_1, {"CPU", "GPU"}}, - {"ReduceL2", V_2023_1, {"NPU"}}, {"ReduceLogSum", V_2020_4, {"CPU"}}, {"ReduceLogSum", V_2022_1, {"CPU", "GPU"}}, - {"ReduceLogSum", V_2023_1, {"NPU"}}, {"ReduceLogSumExp", V_2022_1, {"CPU", "GPU"}}, - {"ReduceLogSumExp", V_2023_1, {"NPU"}}, {"ReduceMax", V_2020_4, {"CPU", "GPU"}}, - {"ReduceMax", V_2023_1, {"NPU"}}, {"ReduceMean", V_2020_4, {"CPU", "GPU"}}, - {"ReduceMean", V_2023_0, {"NPU"}}, {"ReduceMin", V_2020_4, {"CPU", "GPU"}}, - {"ReduceMin", V_2023_1, {"NPU"}}, {"ReduceProd", V_2020_4, {"CPU"}}, {"ReduceProd", V_2022_1, {"GPU"}}, - {"ReduceProd", V_2023_1, {"NPU"}}, {"ReduceSum", V_2020_4, {"CPU", "GPU"}}, - // {"ReduceSum", V_2023_1, {"NPU"}}, {"ReduceSumSquare", V_2020_4, {"CPU"}}, {"ReduceSumSquare", V_2022_1, {"CPU", "GPU"}}, - {"ReduceSumSquare", V_2023_1, {"NPU"}}, {"Relu", V_2020_4, {"CPU", "GPU"}}, - {"Relu", V_2023_0, {"NPU"}}, {"Resize", V_2020_4, {"CPU"}}, {"Resize", V_2022_1, {"GPU"}}, - {"Resize", V_2023_1, {"NPU"}}, {"Reshape", V_2020_4, {"CPU", "GPU"}}, - {"Reshape", V_2023_0, {"NPU"}}, {"ReverseSequence", V_2022_1, {"CPU", "GPU"}}, 
{"RoiAlign", V_2021_1, {"CPU", "GPU"}}, - {"RoiAlign", V_2023_1, {"NPU"}}, {"Round", V_2021_4, {"CPU", "GPU"}}, - {"Round", V_2023_1, {"NPU"}}, {"Scatter", V_2022_1, {"CPU", "GPU"}}, - {"Scatter", V_2023_1, {"NPU"}}, {"ScatterElements", V_2022_1, {"CPU", "GPU"}}, - {"ScatterElements", V_2023_1, {"NPU"}}, {"ScatterND", V_2022_1, {"CPU", "GPU"}}, - {"ScatterND", V_2023_1, {"NPU"}}, {"Selu", V_2020_4, {"CPU", "GPU"}}, - {"Selu", V_2023_1, {"NPU"}}, {"Shape", V_2020_4, {"CPU", "GPU"}}, - {"Shape", V_2023_0, {"NPU"}}, {"Shrink", V_2022_1, {"CPU", "GPU"}}, - {"Shrink", V_2023_0, {"NPU"}}, {"Sigmoid", V_2020_4, {"CPU", "GPU"}}, - {"Sigmoid", V_2023_0, {"NPU"}}, {"Sign", V_2020_4, {"CPU"}}, {"Sign", V_2022_1, {"GPU"}}, - {"Sign", V_2023_0, {"NPU"}}, {"Sin", V_2022_1, {"CPU", "GPU"}}, - {"Sin", V_2023_0, {"NPU"}}, {"Sinh", V_2020_4, {"CPU"}}, - {"Sinh", V_2023_1, {"NPU"}}, {"Size", V_2022_1, {"CPU", "GPU"}}, - {"Size", V_2023_1, {"NPU"}}, {"Slice", V_2020_4, {"CPU", "GPU"}}, - {"Slice", V_2023_0, {"NPU"}}, {"Softmax", V_2020_4, {"CPU", "GPU"}}, - {"Softmax", V_2023_0, {"NPU"}}, {"Softplus", V_2022_1, {"CPU", "GPU"}}, - {"Softplus", V_2023_0, {"NPU"}}, {"Softsign", V_2022_1, {"CPU", "GPU"}}, {"SpaceToDepth", V_2020_4, {"CPU", "GPU"}}, - {"SpaceToDepth", V_2023_0, {"NPU"}}, {"Split", V_2020_4, {"CPU", "GPU"}}, - {"Split", V_2023_0, {"NPU"}}, {"Sqrt", V_2020_4, {"CPU", "GPU"}}, - {"Sqrt", V_2023_0, {"NPU"}}, {"Squeeze", V_2020_4, {"CPU", "GPU"}}, - {"Squeeze", V_2023_0, {"NPU"}}, {"Softsign", V_2020_4, {"CPU"}}, {"Sub", V_2020_4, {"CPU", "GPU"}}, - {"Sub", V_2023_0, {"NPU"}}, {"Sum", V_2020_4, {"CPU", "GPU"}}, - {"Sum", V_2023_0, {"NPU"}}, {"Tan", V_2020_4, {"CPU", "GPU"}}, - {"Tan", V_2023_1, {"NPU"}}, {"Tanh", V_2020_4, {"CPU", "GPU"}}, - {"Tanh", V_2023_0, {"NPU"}}, {"ThresholdedRelu", V_2022_1, {"CPU", "GPU"}}, - {"ThresholdedRelu", V_2023_0, {"NPU"}}, {"Tile", V_2021_3, {"CPU", "GPU"}}, - {"Tile", V_2023_0, {"NPU"}}, {"Transpose", V_2020_4, {"CPU", "GPU"}}, - {"Transpose", V_2023_0, {"NPU"}}, {"Trilu", V_2023_0, {"CPU", "GPU"}}, - {"Trilu", V_2023_1, {"NPU"}}, {"TopK", V_2020_4, {"CPU", "GPU"}}, - {"TopK", V_2023_0, {"NPU"}}, {"Upsample", V_2020_4, {"CPU", "GPU"}}, {"Unsqueeze", V_2020_4, {"CPU", "GPU"}}, - {"Unsqueeze", V_2023_0, {"NPU"}}, {"Where", V_2022_1, {"CPU", "GPU"}}, - {"Where", V_2023_0, {"NPU"}}, // Added for whisper decoder model. 
{"Xor", V_2022_1, {"CPU", "GPU"}}, - {"Xor", V_2023_1, {"NPU"}}, }; void DataOps::populate_types_supported() { @@ -370,6 +251,8 @@ void DataOps::populate_types_supported() { std::make_pair(V_2020_4, ONNX_NAMESPACE::TensorProto_DataType::TensorProto_DataType_INT32)); supported_types_initializer_.insert( std::make_pair(V_2020_4, ONNX_NAMESPACE::TensorProto_DataType::TensorProto_DataType_INT64)); + supported_types_initializer_.insert( + std::make_pair(V_2020_4, ONNX_NAMESPACE::TensorProto_DataType::TensorProto_DataType_UINT16)); supported_types_initializer_.insert( std::make_pair(V_2021_1, ONNX_NAMESPACE::TensorProto_DataType::TensorProto_DataType_FLOAT16)); supported_types_initializer_.insert( @@ -387,6 +270,8 @@ void DataOps::populate_types_supported() { std::make_pair(V_2020_4, ONNX_NAMESPACE::TensorProto_DataType::TensorProto_DataType_INT8)); supported_types_npu_.insert( std::make_pair(V_2020_4, ONNX_NAMESPACE::TensorProto_DataType::TensorProto_DataType_INT16)); + supported_types_npu_.insert( + std::make_pair(V_2020_4, ONNX_NAMESPACE::TensorProto_DataType::TensorProto_DataType_UINT16)); supported_types_npu_.insert( std::make_pair(V_2020_4, ONNX_NAMESPACE::TensorProto_DataType::TensorProto_DataType_INT32)); supported_types_npu_.insert( @@ -402,6 +287,8 @@ void DataOps::populate_types_supported() { std::make_pair(V_2020_4, ONNX_NAMESPACE::TensorProto_DataType::TensorProto_DataType_INT32)); supported_types_cpu_.insert( std::make_pair(V_2020_4, ONNX_NAMESPACE::TensorProto_DataType::TensorProto_DataType_INT16)); + supported_types_cpu_.insert( + std::make_pair(V_2020_4, ONNX_NAMESPACE::TensorProto_DataType::TensorProto_DataType_UINT16)); supported_types_cpu_.insert( std::make_pair(V_2020_4, ONNX_NAMESPACE::TensorProto_DataType::TensorProto_DataType_INT8)); supported_types_cpu_.insert( @@ -437,13 +324,12 @@ void DataOps::populate_op_mode_supported() { no_dimension_supported_.push_back({"DequantizeLinear", V_2021_4, {"All"}}); no_dimension_supported_.push_back({"Equal", V_2022_1, {"CPU"}}); no_dimension_supported_.push_back({"Equal", V_2023_0, {"GPU"}}); + no_dimension_supported_.push_back({"Expand", V_2023_3, {"CPU"}}); no_dimension_supported_.push_back({"Floor", V_2020_4, {"All"}}); no_dimension_supported_.push_back({"Gather", V_2020_4, {"All"}}); - no_dimension_supported_.push_back({"Greater", V_2023_0, {"NPU"}}); no_dimension_supported_.push_back({"Identity", V_2023_0, {"All"}}); no_dimension_supported_.push_back({"Less", V_2022_1, {"CPU"}}); no_dimension_supported_.push_back({"Loop", V_2021_4, {"All"}}); - no_dimension_supported_.push_back({"Max", V_2023_0, {"NPU"}}); no_dimension_supported_.push_back({"Min", V_2020_4, {"All"}}); no_dimension_supported_.push_back({"Mul", V_2020_4, {"All"}}); no_dimension_supported_.push_back({"Neg", V_2023_0, {"CPU", "GPU"}}); @@ -476,9 +362,8 @@ void DataOps::populate_op_mode_supported() { { UnsupportedOpMode obj = {{V_2022_1, V_2022_2, V_2022_3}, [this](const Node* node, const InitializedTensorSet&) { - // Abs is not supproted with INT8 or INT32 as input data type on GPU and NPU - if ((device_id_.find("GPU") != std::string::npos) || - (device_id_.find("NPU") != std::string::npos)) { + // Abs is not supproted with INT8 or INT32 as input data type on GPU + if ((device_id_.find("GPU") != std::string::npos)) { for (size_t i = 0; i < node->InputDefs().size(); i++) { if (node->InputDefs()[i]->TypeAsProto()->tensor_type().elem_type() == ONNX_NAMESPACE::TensorProto_DataType::TensorProto_DataType_INT8 || @@ -706,7 +591,7 @@ void 
DataOps::populate_op_mode_supported() { op_list_.insert({"PRelu", obj}); } { - UnsupportedOpMode obj = {{V_2023_0, V_2023_1, V_2023_2, V_2023_3}, + UnsupportedOpMode obj = {{V_2023_1, V_2023_2, V_2023_3, V_2024_0}, [this](const Node* node, const InitializedTensorSet&) { const auto& input_arg = node->InputDefs()[1]; auto shape = input_arg->Shape(); @@ -821,7 +706,7 @@ void DataOps::populate_op_mode_supported() { op_list_.insert({"Squeeze", obj}); } { - UnsupportedOpMode obj = {{V_2023_0, V_2023_1, V_2023_2, V_2023_3}, + UnsupportedOpMode obj = {{V_2023_1, V_2023_2, V_2023_3, V_2024_0}, [this](const Node* node, const InitializedTensorSet&) { // If the operator is unsqueeze // If axes is an input, then we cannot produce a static graph. @@ -836,7 +721,7 @@ void DataOps::populate_op_mode_supported() { op_list_.insert({"Unsqueeze", obj}); } { - UnsupportedOpMode obj = {{V_2023_0, V_2023_1, V_2023_2, V_2023_3}, + UnsupportedOpMode obj = {{V_2023_1, V_2023_2, V_2023_3, V_2024_0}, [this](const Node* node, const InitializedTensorSet&) { // check for attributes auto& upsample_attr = node->GetAttributes(); @@ -961,7 +846,7 @@ bool DataOps::type_is_supported(const NodeArg* node_arg, bool is_initializer) { } else { auto dtype = type_proto->tensor_type().elem_type(); - if (device_id_.find("NPU") != std::string::npos || device_id_.find("HETERO") != std::string::npos || + if (device_id_.find("HETERO") != std::string::npos || device_id_.find("MULTI") != std::string::npos || device_id_.find("AUTO") != std::string::npos) { for (auto const& var : supported_types_npu_) { if ((var.first <= version_id_) && @@ -1063,8 +948,7 @@ bool DataOps::dimension_unsupported(const Node* node) { return true; } -bool DataOps::node_is_supported(const std::map>& op_map, - const NodeIndex node_idx) { +bool DataOps::node_is_supported(const NodeIndex node_idx) { const auto& node = graph_viewer_.GetNode(node_idx); const auto& optype = node->OpType(); @@ -1174,37 +1058,14 @@ bool DataOps::node_is_supported(const std::mapOpType()); - if (opset == op_map.end()) { -#ifndef NDEBUG - if (openvino_ep::backend_utils::IsDebugEnabled()) { - std::cout << "Failed in Unsupported onnx model domain" << std::endl; - } -#endif - return false; - } - if (opset->second.find(optype) == opset->second.end() && op_fun == ops_supported_as_function.end()) { -#ifndef NDEBUG - if (openvino_ep::backend_utils::IsDebugEnabled()) { - std::cout << "The operator is not available in OpenVINO ngraph operators list" - << "nor the operator is a special ONNX function" - << std::endl; - } -#endif - return false; - } return true; } std::vector DataOps::GetUnsupportedNodeIndices(std::unordered_set& ng_required_initializers) { - const auto ng_supported_ops = GetNgSupportedOps(GetOnnxOpSet(graph_viewer_)); - std::vector unsupported_nodes_idx; for (const auto& node_idx : graph_viewer_.GetNodesInTopologicalOrder()) { - if (node_is_supported(ng_supported_ops, node_idx)) { + if (node_is_supported(node_idx)) { // Collect inputs that are initializers graph_viewer_.GetNode(node_idx)->ForEachDef([&ng_required_initializers, this](const NodeArg& node_arg, bool is_input) { diff --git a/onnxruntime/core/providers/openvino/ov_versions/data_ops.h b/onnxruntime/core/providers/openvino/ov_versions/data_ops.h index 87688601ad692..0990904908111 100644 --- a/onnxruntime/core/providers/openvino/ov_versions/data_ops.h +++ b/onnxruntime/core/providers/openvino/ov_versions/data_ops.h @@ -1,4 +1,4 @@ -// Copyright (C) 2019-2022 Intel Corporation +// Copyright (C) Intel Corporation // Licensed under 
the MIT License #pragma once @@ -26,7 +26,8 @@ enum versionNum { V_2023_0, V_2023_1, V_2023_2, - V_2023_3 + V_2023_3, + V_2024_0 }; using VersionNum = enum versionNum; @@ -67,9 +68,7 @@ class DataOps { bool dimension_unsupported(const Node* node); bool unsupported_op_mode(const Node* node); bool type_is_supported(const NodeArg* node_arg, bool is_initializer); - bool node_is_supported(const std::map>& op_map, - const NodeIndex node_idx); + bool node_is_supported(const NodeIndex node_idx); public: DataOps(const GraphViewer& graph_viewer_param, VersionNum ver, const std::string dev_id, const std::string device_precision) diff --git a/onnxruntime/core/providers/openvino/ov_versions/utils.cc b/onnxruntime/core/providers/openvino/ov_versions/utils.cc index ee0bfddb7dc83..c5ed29df487b4 100644 --- a/onnxruntime/core/providers/openvino/ov_versions/utils.cc +++ b/onnxruntime/core/providers/openvino/ov_versions/utils.cc @@ -1,4 +1,4 @@ -// Copyright (C) 2019-2022 Intel Corporation +// Copyright (C) Intel Corporation // Licensed under the MIT License #include "core/providers/shared_library/provider_api.h" @@ -11,14 +11,6 @@ #pragma GCC diagnostic ignored "-Wunused-parameter" #endif -#include "openvino/core/deprecated.hpp" -#define IN_OV_COMPONENT -#define NGRAPH_LEGACY_HEADER_INCLUDED -#include - -#undef NGRAPH_LEGACY_HEADER_INCLUDED -#undef IN_OV_COMPONENT - #if defined(_MSC_VER) #pragma warning(default : 4244 4245) #elif __GNUC__ @@ -95,20 +87,6 @@ int GetOnnxOpSet(const GraphViewer& graph_viewer) { return dm_to_ver.at(kOnnxDomain); } -std::map> GetNgSupportedOps(const int onnx_opset) { - std::map> ng_supported_ops; - OPENVINO_SUPPRESS_DEPRECATED_START - ng_supported_ops.emplace(kOnnxDomain, ngraph::onnx_import::get_supported_operators(onnx_opset, kOnnxDomain)); - - const std::set ng_disabled_ops = {"LSTM"}; // Place-holder for ops not supported. - - for (const auto& disabled_op : ng_disabled_ops) { - ng_supported_ops.at(kOnnxDomain).erase(disabled_op); - } - OPENVINO_SUPPRESS_DEPRECATED_END - return ng_supported_ops; -} - /** * Returns a vector clusters(or node_idx). For each unsupported node, the graph is split into 3 parts. * supported_cluster + (UNsupported_node + rest_of_the_graph). This functions returns vector of all supported_clusters by nGraph diff --git a/onnxruntime/core/providers/openvino/ov_versions/utils.h b/onnxruntime/core/providers/openvino/ov_versions/utils.h index b3edeef88dfec..34aa762ba9b67 100644 --- a/onnxruntime/core/providers/openvino/ov_versions/utils.h +++ b/onnxruntime/core/providers/openvino/ov_versions/utils.h @@ -1,4 +1,4 @@ -// Copyright (C) 2019-2022 Intel Corporation +// Copyright (C) Intel Corporation // Licensed under the MIT License #pragma once diff --git a/onnxruntime/core/providers/rocm/nn/pool.cc b/onnxruntime/core/providers/rocm/nn/pool.cc index 045c8b55c0b0d..3a82ab598004b 100644 --- a/onnxruntime/core/providers/rocm/nn/pool.cc +++ b/onnxruntime/core/providers/rocm/nn/pool.cc @@ -257,7 +257,7 @@ Status Pool>::ComputeInternal(OpKernelContext* context) const { Tensor* I = context->Output(1, TensorShape(y_dims)); if (nullptr != I || !this->pool_attrs_.default_dilations) { auto i_data = nullptr == I ? 
nullptr : I->MutableData(); - MaxPoolWithIndex( + MaxPoolWithIndex( this->Stream(context), x_shape, TensorShape(y_dims), diff --git a/onnxruntime/core/providers/webnn/builders/impl/gemm_op_builder.cc b/onnxruntime/core/providers/webnn/builders/impl/gemm_op_builder.cc index 455e0e5f16a42..ed320132169e9 100644 --- a/onnxruntime/core/providers/webnn/builders/impl/gemm_op_builder.cc +++ b/onnxruntime/core/providers/webnn/builders/impl/gemm_op_builder.cc @@ -42,6 +42,26 @@ Status GemmOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, const N if (!GetShape(*input_defs[a_idx], a_shape, logger)) { return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "Can not get shape of A."); } + std::vector b_shape; + if (!GetShape(*input_defs[b_idx], b_shape, logger)) { + return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "Can not get shape of B."); + } + // If the first argument is 1-D, it is promoted to a matrix by prepending a 1 to its dimensions. + bool extended_a_shape = false; + if (a_shape.size() == 1) { + extended_a_shape = true; + a_shape.insert(a_shape.begin(), 1); + a = model_builder.GetBuilder().call("reshape", a, + emscripten::val::array(GetVecUint32FromVecInt64(a_shape))); + } + // If the second argument is 1-D, it is promoted to a matrix by appending a 1 to its dimensions. + bool extended_b_shape = false; + if (b_shape.size() == 1) { + extended_b_shape = true; + b_shape.push_back(1); + b = model_builder.GetBuilder().call("reshape", b, + emscripten::val::array(GetVecUint32FromVecInt64(b_shape))); + } // The inputs of MatMul must be at least 3D for WebNN CPU backend. Use GEMM for 2D case. // TODO: Remove this workaround when it is fixed in Chromium. if (model_builder.GetWebnnDeviceType() == WebnnDeviceType::CPU && a_shape.size() == 2) { @@ -49,6 +69,27 @@ Status GemmOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, const N } else { output = model_builder.GetBuilder().call("matmul", a, b); } + // If the inputs are both 1D, reduce the output to a scalar. + if (extended_a_shape && extended_b_shape) { + output = model_builder.GetBuilder().call("reshape", output, emscripten::val::array()); + } + // After matrix multiplication the prepended 1 is removed. + else if (extended_a_shape) { + std::vector new_shape; + for (size_t i = 0; i < b_shape.size() - 2; i++) { + new_shape.push_back(narrow(b_shape[i])); + } + new_shape.push_back(narrow(b_shape.back())); + output = model_builder.GetBuilder().call("reshape", output, emscripten::val::array(new_shape)); + } + // After matrix multiplication the appended 1 is removed. + else if (extended_b_shape) { + std::vector new_shape; + for (size_t i = 0; i < a_shape.size() - 1; i++) { + new_shape.push_back(narrow(a_shape[i])); + } + output = model_builder.GetBuilder().call("reshape", output, emscripten::val::array(new_shape)); + } } else if (op_type == "MatMulInteger") { emscripten::val a_zero_point = emscripten::val::null(); emscripten::val b_zero_point = emscripten::val::null(); @@ -152,10 +193,10 @@ bool GemmOpBuilder::IsOpSupportedImpl(const InitializedTensorSet& initializers, } if (op_type == "MatMul") { - if (a_shape.size() < 2 || b_shape.size() < 2) { - LOGS(logger, VERBOSE) << "Inputs of MatMul must be at least 2D"; - return false; - } + // If the first argument is 1-D, it is promoted to a matrix by prepending a 1 to its dimensions. + // If the second argument is 1-D, it is promoted to a matrix by appending a 1 to its dimensions. 
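(Editor's aside, not part of the patch: the 1-D promotion rules the comments above describe follow NumPy/ONNX `MatMul` semantics. The sketch below, which assumes only NumPy and uses names of my own choosing, shows the same shape bookkeeping the builder performs — prepend a 1 to a 1-D A, append a 1 to a 1-D B, multiply, then drop the inserted dimensions.)

```python
import numpy as np

def matmul_with_1d_promotion(a: np.ndarray, b: np.ndarray) -> np.ndarray:
    """Illustrative sketch of the shape handling described above (not ORT/WebNN code)."""
    extended_a = a.ndim == 1
    extended_b = b.ndim == 1
    if extended_a:
        a = a.reshape(1, *a.shape)   # prepend 1: (K,) -> (1, K)
    if extended_b:
        b = b.reshape(*b.shape, 1)   # append 1:  (K,) -> (K, 1)
    out = a @ b
    if extended_a and extended_b:
        out = out.reshape(())        # both inputs 1-D -> scalar output
    elif extended_a:
        out = out.reshape(*b.shape[:-2], b.shape[-1])  # drop the prepended 1
    elif extended_b:
        out = out.reshape(*a.shape[:-1])               # drop the appended 1
    return out

assert matmul_with_1d_promotion(np.ones(3), np.ones(3)).shape == ()
assert matmul_with_1d_promotion(np.ones(3), np.ones((2, 3, 4))).shape == (2, 4)
assert matmul_with_1d_promotion(np.ones((2, 3)), np.ones(3)).shape == (2,)
```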
+ if (a_shape.size() == 1) a_shape.insert(a_shape.begin(), 1); + if (b_shape.size() == 1) b_shape.push_back(1); // WebNN CPU backend has two more constraints. // https://source.chromium.org/chromium/chromium/src/+/main:third_party/blink/renderer/modules/ml/webnn/ml_graph_xnnpack.cc;l=1177 diff --git a/onnxruntime/core/session/custom_ops.cc b/onnxruntime/core/session/custom_ops.cc index 6e9d68d259a5d..513aafcdadb7d 100644 --- a/onnxruntime/core/session/custom_ops.cc +++ b/onnxruntime/core/session/custom_ops.cc @@ -1066,59 +1066,120 @@ Status IsCompatible(const ONNX_NAMESPACE::OpSchema& schema, const OrtCustomOp* o return Status::OK(); } -void InferOutputTypes(const InlinedVector& kernel_defs, - ONNX_NAMESPACE::InferenceContext& infer_ctx) { - for (const auto& kernel_def : kernel_defs) { +// This function attempts to do its best for older custom ops (most of them) who do not have +// they own type and shape inference function. However, it falls short in some cases, and we leave +// those for the user to handle in their own inference function. +static void InferOutputTypes(const ONNX_NAMESPACE::OpSchema& schema, gsl::span kernel_defs, + ONNX_NAMESPACE::InferenceContext& infer_ctx) { + const auto& inputs = schema.inputs(); + const auto node_input_num = infer_ctx.getNumInputs(); + + const KernelDef* def_selected = nullptr; + bool is_variadic_input = false; + bool is_homogeneous_input = false; + int32_t output_propagate{0}; + + for (size_t kernel_index = 0; + kernel_index < kernel_defs.size() && def_selected == nullptr; + ++kernel_index) { + const auto* kernel_def = kernel_defs[kernel_index]; const auto& type_constraints = kernel_def->TypeConstraints(); - auto num_inputs = infer_ctx.getNumInputs(); - bool matched = true; - ONNXTensorElementDataType undef = ONNXTensorElementDataType::ONNX_TENSOR_ELEMENT_DATA_TYPE_UNDEFINED; - // first, make sure there is a constraint for every input - for (size_t i = 0; i < num_inputs && matched; ++i) { - auto input_name = "Input" + std::to_string(i); - auto input_type = infer_ctx.getInputType(i); - if (input_type) { - auto elem_type = static_cast(input_type->tensor_type().elem_type()); - auto tc_iter = type_constraints.find(input_name); - if (tc_iter != type_constraints.end()) { - if (tc_iter->second.size() > 1) { - undef = elem_type; - } else if (tc_iter->second.size() != 1 || - tc_iter->second[0] != DataTypeImpl::TensorTypeFromONNXEnum(elem_type)) { - matched = false; + def_selected = kernel_def; + + for (size_t i = 0; i < node_input_num; ++i) { + const auto input_type = infer_ctx.getInputType(i); + + // Guard against variadic parameter index + const size_t schema_input_index = (i < inputs.size()) ? i : inputs.size() - 1; + const auto& param = inputs[schema_input_index]; + const auto& input_name = param.GetName(); + if (input_type == nullptr) { + if (param.GetOption() == ONNX_NAMESPACE::OpSchema::FormalParameterOption::Optional) + continue; + + ORT_THROW("[CustomOP type inferencing error]: kernel Input: ", input_name, + " is absent, but not optional. 
Op : ", schema.Name()); + } + + is_variadic_input = (param.GetOption() == ONNX_NAMESPACE::OpSchema::FormalParameterOption::Variadic); + is_homogeneous_input = param.GetIsHomogeneous(); + + if (!is_variadic_input || is_homogeneous_input) { + auto hit = type_constraints.find(input_name); + if (hit != type_constraints.end()) { + const auto& types = hit->second; + // For custom ops kernel constraints are never empty + assert(!types.empty()); + if (!std::any_of(types.cbegin(), types.cend(), + [input_type](const DataTypeImpl* type) { + return type->IsCompatible(*input_type); + })) { + def_selected = nullptr; + output_propagate = 0; + break; + } + + // If we have multiple types possible from the constraints, + // record the last type and use it to guess the output type if + // output may have different types. Works well for symmetric single input/outputs + // otherwise give up and let the user supply their own function + if (types.size() > 1) { + output_propagate = input_type->tensor_type().elem_type(); } } else { - matched = false; + ORT_THROW("[CustomOP type inferencing error]: no type constraint found for input: ", + input_name, " Op: ", schema.Name()); } - } else { - matched = false; - } - } // for - // next, ensure that there is a constraint for every output - auto num_outputs = infer_ctx.getNumOutputs(); - for (size_t i = 0; i < num_outputs && matched; i++) { - auto output_name = "Output" + std::to_string(i); - auto tc_iter = type_constraints.find(output_name); - if (tc_iter == type_constraints.end() || tc_iter->second.size() < 1) { - matched = false; } } - if (matched) { - for (size_t i = 0; i < num_outputs; i++) { - auto output_name = "Output" + std::to_string(i); - auto output_type = infer_ctx.getOutputType(i); - auto tc_iter = type_constraints.find(output_name); - if (tc_iter->second.size() > 1) { - output_type->mutable_tensor_type()->set_elem_type(undef); - } else { - output_type->mutable_tensor_type()->set_elem_type( - tc_iter->second[0]->GetTypeProto()->tensor_type().elem_type()); - } - } + } + + if (def_selected == nullptr) { + ORT_THROW("[CustomOP type inferencing error]: no kernel def matches node inputs for Op: ", schema.Name()); + } + + const auto& outputs = schema.outputs(); + const auto node_output_num = infer_ctx.getNumOutputs(); + const auto& selected_type_constraints = def_selected->TypeConstraints(); + + for (size_t i = 0; i < node_output_num; ++i) { + auto output_type = infer_ctx.getOutputType(i); + // Account for variadic outputs + const size_t schema_output_index = (i < outputs.size()) ? 
i : outputs.size() - 1; + const auto& param = outputs[schema_output_index]; + const auto& output_name = param.GetName(); + + const bool is_variadic_output = (param.GetOption() == ONNX_NAMESPACE::OpSchema::FormalParameterOption::Variadic); + const bool is_homogeneous = param.GetIsHomogeneous(); + + // We give up on variadic non-homogeneous outputs + // Let the user handle it in their inference function + if (is_variadic_output && !is_homogeneous) { break; } + + auto hit = selected_type_constraints.find(output_name); + if (hit != selected_type_constraints.end()) { + const auto& types = hit->second; + assert(!types.empty()); + + if (types.size() == 1) { + // Use the constraint type + output_type->mutable_tensor_type()->set_elem_type( + types[0]->GetTypeProto()->tensor_type().elem_type()); + } else if (!is_variadic_input || is_homogeneous_input) { + // If not variadic or homogeneous, and there are multiple types possible, guess from the last input type + // as this works for symmetric varied single input/outputs + // otherwise give up and let the user supply their own function + output_type->mutable_tensor_type()->set_elem_type(output_propagate); + } + } else { + ORT_THROW("[CustomOP type inferencing error]: no type constraint found for output: ", + output_name, " Op: ", schema.Name()); + } } } + #endif common::Status CreateCustomRegistry(gsl::span op_domains, @@ -1178,13 +1239,13 @@ common::Status CreateCustomRegistry(gsl::span op_domai } std::vector schemas; - for (auto schema_iter : schema_map) { - schemas.push_back(schema_iter.second); - InlinedVector kernel_defs = std::move(kernel_def_map[schema_iter.first]); + for (auto& [name, schema] : schema_map) { + schemas.push_back(schema); auto infer_fn = schemas.back().GetTypeAndShapeInferenceFunction(); ONNX_NAMESPACE::InferenceFunction extended_infer_fn = - [infer_fn, kernel_defs](ONNX_NAMESPACE::InferenceContext& infer_ctx) { - InferOutputTypes(kernel_defs, infer_ctx); + [sch = schema, infer_fn = std::move(infer_fn), + kernel_defs = std::move(kernel_def_map[name])](ONNX_NAMESPACE::InferenceContext& infer_ctx) { + InferOutputTypes(sch, kernel_defs, infer_ctx); if (infer_fn) { infer_fn(infer_ctx); } diff --git a/onnxruntime/python/onnxruntime_pybind_state.cc b/onnxruntime/python/onnxruntime_pybind_state.cc index e5e0e81cb7da8..7b56f0c68427a 100644 --- a/onnxruntime/python/onnxruntime_pybind_state.cc +++ b/onnxruntime/python/onnxruntime_pybind_state.cc @@ -937,6 +937,20 @@ std::unique_ptr CreateExecutionProviderInstance( ORT_THROW("Invalid value passed for disable_dynamic_shapes: ", option.second); } OV_provider_options_map[option.first] = option.second; + } else if (option.first == "enable_dynamic_shapes") { + LOGS_DEFAULT(WARNING) << " Deprecation notice - 'enable_dynamic_shapes' is deprecated. Use the 'disable_dynamic_shapes' parameter instead."
+ " Please refer to https://onnxruntime.ai/docs/execution-providers/OpenVINO-ExecutionProvider.html#requirements to ensure all dependencies are met."; + std::string value; + if (!(option.second == "True" || option.second == "true" || + option.second == "False" || option.second == "false")) { + ORT_THROW("Invalid value passed for enable_dynamic_shapes: ", option.second); + } + if (option.second == "True" || option.second == "true") { + value = "false"; + } else { + value = "true"; + } + OV_provider_options_map["disable_dynamic_shapes"] = value; } else if (option.first == "device_id") { OV_provider_options_map[option.first] = option.second; continue; @@ -967,7 +981,7 @@ std::unique_ptr CreateExecutionProviderInstance( if (!Env::Default().GetEnvironmentVar("INTEL_OPENVINO_DIR").empty()) { ORT_THROW("INTEL_OPENVINO_DIR is set but OpenVINO library wasn't able to be loaded. Please install a supported version of OpenVINO as mentioned in the requirements page (https://onnxruntime.ai/docs/execution-providers/OpenVINO-ExecutionProvider.html#requirements), ensure dependency libraries are in the PATH and your hardware is supported."); } else { - LOGS_DEFAULT(WARNING) << "Failed to create " << type << ". Please reference https://onnxruntime.ai/docs/execution-providers/OpenVINO-ExecutionProvider.html#requirements to ensure all dependencies are met."; + LOGS_DEFAULT(WARNING) << "Failed to create " << type << ". Please refer to https://onnxruntime.ai/docs/execution-providers/OpenVINO-ExecutionProvider.html#requirements to ensure all dependencies are met."; } } #endif diff --git a/onnxruntime/python/onnxruntime_pybind_state_common.h b/onnxruntime/python/onnxruntime_pybind_state_common.h index 6827f2c9dfd91..22314610dbee9 100644 --- a/onnxruntime/python/onnxruntime_pybind_state_common.h +++ b/onnxruntime/python/onnxruntime_pybind_state_common.h @@ -60,11 +60,8 @@ struct OrtStatus { #elif OPENVINO_CONFIG_GPU_FP16 #define BACKEND_OPENVINO "-OPENVINO_GPU_FP16" -#elif OPENVINO_CONFIG_NPU_FP16 -#define BACKEND_OPENVINO "-OPENVINO_NPU_FP16" - -#elif OPENVINO_CONFIG_NPU_U8 -#define BACKEND_OPENVINO "-OPENVINO_NPU_U8" +#elif OPENVINO_CONFIG_NPU +#define BACKEND_OPENVINO "-OPENVINO_NPU" #elif OPENVINO_CONFIG_MULTI #define BACKEND_OPENVINO "-OPENVINO_MULTI" diff --git a/onnxruntime/python/onnxruntime_validation.py b/onnxruntime/python/onnxruntime_validation.py index 16cbc8e8099e1..10d9f469863c4 100644 --- a/onnxruntime/python/onnxruntime_validation.py +++ b/onnxruntime/python/onnxruntime_validation.py @@ -22,7 +22,7 @@ def check_distro_info(): __my_distro__ = __my_system__ __my_distro_ver__ = platform.release().lower() - if __my_distro_ver__ != "10": + if __my_distro_ver__ not in ["10", "11"]: warnings.warn( "Unsupported Windows version (%s). ONNX Runtime supports Windows 10 and above, only."
% __my_distro_ver__ diff --git a/onnxruntime/python/tools/tensorrt/perf/mem_test/run.sh b/onnxruntime/python/tools/tensorrt/perf/mem_test/run.sh index dd53fe6127462..2cfdd39bc96aa 100755 --- a/onnxruntime/python/tools/tensorrt/perf/mem_test/run.sh +++ b/onnxruntime/python/tools/tensorrt/perf/mem_test/run.sh @@ -4,13 +4,14 @@ set -x -while getopts p:o:l:s: parameter +while getopts p:o:l:s:c: parameter do case "${parameter}" in p) WORKSPACE=${OPTARG};; o) ORT_BINARY_PATH=${OPTARG};; l) BUILD_ORT_LATEST=${OPTARG};; s) ORT_SOURCE=${OPTARG};; +c) CONCURRENCY=${OPTARG};; esac done @@ -104,6 +105,26 @@ fi mv valgrind.log result +# Concurrency Test +FRCNN_FOLDER="/data/ep-perf-models/onnx-zoo-models/FasterRCNN-10/" + +mkdir FasterRCNN-10/ +cp -r ${FRCNN_FOLDER}/test_data_set_0 ${FRCNN_FOLDER}/faster_rcnn_R_50_FPN_1x.onnx ./FasterRCNN-10/ + +# replicate test inputs +for (( i=1; i concurrency_test.log 2>&1 +mv concurrency_test.log result + # Run AddressSanitizer ASAN_OPTIONS=${ASAN_OPTIONS} ./onnx_memtest diff --git a/onnxruntime/python/tools/tensorrt/perf/mem_test/run_mem_test_docker.sh b/onnxruntime/python/tools/tensorrt/perf/mem_test/run_mem_test_docker.sh index 4e94c63ee6c25..a355e4cf5d365 100755 --- a/onnxruntime/python/tools/tensorrt/perf/mem_test/run_mem_test_docker.sh +++ b/onnxruntime/python/tools/tensorrt/perf/mem_test/run_mem_test_docker.sh @@ -3,13 +3,14 @@ set -x # Parse Arguments -while getopts w:d:p:l: parameter +while getopts w:d:p:l:c: parameter do case "${parameter}" in w) WORKSPACE=${OPTARG};; # workspace folder of onnxruntime d) DOCKER_IMAGE=${OPTARG};; # docker image:"trt-ep-mem-test" docker image is already pre-built on perf machine p) MEM_TEST_DIR=${OPTARG};; # mem test dir l) BUILD_ORT_LATEST=${OPTARG};; # whether to build latest ORT +c) CONCURRENCY=${OPTARG};; esac done @@ -24,4 +25,4 @@ then BUILD_ORT_LATEST="true" fi -docker run --rm --gpus all -v $MEM_TEST_DIR:$DOCKER_MEM_TEST_DIR -v /data/ep-perf-models:/data/ep-perf-models $DOCKER_IMAGE /bin/bash $DOCKER_MEM_TEST_DIR'run.sh' -p $DOCKER_MEM_TEST_DIR -o $DOCKER_ORT_LIBS -s $DOCKER_ORT_SOURCE -l $BUILD_ORT_LATEST +docker run --rm --gpus all -v $MEM_TEST_DIR:$DOCKER_MEM_TEST_DIR -v /data/ep-perf-models:/data/ep-perf-models $DOCKER_IMAGE /bin/bash $DOCKER_MEM_TEST_DIR'run.sh' -p $DOCKER_MEM_TEST_DIR -o $DOCKER_ORT_LIBS -s $DOCKER_ORT_SOURCE -l $BUILD_ORT_LATEST -c $CONCURRENCY diff --git a/onnxruntime/python/tools/tensorrt/perf/post.py b/onnxruntime/python/tools/tensorrt/perf/post.py index 363fa3a96d283..df389ad572596 100644 --- a/onnxruntime/python/tools/tensorrt/perf/post.py +++ b/onnxruntime/python/tools/tensorrt/perf/post.py @@ -3,6 +3,7 @@ # Licensed under the MIT License. 
# -------------------------------------------------------------------------- import argparse +import csv import datetime import os import sys @@ -419,10 +420,11 @@ def main(): upload_time = datetime.datetime.now(tz=datetime.timezone.utc).replace(microsecond=0) try: + # Load EP Perf test results from /result result_file = args.report_folder - - folders = os.listdir(result_file) - os.chdir(result_file) + result_perf_test_path = os.path.join(result_file, "result") + folders = os.listdir(result_perf_test_path) + os.chdir(result_perf_test_path) tables = [ fail_name, @@ -445,13 +447,13 @@ def main(): for model_group in folders: os.chdir(model_group) csv_filenames = os.listdir() - for csv in csv_filenames: - table = pd.read_csv(csv) - if session_name in csv: + for csv_file in csv_filenames: + table = pd.read_csv(csv_file) + if session_name in csv_file: table_results[session_name] = pd.concat( [table_results[session_name], get_session(table, model_group)], ignore_index=True ) - elif specs_name in csv: + elif specs_name in csv_file: table_results[specs_name] = pd.concat( [ table_results[specs_name], @@ -459,12 +461,12 @@ def main(): ], ignore_index=True, ) - elif fail_name in csv: + elif fail_name in csv_file: table_results[fail_name] = pd.concat( [table_results[fail_name], get_failures(table, model_group)], ignore_index=True, ) - elif latency_name in csv: + elif latency_name in csv_file: table_results[memory_name] = pd.concat( [table_results[memory_name], get_memory(table, model_group)], ignore_index=True, @@ -474,11 +476,11 @@ def main(): [table_results[latency_name], get_latency(table, model_group)], ignore_index=True, ) - elif status_name in csv: + elif status_name in csv_file: table_results[status_name] = pd.concat( [table_results[status_name], get_status(table, model_group)], ignore_index=True ) - elif op_metrics_name in csv: + elif op_metrics_name in csv_file: table = table.assign(Group=model_group) table_results[op_metrics_name] = pd.concat( [table_results[op_metrics_name], table], ignore_index=True @@ -512,6 +514,43 @@ def main(): args.commit_datetime, ) + # Load concurrency test results + result_mem_test_path = os.path.join(result_file, "result_mem_test") + os.chdir(result_mem_test_path) + log_path = "concurrency_test.log" + if os.path.exists(log_path): + print("Generating concurrency test report") + with open(log_path) as log_file: + log_content = log_file.read() + + failed_cases_section = log_content.split("Failed Test Cases:")[1] + + # passed = 1 if no failed test cases + if failed_cases_section.strip() == "": + passed = 1 + else: + passed = 0 + + csv_path = "concurrency_test.csv" + with open(csv_path, "w", newline="") as csv_file: + csv_writer = csv.writer(csv_file) + csv_writer.writerow(["Passed", "Log"]) + csv_writer.writerow([passed, log_content]) + + db_table_name = "ep_concurrencytest_record" + table = pd.read_csv(csv_path) + write_table( + ingest_client, + args.database, + table, + db_table_name, + upload_time, + identifier, + args.branch, + args.commit_hash, + args.commit_datetime, + ) + except BaseException as e: print(str(e)) sys.exit(1) diff --git a/onnxruntime/python/tools/transformers/models/llama/README.md b/onnxruntime/python/tools/transformers/models/llama/README.md index f9552e02d74b9..2e8cd3e1ac7f9 100644 --- a/onnxruntime/python/tools/transformers/models/llama/README.md +++ b/onnxruntime/python/tools/transformers/models/llama/README.md @@ -1,7 +1,14 @@ # Contents - [LLaMA-2](#llama-2) + - [Prerequisites](#prerequisites) - [Exporting LLaMA-2](#exporting-llama-2) + - 
[Examples of Exporting LLaMA-2](#examples-of-exporting-llama-2) + - [Parity Checking LLaMA-2](#parity-checking-llama-2) - [Benchmarking LLaMA-2](#benchmark-llama-2) + - [Variants](#variants) + - [Benchmark All](#benchmark-all) + - [Benchmark E2E](#benchmark-e2e) + - [E2E Inference with LLaMA-2](#e2e-inference-with-llama-2) - [Mistral](#mistral) - [Exporting Mistral](#exporting-mistral) - [Optimizing and Quantizing Mistral](#optimizing-and-quantizing-mistral) @@ -229,6 +236,55 @@ $ ./build.sh --config Release --use_cuda --cuda_home /usr/local/cuda-12.2 --cudn $ CUDA_VISIBLE_DEVICES=0,1,2,3 bash convert_70b_model.sh 4 -m meta-llama/Llama-2-70b-hf --output llama2-70b-distributed --precision fp16 --execution_provider cuda --use_gqa ``` +## Parity Checking LLaMA-2 + +Here are some examples of how you can use the parity checker to verify your LLaMA-2 ONNX model. + +1. Merged ONNX model, FP32 CPU +``` +CUDA_VISIBLE_DEVICES=0 python3 -m models.llama.llama_parity \ + --model_name meta-llama/Llama-2-7b-hf \ + --onnx_model_path ./llama2-7b/rank_0_Llama-2-7b-hf_decoder_merged_model_fp32.onnx \ + --merged \ + --execution_provider cpu \ + --precision fp32 \ + --cache_dir ./model_cache \ +``` + +2. Merged ONNX model, FP32 CUDA +``` +CUDA_VISIBLE_DEVICES=0 python3 -m models.llama.llama_parity \ + --model_name meta-llama/Llama-2-7b-hf \ + --onnx_model_path ./llama2-7b/rank_0_Llama-2-7b-hf_decoder_merged_model_fp32.onnx \ + --merged \ + --execution_provider cuda \ + --precision fp32 \ + --cache_dir ./model_cache \ +``` + +3. Merged ONNX model, FP16 CUDA +``` +CUDA_VISIBLE_DEVICES=0 python3 -m models.llama.llama_parity \ + --model_name meta-llama/Llama-2-7b-hf \ + --onnx_model_path ./llama2-7b/rank_0_Llama-2-7b-hf_decoder_merged_model_fp32.onnx \ + --merged \ + --execution_provider cuda \ + --precision fp16 \ + --cache_dir ./model_cache \ +``` + +4. Merged ONNX model, FP16 CUDA with GroupQueryAttention +``` +CUDA_VISIBLE_DEVICES=0 python3 -m models.llama.llama_parity \ + --model_name meta-llama/Llama-2-7b-hf \ + --onnx_model_path ./llama2-7b/rank_0_Llama-2-7b-hf_decoder_merged_model_fp32.onnx \ + --merged \ + --use_gqa \ + --execution_provider cuda \ + --precision fp16 \ + --cache_dir ./model_cache \ +``` + ## Benchmark LLaMA-2 Here are some examples of how you can benchmark LLaMA-2. @@ -240,6 +296,7 @@ Here are some examples of how you can benchmark LLaMA-2. 
CUDA_VISIBLE_DEVICES=0 python3 -m models.llama.benchmark \ --benchmark-type hf-pt-eager \ --model-name meta-llama/Llama-2-7b-hf \ + --cache-dir ./model_cache \ --precision fp32 \ --batch-sizes "1 2" \ --sequence-lengths "8 16" \ @@ -252,6 +309,7 @@ CUDA_VISIBLE_DEVICES=0 python3 -m models.llama.benchmark \ CUDA_VISIBLE_DEVICES=0 python3 -m models.llama.benchmark \ --benchmark-type hf-pt-compile \ --model-name meta-llama/Llama-2-7b-hf \ + --cache-dir ./model_cache \ --precision fp16 \ --batch-sizes "1 2" \ --sequence-lengths "8 16" \ @@ -265,6 +323,7 @@ CUDA_VISIBLE_DEVICES=0 python3 -m models.llama.benchmark \ --benchmark-type hf-ort \ --hf-ort-dir-path ./Llama-2-7b-hf-onnx/ \ --model-name meta-llama/Llama-2-7b-hf \ + --cache-dir ./model_cache \ --precision fp32 \ --batch-sizes "1 2" \ --sequence-lengths "8 16" \ @@ -278,6 +337,7 @@ CUDA_VISIBLE_DEVICES=0 python3 -m models.llama.benchmark \ --benchmark-type hf-ort \ --hf-ort-dir-path ./Llama-2-7b-hf-onnx/ \ --model-name meta-llama/Llama-2-7b-hf \ + --cache-dir ./model_cache \ --precision fp16 \ --batch-sizes "1 2" \ --sequence-lengths "8 16" \ @@ -291,6 +351,7 @@ CUDA_VISIBLE_DEVICES=0 python3 -m models.llama.benchmark \ --benchmark-type ort-msft \ --ort-model-path ./llama-2-onnx/7B_float32/ONNX/LlamaV2_7B_float32.onnx \ --model-name meta-llama/Llama-2-7b-hf \ + --cache-dir ./model_cache \ --precision fp32 \ --batch-sizes "1 2" \ --sequence-lengths "8 16" \ @@ -303,6 +364,7 @@ CUDA_VISIBLE_DEVICES=0 python3 -m models.llama.benchmark \ --benchmark-type ort-msft \ --ort-model-path ./llama-2-onnx/7B_float16/ONNX/LlamaV2_7B_float16.onnx \ --model-name meta-llama/Llama-2-7b-hf \ + --cache-dir ./model_cache \ --precision fp16 \ --batch-sizes "1 2" \ --sequence-lengths "8 16" \ @@ -315,6 +377,7 @@ CUDA_VISIBLE_DEVICES=1 python3 -m models.llama.benchmark \ --benchmark-type ort-convert-to-onnx \ --ort-model-path ./llama2-7b/rank_0_Llama-2-7b-hf_decoder_merged_model_fp32.onnx \ --model-name meta-llama/Llama-2-7b-hf \ + --cache-dir ./model_cache \ --precision fp32 \ --batch-sizes "1 2" \ --sequence-lengths "8 16" \ @@ -327,6 +390,7 @@ CUDA_VISIBLE_DEVICES=4 python3 -m models.llama.benchmark \ --benchmark-type ort-convert-to-onnx \ --ort-model-path ./llama2-7b/rank_0_Llama-2-7b-hf_decoder_merged_model_fp16.onnx \ --model-name meta-llama/Llama-2-7b-hf \ + --cache-dir ./model_cache \ --precision fp16 \ --batch-sizes "1 2" \ --sequence-lengths "8 16" \ @@ -339,6 +403,7 @@ CUDA_VISIBLE_DEVICES=4,5,6,7 bash benchmark_70b_model.sh 4 \ --benchmark-type ort-convert-to-onnx \ --ort-model-path ./llama2-70b-dis/rank_{}_Llama-2-70b-hf_decoder_merged_model_fp16.onnx \ --model-name meta-llama/Llama-2-70b-hf \ + --cache-dir ./model_cache \ --precision fp16 \ --device cuda \ --warmup-runs 5 \ @@ -357,6 +422,7 @@ CUDA_VISIBLE_DEVICES=0 python3 -m models.llama.benchmark_all \ --ort-convert-to-onnx-model-path ./llama2-7b-fp16/Llama-2-7b-hf_decoder_merged_model_fp16.onnx \ --ort-msft-model-path ./llama-2-onnx/7B_float16/ONNX/LlamaV2_7B_float16.onnx \ --model-name meta-llama/Llama-2-7b-hf \ + --cache-dir ./model_cache \ --precision fp16 \ --batch-sizes "1 2" \ --sequence-lengths "8 16" \ @@ -366,6 +432,72 @@ CUDA_VISIBLE_DEVICES=0 python3 -m models.llama.benchmark_all \ --timeout 60 # number of minutes before moving to the next benchmark ``` +### Benchmark E2E +You can use `benchmark_e2e.py` to benchmark the full end-to-end scenario and automatically store the results in a CSV file. This tool uses `argmax` for sampling to standardize the benchmarking process. + +1. 
PyTorch without `torch.compile`, FP32 +``` +CUDA_VISIBLE_DEVICES=0 python3 -m models.llama.benchmark_e2e \ + --benchmark-type pt-eager \ + --model-name meta-llama/Llama-2-7b-hf \ + --cache-dir ./model_cache \ + --prompts-file ./models/llama/prompts.json \ + --precision fp32 \ + --batch-sizes "1 2" \ + --prompt-lengths "16 64" \ + --device cpu \ + --auth +``` + +2. PyTorch with `torch.compile`, FP16 +``` +CUDA_VISIBLE_DEVICES=0 python3 -m models.llama.benchmark_e2e \ + --benchmark-type pt-compile \ + --model-name meta-llama/Llama-2-7b-hf \ + --cache-dir ./model_cache \ + --prompts-file ./models/llama/prompts.json \ + --precision fp16 \ + --batch-sizes "1 2" \ + --prompt-lengths "16 64" \ + --device cuda \ + --auth +``` + +3. ONNX Runtime with `convert_to_onnx`, FP32 +``` +CUDA_VISIBLE_DEVICES=0 python3 -m models.llama.benchmark_e2e \ + --benchmark-type ort \ + --model-name meta-llama/Llama-2-7b-hf \ + --cache-dir ./model_cache \ + --onnx-model-path ./llama2-7b/rank_0_Llama-2-7b-hf_decoder_merged_model_fp32.onnx \ + --prompts-file ./models/llama/prompts.json \ + --precision fp32 \ + --batch-sizes "1 2" \ + --prompt-lengths "16 64" \ + --device cpu \ + --auth +``` + +4. ONNX Runtime with `convert_to_onnx`, FP16 +``` +CUDA_VISIBLE_DEVICES=0 python3 -m models.llama.benchmark_e2e \ + --benchmark-type ort \ + --model-name meta-llama/Llama-2-7b-hf \ + --cache-dir ./model_cache \ + --onnx-model-path ./llama2-7b/rank_0_Llama-2-7b-hf_decoder_merged_model_fp32.onnx \ + --prompts-file ./models/llama/prompts.json \ + --precision fp16 \ + --batch-sizes "1 2" \ + --prompt-lengths "16 64" \ + --device cuda \ + --use_buffer_share \ + --auth +``` + +## E2E Inference with LLaMA-2 + +For end-to-end inference, please visit the [ONNX Runtime Inference Examples folder](https://github.com/microsoft/onnxruntime-inference-examples/tree/main/python/models/llama) for a step-by-step walkthrough, code examples, and performance metrics. + # Mistral ## Introduction diff --git a/onnxruntime/python/tools/transformers/models/llama/benchmark.py b/onnxruntime/python/tools/transformers/models/llama/benchmark.py index bfe108d21a595..6184298c471ac 100644 --- a/onnxruntime/python/tools/transformers/models/llama/benchmark.py +++ b/onnxruntime/python/tools/transformers/models/llama/benchmark.py @@ -1,3 +1,8 @@ +# ------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. See License.txt in the project root for +# license information. 
+# -------------------------------------------------------------------------- import argparse import datetime import gc @@ -14,11 +19,12 @@ from benchmark_helper import measure_memory, setup_logger from dist_settings import get_rank, get_size from llama_inputs import ( - add_io_bindings, + add_io_bindings_as_ortvalues, get_merged_sample_with_past_kv_inputs, get_msft_sample_inputs, get_sample_inputs, get_sample_with_past_kv_inputs, + verify_ort_inputs, ) from optimum.onnxruntime import ORTModelForCausalLM from torch.profiler import ProfilerActivity, profile, record_function @@ -199,6 +205,7 @@ def get_model(args: argparse.Namespace): torch_dtype=torch.float16 if args.use_fp16 else torch.float32, use_auth_token=args.auth, use_cache=True, + cache_dir=args.cache_dir, ).to(args.target_device) end_time = time.time() @@ -444,24 +451,12 @@ def get_logits(inputs): def run_ort_inference(args, init_inputs, iter_inputs, model): def prepare_ort_inputs(inputs, kv_cache_ortvalues): - # Check that all model inputs will be provided - model_inputs = set(map(lambda model_input: model_input.name, model.get_inputs())) - user_inputs = set(inputs.keys()) - missing_inputs = model_inputs - user_inputs - if len(missing_inputs): - logger.error(f"The following model inputs are missing: {missing_inputs}") - raise Exception("There are missing inputs to the model. Please add them and try again.") - - # Remove unnecessary inputs from model inputs - unnecessary_inputs = user_inputs - model_inputs - if len(unnecessary_inputs): - for unnecessary_input in unnecessary_inputs: - logger.info(f"Removing unnecessary input '{unnecessary_input}' from user provided inputs") - del inputs[unnecessary_input] + # Verify model inputs + inputs = verify_ort_inputs(model, inputs) # Add IO bindings for non-CPU execution providers if args.device != "cpu": - io_binding, kv_cache_ortvalues = add_io_bindings( + io_binding, kv_cache_ortvalues = add_io_bindings_as_ortvalues( model, inputs, args.device, int(args.rank), args.use_gqa, kv_cache_ortvalues ) setattr(args, "io_binding", io_binding) # noqa: B010 @@ -612,6 +607,13 @@ def get_args(rank=0): parser.add_argument("--pt-num-rows", type=int, default=1000, help="Number of rows for PyTorch profiler to display") parser.add_argument("--verbose", default=False, action="store_true") parser.add_argument("--log-folder", type=str, default=os.path.join("."), help="Folder to cache log files") + parser.add_argument( + "--cache-dir", + type=str, + required=True, + default="./model_cache", + help="Cache dir where Hugging Face files are stored", + ) args = parser.parse_args() @@ -662,8 +664,8 @@ def main(): args.rank = rank args.world_size = world_size - tokenizer = AutoTokenizer.from_pretrained(args.model_name) - config = AutoConfig.from_pretrained(args.model_name) + tokenizer = AutoTokenizer.from_pretrained(args.model_name, cache_dir=args.cache_dir) + config = AutoConfig.from_pretrained(args.model_name, cache_dir=args.cache_dir) target_device = f"cuda:{args.rank}" if args.device != "cpu" else args.device use_fp16 = args.precision == "fp16" diff --git a/onnxruntime/python/tools/transformers/models/llama/benchmark_all.py b/onnxruntime/python/tools/transformers/models/llama/benchmark_all.py index c6d550d47cf4c..2433ae3d9b5ee 100644 --- a/onnxruntime/python/tools/transformers/models/llama/benchmark_all.py +++ b/onnxruntime/python/tools/transformers/models/llama/benchmark_all.py @@ -1,3 +1,8 @@ +# ------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. 
All rights reserved. +# Licensed under the MIT License. See License.txt in the project root for +# license information. +# -------------------------------------------------------------------------- import argparse import datetime import json @@ -78,6 +83,13 @@ def get_args(): help="Path to ONNX model from convert_to_onnx", ) + parser.add_argument( + "--cache-dir", + type=str, + default="./model_cache", + help="Cache dir where Hugging Face files are stored", + ) + parser.add_argument( "--model-name", type=str, @@ -332,6 +344,8 @@ def main(): str(args.num_runs), "--log-folder", args.log_folder, + "--cache-dir", + args.cache_dir, "--auth", ] logger.info("Benchmark PyTorch without torch.compile") @@ -362,6 +376,8 @@ def main(): str(args.num_runs), "--log-folder", args.log_folder, + "--cache-dir", + args.cache_dir, "--auth", ] logger.info("Benchmark PyTorch with torch.compile") @@ -394,6 +410,8 @@ def main(): str(args.num_runs), "--log-folder", args.log_folder, + "--cache-dir", + args.cache_dir, "--auth", ] logger.info("Benchmark Optimum + ONNX Runtime") @@ -426,6 +444,8 @@ def main(): str(args.num_runs), "--log-folder", args.log_folder, + "--cache-dir", + args.cache_dir, ] logger.info("Benchmark Microsoft model in ONNX Runtime") results = benchmark(args, benchmark_cmd, "ort-msft") @@ -457,6 +477,8 @@ def main(): str(args.num_runs), "--log-folder", args.log_folder, + "--cache-dir", + args.cache_dir, ] logger.info("Benchmark convert_to_onnx model in ONNX Runtime") results = benchmark(args, benchmark_cmd, "onnxruntime") diff --git a/onnxruntime/python/tools/transformers/models/llama/benchmark_e2e.py b/onnxruntime/python/tools/transformers/models/llama/benchmark_e2e.py new file mode 100644 index 0000000000000..4d0d2e68e8983 --- /dev/null +++ b/onnxruntime/python/tools/transformers/models/llama/benchmark_e2e.py @@ -0,0 +1,554 @@ +# ------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. See License.txt in the project root for +# license information. +# -------------------------------------------------------------------------- + +# This is an end-to-end benchmarking script for the Hugging Face LLaMA-2 model. 
+# +# Prerequisites: +# 1) Install `huggingface-cli`: +# +# $ pip install huggingface_hub +# +# 2) Authenticate with Hugging Face's CLI: +# +# $ huggingface-cli login +# +# 3) Accept Meta's license in Hugging Face to access the models at https://huggingface.co/meta-llama/ +# +# 4) Install the latest ONNX Runtime version +# +# $ pip install onnxruntime-gpu + +from __future__ import annotations + +import argparse +import datetime +import gc +import itertools +import json +import logging +import os +import textwrap +import time + +import numpy as np +import pandas as pd +import torch +from benchmark_helper import setup_logger +from llama_inputs import add_io_bindings_as_tensors, get_initial_inputs_and_outputs +from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer + +import onnxruntime as ort + +logger = logging.getLogger(__name__) + + +def get_model(args): + if args.benchmark_type in {"pt-eager", "pt-compile"}: + model = AutoModelForCausalLM.from_pretrained( + args.hf_dir_path if args.hf_dir_path != "" else args.model_name, + cache_dir=args.cache_dir, + torch_dtype=args.torch_dtype, + use_auth_token=args.auth, + use_cache=True, + ).to(args.target_device) + model.eval() + + if args.benchmark_type == "pt-compile": + model = torch.compile(model) + + else: + sess_options = ort.SessionOptions() + ep = ( + ("CUDAExecutionProvider", {"device_id": args.device_id}) + if args.device == "cuda" + else "CPUExecutionProvider" + ) + model = ort.InferenceSession(args.onnx_model_path, sess_options=sess_options, providers=[ep]) + + return model + + +def run_inference(args, model, runs, inputs, outputs): + if args.benchmark_type == "pt-compile": + with torch.no_grad(): + outputs = model(**inputs) + + # Synchronize inputs + io_binding = None + if args.benchmark_type in {"pt-eager", "pt-compile"}: + if args.device != "cpu": + torch.cuda.synchronize(args.target_device) + else: + io_binding = add_io_bindings_as_tensors(model, inputs, outputs, args.use_fp16, args.use_buffer_share) + io_binding.synchronize_inputs() + + # Run inference + start = time.perf_counter() + for _ in range(runs): + if args.benchmark_type in {"pt-eager", "pt-compile"}: + with torch.no_grad(): + outputs = model(**inputs) + if args.device != "cpu": + torch.cuda.synchronize(args.target_device) + else: + model.run_with_iobinding(io_binding) + io_binding.synchronize_outputs() + + end = time.perf_counter() + avg = (end - start) / runs + return avg, outputs + + +def prepare_model_for_inference(args, model, config, tokenizer, prompt_length, prompt): + clear_cache() + inputs, outputs = get_initial_inputs_and_outputs( + config, tokenizer, prompt_length, prompt, args.target_device, args.use_fp16, args.use_buffer_share, args.engine + ) + _, outputs = run_inference(args, model, args.warmup_runs, inputs, outputs) + return inputs, outputs + + +def clear_cache(): + gc.collect() + torch.cuda.empty_cache() + + +def save_results(results, filename, gen_length): + df = pd.DataFrame( + results, + columns=[ + "Batch Size", + "Prompt Length", + "Prompt Processing Latency (ms)", + "Prompt Processing Throughput (tps)", + "Sampling Latency (ms)", + "Sampling Throughput (tps)", + "First Token Generated Latency (ms)", + "First Token Generated Throughput (tps)", + f"Average Latency of First {gen_length // 2} Tokens Generated (ms)", + f"Average Throughput of First {gen_length // 2} Tokens Generated (tps)", + f"Average Latency of First {gen_length} Tokens Generated (ms)", + f"Average Throughput of First {gen_length} Tokens Generated (tps)", + "Wall-Clock 
Latency (s)", + "Wall-Clock Throughput (tps)", + ], + ) + + df.to_csv(filename, index=False) + logger.info(f"Results saved in {filename}!") + + +def get_args(): + parser = argparse.ArgumentParser() + + parser.add_argument( + "-bt", + "--benchmark-type", + type=str, + required=True, + choices=["pt-eager", "pt-compile", "ort"], + ) + + parser.add_argument( + "-m", + "--model-name", + type=str, + required=False, + help="Hugging Face name of model (e.g. 'meta-llama/Llama-2-7b-hf')", + ) + + parser.add_argument( + "-a", + "--auth", + default=False, + action="store_true", + help="Use Hugging Face authentication token to access model", + ) + + parser.add_argument( + "-c", + "--cache-dir", + type=str, + default=os.path.join(".", "model_cache"), + help="Path to directory containing all Hugging Face files (e.g. config, tokenizer, PyTorch model). Use when loading model as `AutoModel.from_pretrained(model_name, cache_dir=cache_dir)`.", + ) + + parser.add_argument( + "--hf-dir-path", + type=str, + default="", + help="Path to directory containing all Hugging Face files (e.g. config, tokenizer, PyTorch model). Use when loading model as `AutoModel.from_pretrained(folder_path)`.", + ) + + parser.add_argument( + "-o", + "--onnx-model-path", + required=False, + help="Path to ONNX model", + ) + + parser.add_argument( + "-f", + "--prompts-file", + required=True, + default=os.path.join(".", "models", "llama", "prompts.json"), + help="JSON file containing entries in the format 'prompt length: prompt' where prompt length = tokenized length of prompt", + ) + + parser.add_argument( + "--use_buffer_share", + default=False, + action="store_true", + help="Use when GroupQueryAttention (GQA) is in ONNX model", + ) + + parser.add_argument( + "--anomaly-filtering", + default=False, + action="store_true", + help="Use this flag to filter anomaly accelerator times for tokens generated. \ + This may give more accurate latency and throughput metrics for tokens generated. \ + Wall-clock metrics are still reported with anomaly times though.", + ), + + parser.add_argument( + "-b", + "--batch-sizes", + default="1 2", + ) + + parser.add_argument( + "-s", + "--prompt-lengths", + default="32 64 128 256 512", + ) + + parser.add_argument( + "-p", + "--precision", + required=True, + type=str, + default="fp32", + choices=["int4", "int8", "fp16", "fp32"], + help="Precision for model. 
For ONNX models, the model's precision should be set before running this script.", + ) + + parser.add_argument( + "-g", + "--generation-length", + type=int, + default=256, + help="Number of new tokens to generate", + ) + + parser.add_argument( + "-d", + "--device", + type=str, + default="cuda" if torch.cuda.is_available() else "cpu", + choices=["cpu", "cuda"], + ) + + parser.add_argument("-id", "--device-id", type=int, default=0) + parser.add_argument("-w", "--warmup-runs", type=int, default=5) + parser.add_argument("-n", "--num-runs", type=int, default=100) + parser.add_argument("--seed", type=int, default=2) + + args = parser.parse_args() + + # Set seed properties + np.random.seed(args.seed) + torch.manual_seed(args.seed) + + # Set runtime properties + if "ort" in args.benchmark_type: + setattr(args, "execution_provider", f"{args.device.upper()}ExecutionProvider") # noqa: B010 + if args.execution_provider == "CUDAExecutionProvider": + args.execution_provider = (args.execution_provider, {"device_id": args.device_id}) + + # Check that paths have been specified for any benchmarking with ORT + if args.benchmark_type == "ort": + assert args.onnx_model_path, "Please specify a path to `--onnx-model-path`" + + args.batch_sizes = args.batch_sizes.split(" ") + args.prompt_lengths = args.prompt_lengths.split(" ") + + # Use FP32 precision for FP32, INT8, INT4 CPU models, use FP16 precision for FP16 and INT4 GPU models + args.precision = ( + "fp32" if args.precision in {"int8", "fp32"} or (args.precision == "int4" and args.device == "cpu") else "fp16" + ) + + target_device = f"cuda:{args.device_id}" if args.device != "cpu" else args.device + torch_dtype = torch.float16 if args.precision == "fp16" else torch.float32 + engine = "ort" if args.benchmark_type == "ort" else "pt" + setattr(args, "target_device", target_device) # noqa: B010 + setattr(args, "torch_dtype", torch_dtype) # noqa: B010 + setattr(args, "engine", engine) # noqa: B010 + setattr(args, "use_fp16", args.precision == "fp16") # noqa: B010 + + return args + + +def main(): + args = get_args() + setup_logger(False) + logger.info(args.__dict__) + + # Get prompts and prompt sizes + size_to_prompt = None + with open(args.prompts_file) as f: + size_to_prompt = json.load(f, object_hook=lambda d: {int(k): v for k, v in d.items()}) + + # Get config, tokenizer, and model + config = AutoConfig.from_pretrained( + args.hf_dir_path if args.hf_dir_path != "" else args.model_name, + cache_dir=args.cache_dir, + use_auth_token=args.auth, + ) + tokenizer = AutoTokenizer.from_pretrained( + args.hf_dir_path if args.hf_dir_path != "" else args.model_name, + cache_dir=args.cache_dir, + use_auth_token=args.auth, + ) + model = get_model(args) + + all_csv_metrics = [] + for batch_size, prompt_length in itertools.product(args.batch_sizes, args.prompt_lengths): + batch_size, prompt_length = int(batch_size), int(prompt_length) # noqa: PLW2901 + logger.info(f"Running batch size = {batch_size}, prompt length = {prompt_length}") + clear_cache() + max_length = prompt_length + args.generation_length + + if prompt_length not in size_to_prompt: + raise NotImplementedError( + textwrap.dedent( + f""" + A prompt of size {prompt_length} was not found in '{args.prompts_file}'. There are a couple of solutions to fix this. + 1) You can change one of the keys in '{args.prompts_file}' to be {prompt_length}. + If {prompt_length} < actual prompt's length, the benchmark E2E tool will repeat the first word in the prompt until {prompt_length} = actual prompt's length. 
+ If {prompt_length} > actual prompt's length, the benchmark E2E tool will automatically trim the actual prompt's length so that {prompt_length} = actual prompt's length. + 2) You can add a new key-value entry in '{args.prompts_file}' of the form '{prompt_length}': 'your prompt goes here'. + """ + ) + ) + prompt = [size_to_prompt[prompt_length]] * batch_size + csv_metrics = [batch_size, prompt_length] + + try: + # Measure prompt processing + logger.info("Measuring prompt processing...") + inputs, outputs = prepare_model_for_inference(args, model, config, tokenizer, prompt_length, prompt) + accelerator_prompt_latency_s, outputs = run_inference(args, model, args.num_runs, inputs, outputs) + + # Calculate prompt metrics + accelerator_prompt_latency_ms = accelerator_prompt_latency_s * 1000 + accelerator_prompt_thrpt = batch_size * (prompt_length / accelerator_prompt_latency_s) + logger.info(f"Average Latency of Prompt Processing: {accelerator_prompt_latency_ms} ms") + logger.info( + f"Average Throughput of Prompt Processing: {batch_size * (prompt_length / accelerator_prompt_latency_s)} tps" + ) + csv_metrics.extend([accelerator_prompt_latency_ms, accelerator_prompt_thrpt]) + + # Measure token generation + logger.info("Measuring token generation...") + clear_cache() + inputs, outputs = prepare_model_for_inference(args, model, config, tokenizer, prompt_length, prompt) + + all_token_ids = inputs["input_ids"].clone() + current_length = all_token_ids.shape[-1] + num_heads = config.num_key_value_heads + head_size = ( + config.head_dim if hasattr(config, "head_dim") else config.hidden_size // config.num_attention_heads + ) + + has_eos = torch.zeros(batch_size, device=args.target_device, dtype=torch.bool) + + # 0th entry will have prompt accelerator time, 1st entry onwards will have token generation accelerator time + accelerator_times = [] + sampling_times = [] # cost to sample after each model run + + wall_clock_start_time = time.perf_counter() + while current_length <= max_length: + # Run inference + accelerator_time_latency_s, outputs = run_inference(args, model, 1, inputs, outputs) + accelerator_times.append(accelerator_time_latency_s) + + # Sample with argmax (greedy search) + sampling_start_time = time.perf_counter() + if outputs["logits"].shape[1] > 1: + prompt_end_indices = inputs["attention_mask"].sum(1) - 1 + idxs = ( + prompt_end_indices.unsqueeze(dim=1) + .repeat(1, config.vocab_size) + .view(batch_size, 1, config.vocab_size) + ) + next_token_logits = torch.gather(outputs["logits"], 1, idxs).squeeze() + else: + next_token_logits = outputs["logits"][:, -1, :] + next_tokens = torch.argmax(next_token_logits, dim=-1) + + # Check if we previously reached EOS token id or if generated token id is EOS token id + has_eos = has_eos | next_tokens == tokenizer.eos_token_id + + # Determine which new tokens to add to list of all token ids + # Add EOS token ids for batch entries that ended early (ragged batching scenario where some batch entries ended early and some haven't) + tokens_to_add = next_tokens.masked_fill(has_eos, tokenizer.eos_token_id).reshape([batch_size, 1]) + sampling_end_time = time.perf_counter() + sampling_times.append(sampling_end_time - sampling_start_time) + + all_token_ids = torch.cat([all_token_ids, tokens_to_add], dim=-1) + + # Return early if all batch entries have reached EOS token id + current_length += 1 + if torch.all(has_eos) or current_length > max_length: + break + + # Update inputs for next inference run + inputs["input_ids"] = tokens_to_add + inputs["attention_mask"] = 
torch.cat( + [inputs["attention_mask"], (~has_eos).to(torch.int64).reshape(batch_size, 1)], 1 + ) + inputs["position_ids"] = ( + None + if "position_ids" not in inputs + else torch.max(inputs["position_ids"], dim=1)[0].reshape(batch_size, 1) + 1 + ) + + # Set logits to zeros for next inference run and re-use memory buffer + if outputs["logits"].shape[1] != 1: + outputs["logits"] = outputs["logits"][:, :1, :].contiguous() + outputs["logits"].zero_() + + # Update KV caches for next inference run + if args.engine == "pt": + # Update KV caches for PyTorch + inputs["past_key_values"] = outputs["past_key_values"] + elif not args.use_buffer_share: + # Update KV caches for ONNX Runtime if buffer sharing is not used + for i in range(config.num_hidden_layers): + inputs[f"past_key_values.{i}.key"] = outputs[f"present.{i}.key"] + inputs[f"past_key_values.{i}.value"] = outputs[f"present.{i}.value"] + + new_sequence_length = inputs["attention_mask"].shape[1] + for i in range(config.num_hidden_layers): + present_key = torch.zeros( + batch_size, + num_heads, + new_sequence_length, + head_size, + device=args.target_device, + dtype=args.torch_dtype, + ) + present_value = torch.zeros( + batch_size, + num_heads, + new_sequence_length, + head_size, + device=args.target_device, + dtype=args.torch_dtype, + ) + outputs.update( + { + f"present.{i}.key": present_key.contiguous(), + f"present.{i}.value": present_value.contiguous(), + } + ) + + wall_clock_end_time = time.perf_counter() + + # Filter out any anomaly accelerator times (e.g. for `torch.compile`) + accelerator_times.pop(0) # Remove prompt processing time + if args.anomaly_filtering: + anomaly_threshold_factor = 10 + min_time_s = min(accelerator_times) + orig_size = len(accelerator_times) + accelerator_times = list( + filter(lambda acc_time: acc_time < anomaly_threshold_factor * min_time_s, accelerator_times) + ) + new_size = len(accelerator_times) + logger.info( + f"Filtered out {orig_size - new_size} anomaly accelerator times that are {anomaly_threshold_factor}x greater than {min_time_s * 1000} ms..." 
+ ) + + ####################################################### + # Calculate sampling and first token generated metrics + ####################################################### + + # Calculate sampling metrics + avg_sampling_latency_s = sum(sampling_times) / len(sampling_times) + avg_sampling_latency_ms = avg_sampling_latency_s * 1000 + avg_sampling_thrpt = batch_size * (1 / avg_sampling_latency_s) + logger.info(f"Average Latency of Sampling: {avg_sampling_latency_ms} ms") + logger.info(f"Average Throughput of Sampling: {avg_sampling_thrpt} tps") + + # Calculate first token generated metrics + first_token_latency_s = accelerator_times[0] + first_token_latency_ms = first_token_latency_s * 1000 + first_token_thrpt = batch_size * (1 / first_token_latency_s) + logger.info(f"Latency of First Token Generated: {first_token_latency_ms} ms") + logger.info(f"Throughput of First Token Generated: {first_token_thrpt} tps") + + #################################################### + # Calculate first `halfway` token generated metrics + #################################################### + + halfway = args.generation_length // 2 + halfway_token_latency_s = sum(accelerator_times[:halfway]) / len(accelerator_times[:halfway]) + halfway_token_latency_ms = halfway_token_latency_s * 1000 + halfway_token_thrpt = batch_size * (1 / halfway_token_latency_s) + logger.info(f"Average Latency of First {halfway} Tokens Generated: {halfway_token_latency_ms} ms") + logger.info(f"Average Throughput of First {halfway} Tokens Generated: {halfway_token_thrpt} tps") + + ######################################### + # Calculate all tokens generated metrics + ######################################### + + all_token_latency_s = sum(accelerator_times) / len(accelerator_times) + all_token_latency_ms = all_token_latency_s * 1000 + all_token_thrpt = batch_size * (1 / all_token_latency_s) + logger.info( + f"Average Latency of First {args.generation_length} Tokens Generated: {all_token_latency_ms} ms" + ) + logger.info(f"Average Throughput of First {args.generation_length} Tokens Generated: {all_token_thrpt} tps") + + ############################### + # Calculate wall clock metrics + ############################### + + wall_clock_latency_s = wall_clock_end_time - wall_clock_start_time + wall_clock_thrpt = batch_size * ((prompt_length + args.generation_length) / wall_clock_latency_s) + logger.info(f"Wall-Clock Latency: {wall_clock_latency_s} s") + logger.info( + f"Wall-Clock Throughput: {batch_size * ((prompt_length + args.generation_length) / wall_clock_latency_s)} tps" + ) + + # Add metrics to CSV + logger.info("Adding results to CSV") + csv_metrics.extend( + [ + avg_sampling_latency_ms, + avg_sampling_thrpt, + first_token_latency_ms, + first_token_thrpt, + halfway_token_latency_ms, + halfway_token_thrpt, + all_token_latency_ms, + all_token_thrpt, + wall_clock_latency_s, + wall_clock_thrpt, + ] + ) + all_csv_metrics.append(csv_metrics) + + except: # noqa: E722 + logger.info(f"Could not benchmark at batch size = {batch_size}, prompt length = {prompt_length}") + + filename = f"benchmark_{args.engine}_e2e_{datetime.datetime.now():%Y-%m-%d_%H:%M:%S}.csv" + save_results(all_csv_metrics, filename, args.generation_length) + + +if __name__ == "__main__": + main() diff --git a/onnxruntime/python/tools/transformers/models/llama/convert_to_onnx.py b/onnxruntime/python/tools/transformers/models/llama/convert_to_onnx.py index 1ad58327b7fc2..b649f7ab65049 100644 --- a/onnxruntime/python/tools/transformers/models/llama/convert_to_onnx.py +++ 
b/onnxruntime/python/tools/transformers/models/llama/convert_to_onnx.py @@ -1,3 +1,8 @@ +# ------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. See License.txt in the project root for +# license information. +# -------------------------------------------------------------------------- from __future__ import annotations import argparse diff --git a/onnxruntime/python/tools/transformers/models/llama/dist_settings.py b/onnxruntime/python/tools/transformers/models/llama/dist_settings.py index 72192ce8d8c63..3b53f60758b27 100644 --- a/onnxruntime/python/tools/transformers/models/llama/dist_settings.py +++ b/onnxruntime/python/tools/transformers/models/llama/dist_settings.py @@ -1,3 +1,8 @@ +# ------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. See License.txt in the project root for +# license information. +# -------------------------------------------------------------------------- import os import torch.distributed as dist diff --git a/onnxruntime/python/tools/transformers/models/llama/llama_inputs.py b/onnxruntime/python/tools/transformers/models/llama/llama_inputs.py index 18202f4b81c0f..5aed55c12f38f 100644 --- a/onnxruntime/python/tools/transformers/models/llama/llama_inputs.py +++ b/onnxruntime/python/tools/transformers/models/llama/llama_inputs.py @@ -1,8 +1,13 @@ +# ------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. See License.txt in the project root for +# license information. +# -------------------------------------------------------------------------- from __future__ import annotations import numpy as np import torch -from transformers import AutoConfig +from transformers import AutoConfig, AutoTokenizer from onnxruntime import InferenceSession, OrtValue @@ -269,6 +274,8 @@ def convert_inputs_for_ort( return ort_inputs +# Re-allocate KV caches from (batch_size, num_heads, past_sequence_length, head_size) to +# (batch_size, num_heads, max_sequence_length, head_size) for past-present buffer sharing def enable_past_present_share_buffer(ort_inputs: dict, past_seq_len: int, max_seq_len: int): for k, v in ort_inputs.items(): # Allocate new buffers with max_sequence_length for GQA @@ -281,8 +288,29 @@ def enable_past_present_share_buffer(ort_inputs: dict, past_seq_len: int, max_se return ort_inputs -# Add IO bindings for execution providers -def add_io_bindings( +# Verify ONNX Runtime inputs with model +def verify_ort_inputs(model: InferenceSession, ort_inputs: dict): + # Check that all model inputs will be provided + model_inputs = set(map(lambda model_input: model_input.name, model.get_inputs())) + user_inputs = set(ort_inputs.keys()) + missing_inputs = model_inputs - user_inputs + if len(missing_inputs): + print(f"The following model inputs are missing: {missing_inputs}") + raise Exception("There are missing inputs to the model. 
Please add them and try again.") + + # Remove unnecessary inputs from model inputs + unnecessary_inputs = user_inputs - model_inputs + if len(unnecessary_inputs): + for unnecessary_input in unnecessary_inputs: + print(f"Removing unnecessary input '{unnecessary_input}' from user provided inputs") + del ort_inputs[unnecessary_input] + + return ort_inputs + + +# Add IO bindings for execution providers using OrtValue +# Use when you need to run inference once or twice to save memory +def add_io_bindings_as_ortvalues( model: InferenceSession, ort_inputs: dict, device: str, device_id: int, use_gqa: bool, kv_cache_ortvalues: dict ): io_binding = model.io_binding() @@ -318,3 +346,163 @@ def add_io_bindings( io_binding.bind_output(name, device_type=device, device_id=device_id) return io_binding, kv_cache_ortvalues + + +# Add IO bindings for execution providers using PyTorch tensors +# Use when you need to run inference many times +def add_io_bindings_as_tensors( + model: InferenceSession, inputs: dict, outputs: dict, use_fp16: bool, use_buffer_share: bool +): + # Verify model inputs + inputs = verify_ort_inputs(model, inputs) + + device = None + pt_to_np = { + "torch.int32": np.int32, + "torch.int64": np.int64, + "torch.float16": np.float16, + "torch.float32": np.float32, + } + + # Bind inputs/outputs to IO binding + io_binding = model.io_binding() + for k, v in inputs.items(): + io_binding.bind_input( + name=k, + device_type=v.device.type, + device_id=0 if v.device.type == "cpu" else v.device.index, + element_type=pt_to_np[repr(v.dtype)], + shape=tuple(v.shape), + buffer_ptr=v.data_ptr(), + ) + device = v.device + + for output in model.get_outputs(): + name = output.name + if use_buffer_share and "present" in name: + # Bind KV cache outputs to KV cache inputs + v = inputs[name.replace("present", "past_key_values")] + io_binding.bind_output( + name=name, + device_type=v.device.type, + device_id=v.device.index, + element_type=np.float16, + shape=tuple(v.shape), + buffer_ptr=v.data_ptr(), + ) + else: + v = outputs[name] + io_binding.bind_output( + name=name, + device_type=device.type, + device_id=0 if device.type == "cpu" else device.index, + element_type=(np.float16 if use_fp16 else np.float32), + shape=tuple(v.shape), + buffer_ptr=v.data_ptr(), + ) + + return io_binding + + +# Get actual inputs when using real data (instead of sample data) and initialize outputs +def get_initial_inputs_and_outputs( + config: AutoConfig, + tokenizer: AutoTokenizer, + requested_length: int, + prompt: list[str], + device: torch.device, + use_fp16: bool, + use_buffer_share: bool, + engine: str, +): + tokenizer.pad_token = "[PAD]" + encodings_dict = tokenizer.batch_encode_plus(prompt, padding=True) + torch_dtype = torch.float16 if use_fp16 else torch.float32 + + # input_ids: pad token id is 0 + # attention_mask: pad token id is 0 + # position_ids: pad token id is 1 + input_ids = torch.tensor(encodings_dict["input_ids"], device=device, dtype=torch.int64) + attention_mask = torch.tensor(encodings_dict["attention_mask"], device=device, dtype=torch.int64) + position_ids = get_position_ids(attention_mask, use_past_kv=False) + + # Check if tokenized prompt length matches the requested prompt length + tokenized_length = input_ids.shape[-1] + if tokenized_length > requested_length: + # Shorten the inputs from (batch_size, tokenized_length) to (batch_size, requested_length) + input_ids = input_ids[:, :requested_length] + attention_mask = attention_mask[:, :requested_length] + position_ids = get_position_ids(attention_mask, 
use_past_kv=False) + elif tokenized_length < requested_length: + # Lengthen the inputs from (batch_size, tokenized_length) to (batch_size, requested_length) + input_ids_first_col = input_ids[:, 0].unsqueeze(0).T + attention_mask_first_col = attention_mask[:, 0].unsqueeze(0).T + for _ in range(requested_length - tokenized_length): + input_ids = torch.hstack((input_ids_first_col, input_ids)) + attention_mask = torch.hstack((attention_mask_first_col, attention_mask)) + position_ids = get_position_ids(attention_mask, use_past_kv=False) + + tokenized_length = input_ids.shape[-1] + assert tokenized_length == requested_length + + # Create inputs + inputs = { + "input_ids": input_ids.contiguous() if engine == "ort" else input_ids, + "attention_mask": attention_mask.contiguous() if engine == "ort" else attention_mask, + "position_ids": position_ids.contiguous() if engine == "ort" else position_ids, + } + if engine != "ort": + inputs["past_key_values"] = [] + + # Get shape of KV cache inputs + batch_size, sequence_length = input_ids.shape + max_sequence_length = config.max_position_embeddings + num_heads = config.num_key_value_heads + head_size = config.head_dim if hasattr(config, "head_dim") else config.hidden_size // config.num_attention_heads + + # Create KV cache inputs + for i in range(config.num_hidden_layers): + past_key = torch.zeros( + batch_size, + num_heads, + max_sequence_length if use_buffer_share else 0, + head_size, + device=device, + dtype=torch_dtype, + ) + past_value = torch.zeros( + batch_size, + num_heads, + max_sequence_length if use_buffer_share else 0, + head_size, + device=device, + dtype=torch_dtype, + ) + if engine == "ort": + inputs.update( + { + f"past_key_values.{i}.key": past_key.contiguous(), + f"past_key_values.{i}.value": past_value.contiguous(), + } + ) + else: + inputs["past_key_values"].append((past_key, past_value)) + + outputs = None + if engine == "ort": + # Create outputs + logits = torch.zeros(batch_size, sequence_length, config.vocab_size, device=device, dtype=torch_dtype) + outputs = {"logits": logits.contiguous()} + if not use_buffer_share: + for i in range(config.num_hidden_layers): + present_key = torch.zeros( + batch_size, num_heads, sequence_length, head_size, device=device, dtype=torch_dtype + ) + present_value = torch.zeros( + batch_size, num_heads, sequence_length, head_size, device=device, dtype=torch_dtype + ) + outputs.update( + {f"present.{i}.key": present_key.contiguous(), f"present.{i}.value": present_value.contiguous()} + ) + + return inputs, outputs diff --git a/onnxruntime/python/tools/transformers/models/llama/llama_parity.py b/onnxruntime/python/tools/transformers/models/llama/llama_parity.py index f41a90208c51b..9cbc9af7fe9b5 100644 --- a/onnxruntime/python/tools/transformers/models/llama/llama_parity.py +++ b/onnxruntime/python/tools/transformers/models/llama/llama_parity.py @@ -1,3 +1,8 @@ +# ------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. See License.txt in the project root for +# license information. 
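The two helpers added to llama_inputs.py are meant to be used together: get_initial_inputs_and_outputs tokenizes and pads (or truncates) the prompt to the requested length and pre-allocates the KV cache and logits buffers, while add_io_bindings_as_tensors binds those PyTorch tensors directly so repeated runs avoid extra host/device copies. A minimal sketch of how they could be wired up, assuming a CUDA build of onnxruntime and a GQA fp16 LLaMA-2 export; the checkpoint id and ONNX path below are placeholders, not taken from this change:

import torch
from transformers import AutoConfig, AutoTokenizer

import onnxruntime as ort
from llama_inputs import add_io_bindings_as_tensors, get_initial_inputs_and_outputs

model_id = "meta-llama/Llama-2-7b-hf"     # placeholder Hugging Face checkpoint
onnx_path = "llama2-7b-fp16-gqa.onnx"     # placeholder exported model
use_fp16, use_buffer_share = True, True   # buffer sharing assumes a GQA fp16 export

config = AutoConfig.from_pretrained(model_id)
tokenizer = AutoTokenizer.from_pretrained(model_id)
device = torch.device("cuda:0")

session = ort.InferenceSession(onnx_path, providers=["CUDAExecutionProvider"])

# Build padded/truncated inputs for the requested prompt length and pre-allocate outputs
inputs, outputs = get_initial_inputs_and_outputs(
    config,
    tokenizer,
    requested_length=256,
    prompt=["Hello, my name is"],
    device=device,
    use_fp16=use_fp16,
    use_buffer_share=use_buffer_share,
    engine="ort",
)

# Bind the torch tensors once; verify_ort_inputs runs inside and drops unused inputs
io_binding = add_io_bindings_as_tensors(session, inputs, outputs, use_fp16, use_buffer_share)
session.run_with_iobinding(io_binding)

logits = outputs["logits"]                    # torch tensor already on `device`
next_tokens = logits[:, -1, :].argmax(dim=-1)

Because the outputs are bound to pre-allocated torch tensors, a generation loop can keep reusing the same buffers and only rebind the inputs that change between steps.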
+# -------------------------------------------------------------------------- from __future__ import annotations import argparse @@ -10,7 +15,7 @@ from benchmark_helper import setup_logger from dist_settings import get_rank, get_size from llama_inputs import ( - add_io_bindings, + add_io_bindings_as_ortvalues, convert_inputs_for_ort, get_merged_sample_with_past_kv_inputs, get_sample_inputs, @@ -123,7 +128,7 @@ def verify_parity( # Add IO bindings for non-CPU execution providers if args.execution_provider != "cpu": - io_binding, kv_cache_ortvalues = add_io_bindings( + io_binding, kv_cache_ortvalues = add_io_bindings_as_ortvalues( ort_model, inputs, args.execution_provider, diff --git a/onnxruntime/python/tools/transformers/models/llama/llama_torch.py b/onnxruntime/python/tools/transformers/models/llama/llama_torch.py index 89b459c80beec..d570e2d7ee086 100644 --- a/onnxruntime/python/tools/transformers/models/llama/llama_torch.py +++ b/onnxruntime/python/tools/transformers/models/llama/llama_torch.py @@ -1,3 +1,8 @@ +# ------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. See License.txt in the project root for +# license information. +# -------------------------------------------------------------------------- import logging import os diff --git a/onnxruntime/python/tools/transformers/models/llama/prompts.json b/onnxruntime/python/tools/transformers/models/llama/prompts.json new file mode 100644 index 0000000000000..5d8fae99dbc7e --- /dev/null +++ b/onnxruntime/python/tools/transformers/models/llama/prompts.json @@ -0,0 +1,11 @@ +{ + "16": "How are astronauts launched into space quickly on those rockets? ", + "64": "Today, we will learn how to bake a chocolate cake. First, you need to have all of the ingredients to bake. Otherwise, the chocolate cake won't be tasty. You will also need a large baking pan to hold the batter. ", + "256": "Risk Management and Insurance (RMI) is a field that focuses on the identification, assessment and financial mitigation of risk. It's about insurance but also more than that. For example, insurance companies look at risk factors such as age, gender and medical history to determine how much they will charge for life insurance coverage. However, RMI is not just about buying insurance (although it is a big part of this). It is also about taking steps to reduce the likelihood that something bad happens in the first place. For example, you may think twice before crossing a busy road if there's a high risk of being hit by a car or getting injured. In addition to insurance companies and financial services firms, RMI professionals work with individuals (customers), businesses and other entities (clients). Their job is to identify potential risks and help mitigate them before they become problems for their clients. This can include helping people prepare financially for unexpected events like losing a job or being injured in an accident, as well as assisting businesses with managing risk exposure from things like natural disasters or cyber attacks. Insurance companies use RMI to ", + "1024": "Risk Management and Insurance (RMI) is a field that focuses on the identification, assessment and financial mitigation of risk. It's about insurance but also more than that. For example, insurance companies look at risk factors such as age, gender and medical history to determine how much they will charge for life insurance coverage. 
However, RMI is not just about buying insurance (although it is a big part of this). It is also about taking steps to reduce the likelihood that something bad happens in the first place. For example, you may think twice before crossing a busy road if there's a high risk of being hit by a car or getting injured. In addition to insurance companies and financial services firms, RMI professionals work with individuals (customers), businesses and other entities (clients). Their job is to identify potential risks and help mitigate them before they become problems for their clients. This can include helping people prepare financially for unexpected events like losing a job or being injured in an accident, as well as assisting businesses with managing risk exposure from things like natural disasters or cyber attacks. Insurance companies use RMI to assess the level of risk associated with potential customers and determine how much they should charge them for coverage. For example, if you are a healthy 25-year old male who doesn't smoke and has never been in an accident, your insurance premiums will likely be lower than those of someone else who fits into one or more of these categories (or all three). Risk Management & Insurance is the process by which you can protect yourself from financial loss. It's about taking control of your money and making sure that it's safe, secure and accessible to you when you need it most. The first step in risk management is understanding what risks are important to you as an individual or a family member who may depend on the income generated by these investments for their livelihood. Once you have identified these key risk factors, then we can help identify how best to manage them through various strategies such as setting up automatic payments into savings accounts so that money is always available when needed most; setting aside emergency funds in case something unexpected happens (e.g., illness); investing wisely so that returns outpace inflation over time; diversifying portfolios by adding stocks and bonds which will help reduce volatility while still providing growth potential through dividends/interest payments over longer periods of time than if invested solely into one type of asset class alone etc. The field of risk management and insurance is growing rapidly, as more people become aware of the potential dangers that can arise from an unforeseen event or accident. As a result, there are many different careers within this field that you may want to consider if you're interested in working with risks and helping others protect themselves from them.One common career path in risk management is as an insurance agent/broker. This person would work for an insurance company or brokerage firm, selling policies to clients who need coverage against things like car accidents or home damage caused by natural disasters such as fires or floods. Insurance agents typically work on commission (i.e., they receive a percentage of every sale). This is important because it means that the more successful an agent is at selling policies, the higher his/her income will be. Another career option within risk management is working for an insurance company itself rather than as an external broker or salesperson. In this case, you'd help manage claims made by policyholders who have been injured through no fault of their own (for example after being hit by another driver). 
You can also work in risk analysis, a field that involves analyzing the potential risks associated with various investments and projects. This is done to determine whether or not an opportunity has enough upside to justify taking on any related risks. In addition, you might also be responsible for developing strategies to minimize those risks so they don't result in big losses if something goes wrong down the road. If your goal is to work as a broker or agent, then there are some prerequisites that will need to be met before beginning this career path: You must have an associate's degree from an accredited college; pass an exam administered by state regulators (the Series 6) and/or complete additional training offered by professional organizations such as NAFA, which stands for National Association of Financial Advisors. After meeting these requirements, you'll then need to find employment at one or more insurance companies where they offer positions that allow new hires some flexibility when starting out their careers.Risk management and insurance is a broad field that includes many different types of jobs. ", + "2048": "Artificial Intelligence (AI) is a transformative technology that has the potential to revolutionize society in many ways. AI can be used to enhance the accuracy and efficiency of decision-making, improve lives through new apps and services, and solve some of the thorny policy problems of climate change, infrastructure, and healthcare. In this essay, I will discuss some of the ways AI can benefit society. One of the most significant benefits of AI is its ability to improve healthcare. AI can assist doctors, nurses, and other healthcare professionals in making better diagnoses and faster decisions on a course of treatment, based on the large amount of data that currently exists. AI allows doctors to pinpoint effective drugs that may have otherwise been overlooked and can identify higher-risk individuals before any human can. AI can also help relieve the burden on healthcare professionals by taking care of routine data collection and filing, freeing up time for other higher-value activities. Another area where AI can benefit society is in the fight against climate change. AI can be used to analyze vast amounts of data, identify patterns, and provide accurate predictions. It can help us forecast what further spread of pandemics is going to look like, and track their development around the world. AI can also help us predict the impact of climate change on our planet and develop strategies to mitigate its effects. For example, AI can be used to optimize energy consumption, reduce waste, and improve the efficiency of transportation systems. AI can also benefit society by improving education. AI-powered educational tools can help students learn more effectively by providing personalized learning experiences tailored to their individual needs. AI can also help teachers by automating routine tasks such as grading and providing feedback on student work. This can free up time for teachers to focus on more important tasks such as lesson planning and student engagement. AI can also benefit society by improving public safety. AI-powered surveillance systems can help law enforcement agencies detect and prevent crime more effectively. AI can also be used to analyze social media data to identify potential threats and prevent them before they occur. 
For example, AI can be used to detect hate speech and other forms of online harassment, which can help prevent cyberbullying and other forms of online abuse. Finally, AI can benefit society by improving the economy. AI can help businesses become more efficient by automating routine tasks and providing insights into customer behavior. This can help businesses make better decisions and improve their bottom line. AI can also help create new jobs by enabling the development of new products and services that were previously impossible. In conclusion, AI has the potential to benefit society in many ways. From improving healthcare and education to fighting climate change and improving public safety, AI can help us solve some of the most pressing problems facing our world today. As we continue to develop and refine this transformative technology, it is important that we do so in an ethical and responsible manner, ensuring that the benefits of AI are shared by all members of society. AI has been a topic of discussion for many years, and while it has brought many benefits to society, there are also concerns about its impact. In this essay, I will discuss some of the reasons why AI may not help society. Firstly, AI can be biased. AI systems are designed by humans, and they can be infused with the biases of their creators. This can lead to discrimination against certain groups of people and can perpetuate existing inequalities in society. Additionally, AI can lack transparency, making it difficult to understand how decisions are being made. This can lead to mistrust of AI systems and can hinder their adoption. Secondly, AI can be used to automate jobs, which can lead to unemployment. While AI can increase productivity and efficiency, it can also lead to job displacement, particularly in industries that rely heavily on manual labor. This can have a negative impact on individuals and communities, particularly those that are already marginalized. Thirdly, AI can be used to create fake content, such as deepfakes, which can be used to spread misinformation and propaganda. This can have serious consequences for democracy and can undermine trust in institutions. Fourthly, AI can be used to create autonomous weapons, which can have devastating consequences. These weapons can make decisions without human intervention, which can lead to unintended consequences and can be difficult to control. Fifthly, AI can be used to create surveillance systems that infringe on privacy rights. These systems can be used to monitor individuals without their knowledge or consent, which can have serious consequences for civil liberties. In conclusion, while AI has many potential benefits, there are also concerns about its impact on society. It is important to consider these concerns and to ensure that AI is developed and used in a responsible and ethical manner. Within AI, there are also many subfields. Reinforcement learning is a type of machine learning algorithm that focuses on training models to make decisions in an environment in order to maximize a reward. This is typically done through trial and error, as the algorithm receives feedback in the form of rewards or punishments for its actions. Reinforcement learning has many potential benefits for society, some of which are discussed below. Firstly, reinforcement learning can be used to improve industrial automation and robotics. By training robots to learn from their own experiences, they can gain the skills necessary to perform complex tasks without human intervention. 
This can lead to increased efficiency and productivity in industries such as manufacturing and logistics. Secondly, reinforcement learning can be used to optimize traffic control systems. By training models to make real-time decisions based on traffic patterns and other data, traffic flow can be improved, reducing congestion and travel times. Thirdly, reinforcement learning can be used to improve healthcare. By training models to make decisions based on patient data, doctors can make more accurate diagnoses and develop more effective treatment plans. This can lead to better health outcomes for patients and can reduce healthcare costs. Fourthly, reinforcement learning can be used to improve education. By training models to adapt to individual student needs, personalized learning experiences can be created that are tailored to each student\u2019s strengths and weaknesses. This can lead to improved academic performance and can help to close the achievement gap. Finally, reinforcement learning can be used to improve environmental sustainability. By training models to make decisions based on environmental data, such as weather patterns and pollution levels, more effective policies can be developed to reduce carbon emissions and protect natural resources. In conclusion, reinforcement learning has many potential benefits for society. By training models to make decisions based on feedback from their environment, we can create more efficient and effective systems in a wide range of fields. However, it is important to consider the ethical implications of these technologies and to ensure that they are developed and used in a responsible and ethical manner. Multi-modal models are another type of machine learning that can process and find relationships between different types of data, such as images, video, audio, and text. They have the potential to revolutionize many aspects of our lives, from healthcare to transportation to education. In this essay, I will discuss how multi-modal models can help society in various ways. One of the most significant benefits of multi-modal models is their ability to transform unstructured data into structured data that can be analyzed. For example, a company could use a multi-modal model to extract data from images or PDFs of invoices or receipts. This would enable them to analyze the data more efficiently and make better-informed decisions. Another benefit of multi-modal models is their ability to cater to various learning styles. Blended and multi-modal learning can reach people who benefit from different learning styles. By understanding their individual learning styles, employees can leverage resources that are compatible with how they process information most effectively. Multi-modal models can also help improve healthcare. For example, they can be used to analyze medical images and identify patterns that might be difficult for human doctors to detect. This can lead to earlier diagnoses and more effective treatments. In addition, multi-modal models can help improve transportation. For example, they can be used to analyze traffic patterns and optimize traffic flow. This can help reduce congestion and improve safety on the roads. Finally, multi-modal models can help improve education. For example, they can be used to create personalized learning experiences for students based on their individual learning styles. This can help students learn more effectively and efficiently. In conclusion, multi-modal models have the potential to help society in many ways. 
They can transform unstructured data into structured data, cater to various learning styles, improve healthcare, transportation, and education. However, like any new technology, it is important to approach it with caution and consider the potential risks and benefits. I hope this essay has provided some insight into the potential benefits of multi-modal models. Throughout this essay, I have demonstrated the numerous benefits that artificial intelligence will bring to our society. I have also shown some examples of various categories within artificial intelligence that have varying purposes. It is important to consider that each category has its own purpose and has its own pros and cons to it. In conclusion, we must use AI responsibly. ", + "3840": "Artificial Intelligence (AI) is a transformative technology that has the potential to revolutionize society in many ways. AI can be used to enhance the accuracy and efficiency of decision-making, improve lives through new apps and services, and solve some of the thorny policy problems of climate change, infrastructure, and healthcare. In this essay, I will discuss some of the ways AI can benefit society. One of the most significant benefits of AI is its ability to improve healthcare. AI can assist doctors, nurses, and other healthcare professionals in making better diagnoses and faster decisions on a course of treatment, based on the large amount of data that currently exists. AI allows doctors to pinpoint effective drugs that may have otherwise been overlooked and can identify higher-risk individuals before any human can. AI can also help relieve the burden on healthcare professionals by taking care of routine data collection and filing, freeing up time for other higher-value activities. Another area where AI can benefit society is in the fight against climate change. AI can be used to analyze vast amounts of data, identify patterns, and provide accurate predictions. It can help us forecast what further spread of pandemics is going to look like, and track their development around the world. AI can also help us predict the impact of climate change on our planet and develop strategies to mitigate its effects. For example, AI can be used to optimize energy consumption, reduce waste, and improve the efficiency of transportation systems. AI can also benefit society by improving education. AI-powered educational tools can help students learn more effectively by providing personalized learning experiences tailored to their individual needs. AI can also help teachers by automating routine tasks such as grading and providing feedback on student work. This can free up time for teachers to focus on more important tasks such as lesson planning and student engagement. AI can also benefit society by improving public safety. AI-powered surveillance systems can help law enforcement agencies detect and prevent crime more effectively. AI can also be used to analyze social media data to identify potential threats and prevent them before they occur. For example, AI can be used to detect hate speech and other forms of online harassment, which can help prevent cyberbullying and other forms of online abuse. Finally, AI can benefit society by improving the economy. AI can help businesses become more efficient by automating routine tasks and providing insights into customer behavior. This can help businesses make better decisions and improve their bottom line. 
AI can also help create new jobs by enabling the development of new products and services that were previously impossible. In conclusion, AI has the potential to benefit society in many ways. From improving healthcare and education to fighting climate change and improving public safety, AI can help us solve some of the most pressing problems facing our world today. As we continue to develop and refine this transformative technology, it is important that we do so in an ethical and responsible manner, ensuring that the benefits of AI are shared by all members of society. AI has been a topic of discussion for many years, and while it has brought many benefits to society, there are also concerns about its impact. In this essay, I will discuss some of the reasons why AI may not help society. Firstly, AI can be biased. AI systems are designed by humans, and they can be infused with the biases of their creators. This can lead to discrimination against certain groups of people and can perpetuate existing inequalities in society. Additionally, AI can lack transparency, making it difficult to understand how decisions are being made. This can lead to mistrust of AI systems and can hinder their adoption. Secondly, AI can be used to automate jobs, which can lead to unemployment. While AI can increase productivity and efficiency, it can also lead to job displacement, particularly in industries that rely heavily on manual labor. This can have a negative impact on individuals and communities, particularly those that are already marginalized. Thirdly, AI can be used to create fake content, such as deepfakes, which can be used to spread misinformation and propaganda. This can have serious consequences for democracy and can undermine trust in institutions. Fourthly, AI can be used to create autonomous weapons, which can have devastating consequences. These weapons can make decisions without human intervention, which can lead to unintended consequences and can be difficult to control. Fifthly, AI can be used to create surveillance systems that infringe on privacy rights. These systems can be used to monitor individuals without their knowledge or consent, which can have serious consequences for civil liberties. In conclusion, while AI has many potential benefits, there are also concerns about its impact on society. It is important to consider these concerns and to ensure that AI is developed and used in a responsible and ethical manner. Within AI, there are also many subfields. Reinforcement learning is a type of machine learning algorithm that focuses on training models to make decisions in an environment in order to maximize a reward. This is typically done through trial and error, as the algorithm receives feedback in the form of rewards or punishments for its actions. Reinforcement learning has many potential benefits for society, some of which are discussed below. Firstly, reinforcement learning can be used to improve industrial automation and robotics. By training robots to learn from their own experiences, they can gain the skills necessary to perform complex tasks without human intervention. This can lead to increased efficiency and productivity in industries such as manufacturing and logistics. Secondly, reinforcement learning can be used to optimize traffic control systems. By training models to make real-time decisions based on traffic patterns and other data, traffic flow can be improved, reducing congestion and travel times. Thirdly, reinforcement learning can be used to improve healthcare. 
By training models to make decisions based on patient data, doctors can make more accurate diagnoses and develop more effective treatment plans. This can lead to better health outcomes for patients and can reduce healthcare costs. Fourthly, reinforcement learning can be used to improve education. By training models to adapt to individual student needs, personalized learning experiences can be created that are tailored to each student\u2019s strengths and weaknesses. This can lead to improved academic performance and can help to close the achievement gap. Finally, reinforcement learning can be used to improve environmental sustainability. By training models to make decisions based on environmental data, such as weather patterns and pollution levels, more effective policies can be developed to reduce carbon emissions and protect natural resources. In conclusion, reinforcement learning has many potential benefits for society. By training models to make decisions based on feedback from their environment, we can create more efficient and effective systems in a wide range of fields. However, it is important to consider the ethical implications of these technologies and to ensure that they are developed and used in a responsible and ethical manner. Multi-modal models are another type of machine learning that can process and find relationships between different types of data, such as images, video, audio, and text. They have the potential to revolutionize many aspects of our lives, from healthcare to transportation to education. In this essay, I will discuss how multi-modal models can help society in various ways. One of the most significant benefits of multi-modal models is their ability to transform unstructured data into structured data that can be analyzed. For example, a company could use a multi-modal model to extract data from images or PDFs of invoices or receipts. This would enable them to analyze the data more efficiently and make better-informed decisions. Another benefit of multi-modal models is their ability to cater to various learning styles. Blended and multi-modal learning can reach people who benefit from different learning styles. By understanding their individual learning styles, employees can leverage resources that are compatible with how they process information most effectively. Multi-modal models can also help improve healthcare. For example, they can be used to analyze medical images and identify patterns that might be difficult for human doctors to detect. This can lead to earlier diagnoses and more effective treatments. In addition, multi-modal models can help improve transportation. For example, they can be used to analyze traffic patterns and optimize traffic flow. This can help reduce congestion and improve safety on the roads. Finally, multi-modal models can help improve education. For example, they can be used to create personalized learning experiences for students based on their individual learning styles. This can help students learn more effectively and efficiently. In conclusion, multi-modal models have the potential to help society in many ways. They can transform unstructured data into structured data, cater to various learning styles, improve healthcare, transportation, and education. However, like any new technology, it is important to approach it with caution and consider the potential risks and benefits. I hope this essay has provided some insight into the potential benefits of multi-modal models. 
Semi-supervised learning is a type of machine learning that falls in between supervised and unsupervised learning. It is a method that uses a small amount of labeled data and a large amount of unlabeled data to train a model. The goal of semi-supervised learning is to learn a function that can accurately predict the output variable based on the input variables, similar to supervised learning. However, unlike supervised learning, the algorithm is trained on a dataset that contains both labeled and unlabeled data. Semi-supervised learning is particularly useful when there is a large amount of unlabeled data available, but it\u2019s too expensive or difficult to label all of it. The primary advantage of semi-supervised learning is that it can reduce the amount of annotated data used. This is particularly useful when labeled data is scarce or expensive to obtain. By using a small amount of labeled data and a large amount of unlabeled data, semi-supervised learning algorithms can learn from both types of data and improve their accuracy. Semi-supervised learning algorithms are also capable of consolidating overfitting tendencies, which is a common problem in supervised learning. Another advantage of semi-supervised learning is that it is versatile. It can be applied in various situations, from image recognition to crawlers. For example, in text classification, the goal is to classify a given text into one or more predefined categories. Semi-supervised learning can be used to train a text classification model using a small amount of labeled data and a large amount of unlabeled text data. In image classification, the goal is to classify a given image into one or more predefined categories. Semi-supervised learning can be used to train an image classification model using a small amount of labeled data and a large amount of unlabeled image data. In anomaly detection, the goal is to detect patterns or observations that are unusual or different from the norm. Semi-supervised learning can be used to detect anomalies using a small amount of labeled data and a large amount of unlabeled data. Semi-supervised learning algorithms are also stable and simple. They have high efficiency and can be used to improve the performance and generalization of models. However, semi-supervised learning algorithms also have some disadvantages. One of the main disadvantages is that they require a large amount of unlabeled data to be effective. If there is not enough unlabeled data available, the algorithm may not be able to learn effectively. Additionally, semi-supervised learning algorithms can be sensitive to the quality of the labeled data. If the labeled data is noisy or incorrect, the algorithm may not be able to learn effectively. In conclusion, semi-supervised learning is a powerful tool that can be used to improve the accuracy and generalization of machine learning models. It is particularly useful when labeled data is scarce or expensive to obtain. Semi-supervised learning algorithms can learn from both labeled and unlabeled data, which makes them versatile and capable of consolidating overfitting tendencies. However, semi-supervised learning algorithms also have some disadvantages, such as requiring a large amount of unlabeled data to be effective and being sensitive to the quality of the labeled data. Despite these disadvantages, semi-supervised learning is a valuable technique that can be used to improve the performance of machine learning models. 
Supervised learning is a type of machine learning that involves training a model on labeled data. The goal of supervised learning is to learn a function that can accurately predict the output variable based on the input variables. Supervised learning is widely used in various fields, including image recognition, speech recognition, natural language processing, and more. One of the primary advantages of supervised learning is that it allows for accurate predictions. Supervised learning models can provide highly accurate predictions or classifications when trained on a diverse and representative dataset. This makes supervised learning particularly useful in situations where accuracy is critical, such as in medical diagnosis or fraud detection. Another advantage of supervised learning is that it is easy to understand and implement. Supervised learning algorithms are relatively simple and can be implemented using a variety of programming languages and libraries. This makes it accessible to a wide range of developers and data scientists. Supervised learning is also versatile. It can be applied to a wide range of problem domains, making it a flexible approach for various industries and applications. For example, in image classification, the goal is to classify a given image into one or more predefined categories. Supervised learning can be used to train an image classification model using a labeled dataset of images and their corresponding categories. In speech recognition, the goal is to transcribe spoken words into text. Supervised learning can be used to train a speech recognition model using a labeled dataset of audio recordings and their corresponding transcriptions. Supervised learning algorithms are also capable of handling missing data. If there is missing data in the labeled dataset, supervised learning algorithms can still learn from the available data and make accurate predictions. This is particularly useful in situations where data is incomplete or noisy. However, supervised learning algorithms also have some disadvantages. One of the main disadvantages is that they require a large amount of labeled data to be effective. If there is not enough labeled data available, the algorithm may not be able to learn effectively. Additionally, supervised learning algorithms can be sensitive to the quality of the labeled data. If the labeled data is noisy or incorrect, the algorithm may not be able to learn effectively. In conclusion, supervised learning is a powerful tool that can be used to make accurate predictions and classifications. It is easy to understand and implement, and it is versatile enough to be applied to a wide range of problem domains. However, supervised learning algorithms also have some disadvantages, such as requiring a large amount of labeled data to be effective and being sensitive to the quality of the labeled data. Despite these disadvantages, supervised learning is a valuable technique that can be used to improve the performance of machine learning models. Unsupervised learning is a type of machine learning that involves training a model on unlabeled data. The goal of unsupervised learning is to learn the underlying structure of the data, without any prior knowledge of the output variable. Unsupervised learning is widely used in various fields, including image recognition, natural language processing, and more. One of the primary advantages of unsupervised learning is that it can handle large amounts of unlabeled and unstructured data. 
This makes unsupervised learning particularly useful in situations where labeled data is scarce or expensive to obtain. By using unsupervised learning algorithms, we can learn from the available data and make accurate predictions. Another advantage of unsupervised learning is that it can identify previously undetected patterns in data. Unsupervised learning algorithms can be used to cluster data points into groups based on their similarities. This can be useful in various applications, such as customer segmentation, anomaly detection, and more. Unsupervised learning algorithms are also capable of dimensionality reduction. This is particularly useful when dealing with high-dimensional data, such as images or text. By reducing the dimensionality of the data, unsupervised learning algorithms can improve the efficiency and accuracy of the model. Unsupervised learning algorithms are also capable of feature learning. Feature learning is the process of automatically learning features from the input data. This can be useful in various applications, such as image recognition, where the algorithm can learn features such as edges, corners, and more. However, unsupervised learning algorithms also have some disadvantages. One of the main disadvantages is that they require a large amount of unlabeled data to be effective. If there is not enough unlabeled data available, the algorithm may not be able to learn effectively. Additionally, unsupervised learning algorithms can be sensitive to the quality of the data. If the data is noisy or incorrect, the algorithm may not be able to learn effectively. As you can see, artificial intelligence (AI) is a wide-ranging field that encompasses various sub-fields. Some of the sub-fields that we have previously discussed include reinforcement learning, multi-modal learning, semi-supervised learning, supervised learning, unsupervised learning, and much more. There are also many application domains for artificial intelligence (AI) that can utilize it. Throughout this essay, I have demonstrated the numerous benefits that artificial intelligence (AI) will bring to our society. I have also shown some examples of various categories within artificial intelligence that have varying purposes. It is important to consider that each category has its own purpose and has its own pros and cons to it. What do you think artificial intelligence will bring to our society? Will it be used in a responsible manner? ", + "4096": "In the heart of Eldoria, where ancient forests whispered secrets and rivers sang forgotten melodies, lay the Enchanted Labyrinth. Its walls, adorned with shimmering runes, concealed a portal to realms unknown. Few dared to venture inside, for the labyrinth was said to twist time and reality. Evelyn, a curious young mage, stood before the labyrinth's entrance. Her emerald eyes sparkled with determination. She clutched a cracked map, its ink fading like memories lost to the wind. Legends spoke of a treasure hidden deep within - a relic capable of granting any wish. As Evelyn stepped across the threshold, the air thickened. The walls shifted, rearranging themselves. She followed the faint glow of her lantern, each step echoing through eternity. Shadows danced, whispering forgotten names. Was this a dream or a nightmare? Deeper into the labyrinth, Evelyn encountered Aelar, the Guardian of Time. His silver hair flowed like moonlight, and his eyes held the weight of centuries. Aelar barred her path, his staff crackling with energy. 
'Seeker,' he intoned, 'answer my riddle, and the way shall open.' Evelyn's heart raced. 'Ask, Guardian.' 'What has roots as old as time, yet dances with the wind?' She pondered, memories of her grandmother's tales flooding her mind. 'A tree,' she replied. Aelar smiled, and the walls shifted once more. 'Proceed, Seeker.' The labyrinth twisted, revealing a moonlit grove. Trees hummed ancient lullabies, and fireflies wove constellations in the air. At the center stood a weeping willow, its branches brushing the ground like a grieving widow's veil. Evelyn approached, her fingers tracing the bark. 'Why do you weep?' The willow's voice, soft as falling petals, answered, 'I guard the Tear of Eternity.' Evelyn's breath caught. The Tear - a gem said to hold memories of lost civilizations. She plucked it from a low branch, its facets reflecting forgotten faces. As Evelyn pressed onward, the labyrinth tightened its grip. She faced illusions - lovers lost, friends betrayed. Doubt gnawed at her resolve. Was the treasure worth the cost? At the labyrinth's heart, she found a mirror. Her reflection wavered, revealing her deepest desire: her sister, Lysandra, who vanished years ago. Tears blurred the glass. 'Speak your wish,' the mirror whispered. Evelyn's voice trembled. 'Bring Lysandra back.' The mirror shattered, and reality fractured. Lysandra stepped through, eyes wide with wonder. 'Evelyn?' Lysandra's return came at a cost - the labyrinth demanded balance. For every wish granted, a memory faded. Evelyn watched as her childhood laughter dissolved like mist. Together, they exited the labyrinth, the Tear pulsing in Evelyn's palm. She gazed at her sister, both joy and sorrow in her eyes. 'Was it worth it?' Lysandra asked. Evelyn smiled. 'In Eldoria, every choice we make becomes a story. And ours, dear sister, is woven in stardust and sacrifice.' And so, the Enchanted Labyrinth whispered its final secret: Wishes are threads, and memories their loom. In the land of Aetherfall, where mist-clad mountains touched the heavens and rivers whispered forgotten spells, a prophecy echoed through time. It spoke of the Starstone, a gem said to hold the universe's secrets - the key to creation and destruction. Eldric, a humble blacksmith with eyes like storm clouds, stumbled upon an ancient map. Its ink had faded, but the constellations remained. Guided by fate, he set forth, leaving his forge behind. Eldric's journey led him to the Whispering Forest, where trees conversed in hushed tones. Their leaves whispered of hidden paths and treacherous guardians. Eldric's heart pounded as he stepped into the shadows. There, he met Lyria, a forest nymph with silver hair and eyes like moonlit pools. She guarded the first clue - a riddle etched into a petal: 'In the heart of the forest, where time bends, seek the Wellspring of Echoes. There, the Starstone awaits.' Eldric followed Lyria's guidance. The Wellspring lay within a moon-kissed glade. Its waters shimmered, reflecting memories of lost lovers, ancient battles, and forgotten oaths. Eldric dipped his hand, and the riddle unfolded: 'To find the Starstone, seek the Three Keys: the tear of a fallen star, the breath of a dragon, and the song of a forgotten bard.' Eldric climbed the Stardust Peaks, where fallen stars lay embedded in the rock. Each tear held a fragment of cosmic sorrow. He found one - a sapphire gem pulsing with celestial fire. But it was guarded by Drakor, the last of the star dragons. Drakor's scales shimmered like galaxies. His eyes held eons of wisdom. 
'Why seek the Tear, mortal?' 'To save Aetherfall,' Eldric replied. 'To restore balance.' Drakor nodded, and with a breath, he shattered the gem. Eldric caught the falling tear - a shard of eternity. Next, Eldric sailed to the Isle of Shadows, where the void whispered secrets. There, he faced Nyxia, the ancient shadow dragon. Her wings spanned continents, and her breath could devour stars. 'Why seek my breath?' Nyxia hissed. 'To awaken the Starstone,' Eldric said. 'To mend the rifts.' Nyxia's eyes glowed. She exhaled - a stream of darkness. Eldric captured it in a crystal vial - the Breath of the Void. The final key lay in the Bard's Hollow, where echoes of lost melodies lingered. Eldric met Silvan, a ghostly minstrel who strummed a lute of moonwood. 'Sing,' Silvan urged. 'The Song of the Forgotten.' Eldric sang of battles, love, and sacrifice. The hollow trembled, and from the mist, a spectral harp appeared. Its strings hummed - the Song of Ages. Eldric plucked the notes, and they merged into a silver key - the Song of the Forgotten. At the Nexus of Worlds, Eldric assembled the keys - the Tear, the Breath, and the Song. The ground quaked, and the Starstone emerged - a gem of cosmic hues. Its light wove reality, mending fractures in Aetherfall. But the prophecy held a twist: the Starstone demanded a choice. Eldric could use it to reshape the world or sacrifice it to heal the void. He gazed at Lyria, Drakor, Nyxia, and Silvan - their fates intertwined. With a heavy heart, he whispered, 'Balance.' And so, the Starstone shattered, its fragments seeding new constellations. Eldric returned to his forge, but his hammer now shaped more than iron - it forged destiny. Lyria, the Forest Nymph Lyria, with her silver hair and eyes like moonlit pools, remained in the Whispering Forest. She became its guardian, weaving spells to protect the ancient trees. Her laughter echoed through the glades, and travelers whispered of a nymph who danced with moonbeams. Lyria's heart held a secret - the memory of Eldric's touch, the warmth of their shared quest. She tended to the Wellspring of Echoes, ensuring its waters flowed through time, carrying whispers of forgotten tales. Drakor, the Last Star Dragon Drakor, the last of the star dragons, retreated to the highest peak of the Stardust Peaks. There, he curled his immense form around the shattered Tear of the Fallen. His scales absorbed its cosmic fire, and he became a living constellation - a beacon for lost souls. Drakor's breath no longer consumed stars; instead, it birthed new constellations. Travelers gazed at the night sky, seeking guidance in his patterns. Drakor's eyes held both sorrow and hope, for he knew that balance required sacrifice. Nyxia, the Ancient Shadow Dragon Nyxia, with wings spanning continents, chose a different path. She descended to the Isle of Shadows, where the void whispered secrets. There, she guarded the Abyss of Remembrance - a rift between worlds. Nyxia's breath no longer devoured stars; it sealed the rifts. She became a bridge, allowing souls to traverse realms. Those who sought lost loved ones or glimpses of forgotten memories found solace in her shadowed embrace. Nyxia's eyes held the weight of choices made and unmade, and she vowed to keep the balance intact. Silvan, the Ghostly Minstrel Silvan, the spectral minstrel, wandered the Bard's Hollow. His lute of moonwood sang melodies of love, loss, and courage. Silvan's song echoed through time, touching hearts across Aetherfall. 
He became the keeper of memories - the forgotten bard who whispered forgotten names. When travelers stumbled upon the hollow, Silvan strummed his lute, and their own stories surfaced. He wove their experiences into the Song of Ages, ensuring that no tale would fade into oblivion. Silvan's translucent form danced in moonlight, a bridge between the living and the departed. Eldric, the Blacksmith As for Eldric, the humble blacksmith, he returned to his forge in the village of Hearthstone. His hammer now shaped more than iron - it forged destiny. Eldric crafted talismans from the Tear of the Fallen, the Breath of the Void, and the Song of the Forgotten. These talismans healed rifts, mended broken hearts, and ignited hope. Eldric's eyes held the wisdom of realms explored, and he knew that Aetherfall's balance rested on the choices of ordinary souls. He continued to tell the tale of the Starstone, passing it down through generations, ensuring that the magic endured. And so, dear reader, the threads of fate intertwined - a forest nymph, a star dragon, a shadow, and a minstrel - all bound by the echoes of a forgotten song. The Chronicles of the Celestial Weaver In the forgotten village of Astralis, where the night sky wept silver tears, lived a young girl named Elara. Her eyes held the secrets of constellations, and her fingers danced like stardust. But Astralis suffered - a curse had befallen the heavens. The stars dimmed, their brilliance fading. Elara's grandmother, Lyris, whispered of an ancient prophecy: 'When the stars falter, seek the Celestial Weaver.' Elara vowed to unravel the mystery and save her village. Guided by Lyris's map, Elara ventured into the Veiled Forest, where moonlight wove through ancient oaks. There, she met Silas, the enigmatic weaver. His loom hummed with cosmic threads - the Loom of Eternity. 'Seek the lost constellations,' Silas said. 'Weave them anew.' Elara's heart raced. She plucked a silver thread - the remnants of Orion - and began to weave. The loom responded, stars rekindling. But the cost was memory - Elara forgot her childhood laughter. Elara's journey spanned realms: The Nebula Caves: She retrieved the Pleiades, their sisterhood echoing through time. The Comet's Trail: She chased Halley's Comet, capturing its fiery tail. The Abyss of Lyra: There, Vega's song echoed - a melody of love and longing. Each constellation restored, Elara's memories faded. She forgot her first kiss, her mother's lullabies. Yet Astralis glimmered - the stars brightened. In the Celestial Citadel, Elara faced Draco, the fallen dragon. His scales bore scars - the price of rebellion. He guarded the final constellation - the Serpent. 'Why weave the stars?' Draco hissed. 'They betrayed me.' Elara's fingers trembled. 'To save my village.' Draco's eyes softened. 'We were once kin. We'll share this memory.' As Elara wove the Serpent, she glimpsed Draco's love for Lyris - their forbidden bond. The constellation blazed, and Elara remembered both love and sacrifice. Back in Astralis, the stars blazed anew. Villagers rejoiced, but Elara's memories were fragile threads. Lyris embraced her. 'You've woven fate,' Lyris said. 'But the Loom demands balance.' Elara faced Silas. 'What price?' He smiled - a constellation of wrinkles. 'Your memories or the stars.' Elara hesitated. She remembered her grandmother's stories, her stolen kisses. She chose the stars. Elara became the new Celestial Weaver. Her memories - her life - wove into the cosmos. 
Astralis thrived, but Elara forgot her name, her laughter, her love. Lyris whispered, 'Weavers are forgotten, but their constellations endure.' And so, Elara wove - the forgotten girl who stitched eternity. Elara, now the Celestial Weaver, wove constellations with threads of memory. Astralis thrived - the villagers danced under starlit skies, unaware of their forgotten histories. Lyris watched her granddaughter, her eyes both proud and sorrowful. 'Elara,' Lyris whispered, 'the Loom demands more than memories.' Elara's fingers trembled. She glimpsed her own reflection in the cosmic threads - the girl who once dreamed of love and laughter. But now, her past was a constellation of faded stars. Silas, the former weaver, lingered in the shadows. His form blurred - a specter between realms. He spoke of the Whispering Veil, a boundary separating memory from oblivion. Beyond it lay forgotten worlds, lost loves, and forbidden truths. 'Cross the Veil,' Silas urged. 'Retrieve what was sacrificed.' Elara hesitated. She yearned for her stolen memories - the taste of strawberries, the warmth of a lover's touch. But the Veil was treacherous - a labyrinth of half-remembered echoes. Elara stepped into the Veil. Its mist clung to her skin, whispering secrets. She glimpsed fragments of her past - a stolen kiss, a tear shed for a fallen friend. The path forked: The Garden of Remembrance: Blooming with forgotten faces, this garden promised reunion. Elara could reclaim her lost memories, but at a cost - the stars would dim once more. The Abyss of Oblivion: A chasm of emptiness. Here, Elara could sever her ties to Astralis, becoming a true Celestial Weaver. The stars would blaze forever, but her existence would be a threadless void. Elara hesitated. She remembered Lyris's lullabies, Silas's enigmatic smile, and Draco's love for her grandmother. She yearned for her stolen laughter - the taste of strawberries, the warmth of a lover's touch. But the stars - Astralis - called to her. The village thrived, its people dancing under constellations she had rekindled. Elara's choice would echo across eternity. She faced the Veil's center - a mirror reflecting her fragmented self. Her fingers trembled. 'Balance,' she whispered. And so, Elara wove anew. She plucked threads from the Garden of Remembrance, reclaiming stolen moments. The stars dimmed, but Astralis glowed with forgotten love. Silas nodded. 'You've chosen well, Weaver.' Elara's memories returned - the taste of strawberries, the warmth of a lover's touch. She kissed Lyris's forehead, whispered Draco's name, and stepped back into Astralis. The stars blazed - the legacy of a girl who stitched eternity. Short stories like these are great to listen and read because they allow us to explore our creative minds and broaden our imaginations. They also inspire us to learn from others and can become culturally impactful. The themes of these stories can also dive deep into philosophical questions and raise awareness for important issues. The plots for these stories are sometimes based on real life events as well and can have deep emotional impact.", + "7936": "The Effects of Airplanes: A Closer Look Airplanes have revolutionized the way we travel, connect, and explore the world. From short domestic flights to transcontinental journeys, these metal birds have become an integral part of our lives. However, their impact extends beyond convenience and adventure. Let's delve into the effects of airplanes from various angles. 
Environmental Impact Fuel Consumption and Emissions Airplanes consume vast amounts of fuel during flight. For instance, a Boeing 747, with a gas tank capacity of 63,500 gallons, burns approximately five gallons of jet fuel per mile traveled. On a 4,000-mile flight, this translates to 20,000 gallons of fuel. However, when we consider the number of passengers (around 400), the fuel efficiency per traveler is surprisingly better than that of cars. A Honda Civic, which gets 30 miles per gallon, would need 133 gallons of fuel for the same distance. Even an RV, which moves just seven miles on a gallon of gasoline, would require about 285 gallons per traveler. Greenhouse Gas Emissions Airplanes emit greenhouse gases directly into the upper atmosphere, where they can linger longer and cause more damage than the same gases at lower altitudes. While air travel contributes to climate change, it's essential to recognize that other forms of transportation, such as cars and ships, also emit greenhouse gases. The challenge lies in finding ways to reduce aviation emissions without compromising connectivity and mobility. Ozone Depletion and Contrails Planes affect the concentration of other gases and pollutants in the atmosphere. They lead to a short-term increase in ozone (O3) but a long-term decrease. Contrails - those white streaks left behind by planes - can contribute to cloud formation and impact local weather patterns. Balancing the benefits of air travel with environmental concerns remains a critical challenge. Human Health Implications Jet Lag and Sleep Disruption Frequent flyers are no strangers to jet lag. Crossing time zones disrupts our circadian rhythms, affecting sleep patterns, mood, and overall well-being. Pilots, flight attendants, and passengers alike experience the effects of rapid travel across time zones. Dehydration and Blood Pressure Changes The low humidity in airplane cabins can lead to dehydration. Additionally, changes in cabin pressure affect blood pressure, especially during takeoff and landing. Staying hydrated and moving around during long flights can mitigate these effects. Risk of Contagious Diseases Airplanes put passengers in close proximity to one another. Recirculated air, shared surfaces, and confined spaces create an environment conducive to the spread of infections. While airlines take precautions, travelers should remain vigilant, especially during flu seasons. The Perspective Shift: Seeing Earth from Above Beyond the environmental and health impacts, airplanes have transformed our worldview. Before the Wright brothers' epochal breakthrough, humans were grounded, limited to terrestrial views. The advent of flight not only boosted our power of movement but also enhanced our vision. From above, we witness the curvature of the Earth, the vastness of oceans, and the intricate patterns of landscapes. Airplanes have made us global citizens, connecting us to distant lands and cultures. In conclusion, airplanes are a double-edged sword. They offer unparalleled mobility and exploration but come with environmental consequences and health considerations. As we continue to innovate and improve aviation technology, let's strive for a balance - a world where we soar through the skies while safeguarding our planet and well-being. Economic Impact Air Travel Industry The aviation industry is a significant contributor to the global economy. Airlines, airports, manufacturers, and associated services generate substantial revenue and employment. 
Air travel facilitates international trade, tourism, and business interactions. However, it also faces challenges such as fuel price fluctuations, competition, and regulatory complexities. Supply Chain and Cargo Transport Airplanes play a crucial role in transporting goods across continents. High-value and time-sensitive cargo, including perishable items, pharmaceuticals, and electronics, rely on air freight. The efficiency of supply chains owes much to the speed and reach of airplanes. Tourism and Local Economies Tourism heavily depends on air travel. Popular destinations thrive due to the influx of visitors arriving by plane. Local economies benefit from tourism-related activities, including hospitality, restaurants, and souvenir shops. Conversely, overreliance on tourism can strain natural resources and cultural heritage. Technological Advancements Aerospace Engineering The development of airplanes has driven advancements in aerospace engineering. Innovations in materials, aerodynamics, and propulsion systems have led to more efficient and safer aircraft. Research in areas like supersonic flight, electric planes, and autonomous drones continues to shape the industry. Navigation and Communication Airplanes rely on sophisticated navigation systems, including GPS, radar, and inertial guidance. These technologies enhance safety, accuracy, and efficiency. Communication networks allow pilots to stay connected with air traffic control, other planes, and ground stations. Social and Cultural Effects Global Connectivity Airplanes have transformed our perception of distance. What once took weeks by ship or months by land can now be accomplished in hours. Families separated by oceans reunite, students study abroad, and cultural exchange flourishes. The world feels smaller, and our interconnectedness grows. Iconic Symbols Airplanes evoke a sense of wonder and adventure. The iconic silhouettes of jumbo jets, fighter planes, and vintage biplanes symbolize human achievement and exploration. Airshows, aviation museums, and historical flights celebrate this legacy. Challenges and Future Prospects Sustainability The aviation industry faces the challenge of reducing its environmental impact. Researchers explore alternative fuels, electric propulsion, and lightweight materials. Balancing growth with sustainability remains critical. Airspace Congestion As air travel becomes more accessible, airspace congestion intensifies. Efficient air traffic management, improved routes, and next-generation air traffic control systems are essential to prevent gridlock. Security and Safety Ensuring the safety of passengers, crew, and cargo remains paramount. Rigorous security protocols, maintenance standards, and emergency preparedness are vital. In conclusion, airplanes are more than mere vessels of transportation. They shape economies, connect cultures, and inspire innovation. As we soar into the future, let's navigate the skies responsibly, appreciating both the marvels and challenges of flight. The Effects of Space Travel on the Human Body Space travel, with its awe-inspiring vistas and boundless possibilities, has captivated humanity for decades. However, venturing beyond our home planet comes with a price - a price paid not only in technological challenges but also in the toll it takes on the human body. Let us explore the effects of space travel, from radiation exposure to altered gravity, and how astronauts adapt to these extreme conditions. 
Space Radiation: A Silent Threat Radiation Exposure On Earth, our protective magnetic field and atmosphere shield us from the majority of space radiation. However, in space, astronauts face direct exposure to cosmic rays and solar particles. These high-energy particles can penetrate the body, damaging cells and DNA. Increased risk of cancer and degenerative diseases, such as heart disease and cataracts, have been observed in human populations exposed to radiation on Earth. In space, health risks from radiation are mainly driven by long-term impacts. Altered Gravity: A Weighty Matter Microgravity and Muscle Atrophy Astronauts aboard the International Space Station (ISS) experience microgravity, where their bodies float freely. While this weightlessness allows for breathtaking experiments and observations, it wreaks havoc on muscles and bones. Without the constant pull of gravity, muscles weaken, and bones lose density. Astronauts must engage in rigorous exercise routines to counteract muscle atrophy and maintain bone health. Fluid Redistribution and Swollen Faces In microgravity, bodily fluids shift upward, causing facial puffiness and fluid retention. Astronauts often joke about their 'moon faces.' This fluid redistribution can also affect vision, leading to a condition known as spaceflight-associated neuro-ocular syndrome (SANS). Isolation and Confinement: The Mental Strain Psychological Challenges Space missions involve prolonged isolation and confinement. Astronauts live in tight quarters, cut off from the natural world. The absence of familiar sights, sounds, and smells can lead to feelings of loneliness and anxiety. Coping mechanisms, communication with loved ones, and psychological support are crucial to maintaining mental well-being. Distance from Earth: A Cosmic Solitude Emotional Impact The vastness of space can evoke existential thoughts. Astronauts gaze back at Earth - a tiny blue dot suspended in the cosmic void - and grapple with their insignificance. The emotional weight of being far from home, family, and friends can be profound. Hostile and Closed Environments: Surviving in the Void Spacecraft Living Conditions Spacecraft are marvels of engineering, but they are also confined capsules. Astronauts adapt to tight spaces, recycled air, and limited privacy. The constant hum of machinery and the absence of natural light can wear on their senses. Risk of Infection In closed environments, microbes thrive. Astronauts must maintain strict hygiene to prevent infections. The immune system faces unique challenges, especially during extended missions. The Resilience of Astronauts Adaptation and Innovation Astronauts are remarkable in their ability to adapt. They learn to navigate microgravity, perform complex tasks, and troubleshoot technical glitches. Their resilience drives innovation, leading to better spacecraft design and life support systems. The Twin Study: Scott and Mark Kelly Scott Kelly and his identical twin brother, Mark Kelly, participated in the unique Twins Study. Scott spent nearly a year aboard the ISS, while Mark remained on Earth. By comparing their physiological and psychological changes, researchers gained valuable insights into the effects of space travel. Looking Ahead: Mars and Beyond Challenges for Deep Space Missions As we plan for Mars missions and beyond, we face the RIDGE of space travel: Space Radiation: Shielding astronauts from cosmic rays. Isolation and Confinement: Maintaining mental health during long journeys. 
Distance from Earth: Coping with cosmic solitude. Gravity Fields: Addressing muscle and bone health. Hostile/Closed Environments: Ensuring safety and hygiene. In conclusion, space travel is a delicate balance between exploration and preservation. As we venture farther into the cosmos, we must safeguard both our scientific curiosity and the well-being of those who dare to explore the final frontier. The Environmental Impact of Airplanes and Spaceships Airplanes and spaceships have transformed the way we explore our planet and beyond. However, their operations come with significant environmental consequences. Let's delve into the effects of these flying machines on our delicate ecosystem. Climate Change Air travel is a major contributor to climate change due to greenhouse gas emissions. Jet engines burn fossil fuels (mostly aviation gasoline or jet fuel), releasing carbon dioxide (CO2), nitrogen oxides (NOx), and water vapor into the atmosphere. These emissions trap heat, leading to global warming. Although aviation accounts for about 3.5 percent of human-induced climate change, its impact is disproportionately high due to emissions at high altitudes. Air Quality Airplanes emit pollutants such as sulfur dioxide (SO2), particulate matter (PM), and volatile organic compounds (VOCs). These pollutants degrade air quality near airports and along flight paths. Ground-level ozone formation, which harms human health and ecosystems, is also influenced by aviation emissions. Noise Pollution The roar of jet engines disrupts communities around airports. Noise pollution affects sleep patterns, stress levels, and overall well-being. Efforts to reduce noise include quieter engine designs and flight path adjustments. Spaceships: Earth's Atmospheric Guardians Rocket Launches and Pollution Rocket launches, essential for space exploration, release pollutants into the atmosphere. The fuel used - such as unsymmetrical dimethylhydrazine (UDMH) - can be highly carcinogenic and ecologically damaging. For instance, the Baikonur Cosmodrome in Kazakhstan, the world's oldest spaceport, has left a large zone of pollution due to toxic rocket fuel seeping into the soil. Carbon Particles and Geo-Engineering Recent research highlights the impact of rocket emissions on the atmosphere. Black carbon (soot) particles from rockets can absorb heat, acting as a form of geo-engineering. As commercial space launches increase, so does the concern about their environmental effects. Balancing Exploration and Preservation Space Tourism The rise of space tourism introduces new challenges. As more people venture beyond Earth, we must consider the cumulative impact of rocket emissions. Balancing our curiosity with environmental stewardship is crucial. Sustainable Practices Efforts are underway to develop cleaner propulsion technologies, use alternative fuels, and minimize space debris. Innovations like reusable rockets and electric propulsion aim to reduce the environmental footprint of space travel. Looking Ahead: A Cosmic Responsibility Mars and Beyond As we dream of Mars colonies and interstellar travel, we must tread carefully. The RIDGE of space exploration - Radiation, Isolation, Distance, Gravity, and Environment - requires sustainable solutions. Let's explore the cosmos while safeguarding our home planet. In conclusion, airplanes and spaceships propel us toward the stars, but their effects ripple through our atmosphere and ecosystems. 
As stewards of both Earth and space, we must navigate the skies responsibly, seeking harmony between exploration and preservation. From the ground to the sky, dining experiences have transcended traditional restaurant settings. Imagine savoring gourmet meals while suspended high above the earth, with breathtaking views stretching as far as the eye can see. Welcome to the world of aerial dining, where culinary delights meet gravity-defying elegance. Dinner in the Sky: Elevating Gastronomy The Original Concept Dinner in the Sky, born in 2006, is the epitome of dining with a twist. Picture a massive table - more like a platform - hoisted almost 200 feet into the air by a sturdy crane. Guests, chefs, and waitstaff don their white hats as they ascend to the skies. The setting? A floating dinner table, surrounded by nothing but open air and panoramic vistas. The Experience As you settle into your seat, the anticipation builds. The restaurant staff orchestrates a three-course fine dining experience, all while suspended in midair. The menu features carefully crafted dishes, often prepared beforehand and finished in a convection oven right there in the sky. Each bite is accompanied by awe-inspiring views - city skylines, rolling landscapes, or even the vastness of the ocean. Safety First Before you ascend, a safety briefing ensures that you're securely strapped in. The thrill of being airborne mingles with the elegance of haute cuisine. Whether it's a romantic date night or a corporate event, Dinner in the Sky promises an unforgettable meal. Sky-High Restaurants Around the World Dubai Marina: A Feast Above the Waters Situated in Dubai Marina, this dining concept boasts some of the best views of the city skyline, surrounding waters, and the iconic Palm Jumeirah. Imagine floating above the ground while you dine - a one-of-a-kind experience you simply cannot miss. After the safety briefing near Skydive Dubai, you're hoisted 50 meters into the air, suspended over the bustling marina. The fusion of flavors meets the fusion of horizons. Las Vegas: Unparalleled Views of the Strip In the entertainment capital of the world, Dinner in the Sky Las Vegas takes fine dining to new heights - literally. As the sun sets, you ascend, and the glittering lights of the Las Vegas Strip come alive. The most unforgettable dinner you'll ever have awaits, with the cityscape stretching out beneath you. It's a feast for the senses, where culinary artistry meets architectural marvels. The Future of Aerial Gastronomy Sustainability and Innovation As we look ahead, the challenge lies in balancing indulgence with environmental responsibility. How can we minimize the carbon footprint of these lofty dining experiences? Innovations like electric-powered cranes, locally sourced ingredients, and waste reduction strategies are steps toward a more sustainable future. Beyond Earth: Space Tourism and Cosmic Cuisine With the rise of space tourism, could we soon dine among the stars? Imagine a celestial restaurant aboard a spacecraft, overlooking Earth from orbit. Cosmic cuisine - crafted by zero-gravity chefs - might become the ultimate bucket-list experience. As we explore the cosmos, let's ensure that our gastronomic adventures leave no trace behind. In conclusion, dining in the air transcends mere sustenance. It's a celebration of human ingenuity, a fusion of flavors and vistas, and a reminder that our appetite for exploration knows no bounds. So, raise your glass (carefully!) to the skies and savor the magic of dining aloft. 
Dining in the Sky is a unique and exhilarating culinary experience that elevates traditional dining to new heights - literally. Here are the key aspects of this extraordinary concept: The Setting: Up, Up, and Away! Imagine being seated at a massive table suspended high above the ground, often hundreds of feet in the air. The dining platform is typically hoisted by a sturdy crane or other mechanical means. Guests, chefs, and waitstaff ascend together, creating an unforgettable communal experience. The Experience: A Feast with a View As you settle into your seat, anticipation builds. The thrill of being airborne mingles with the elegance of haute cuisine. The menu features carefully crafted dishes, often prepared beforehand and finished on-site. Whether it's breakfast, lunch, or dinner, each course is served against a backdrop of breathtaking views - city skylines, rolling landscapes, or even the vastness of the ocean. The floating table becomes a stage for culinary artistry, where flavors dance amidst the clouds. Safety First: Buckle Up! Before ascending, guests receive a safety briefing. Straps secure them to their seats, ensuring a worry-free dining experience. The focus shifts from gravity to gastronomy as the platform rises, leaving the ground far below. Locations Around the World: Where the Sky Meets the Plate Dubai Marina: Suspended above the bustling marina, diners enjoy views of the city skyline and the iconic Palm Jumeirah. Las Vegas: As the sun sets, guests ascend over the glittering lights of the Las Vegas Strip, creating an unparalleled dining spectacle. The Future: Sustainability and Cosmic Cuisine Balancing indulgence with environmental responsibility is crucial. Innovations like electric-powered cranes and locally sourced ingredients aim to reduce the carbon footprint. Could cosmic cuisine be next? With the rise of space tourism, imagine dining aboard a spacecraft, overlooking Earth from orbit. Zero-gravity chefs crafting celestial dishes - it's a tantalizing prospect. Introduction The sky, our celestial canvas, is a dynamic theater where cosmic phenomena unfold. From twinkling stars to majestic planets, the sky offers a mesmerizing display that captivates astronomers and dreamers alike. In this essay, we'll explore the various elements of celestial weather, from meteor showers to planetary alignments. Stars and Constellations Stellar Climates Stars, like earthly weather patterns, exhibit their own 'climates.' Some stars burn fiercely, radiating intense heat, while others are cooler and more temperate. The constellations, those celestial neighborhoods, form intricate patterns across the night sky. Imagine them as cosmic weather maps, guiding our eyes to distant realms. Meteor Showers: Celestial Rainfall Meteor showers are cosmic storms, where Earth passes through debris left behind by comets. As these tiny particles collide with our atmosphere, they ignite, creating streaks of light - the meteors. The Perseids in August and the Geminids in December are celestial fireworks, painting the sky with ephemeral beauty. Planets and Their Dance Planetary Weather Systems Our solar system hosts a diverse range of planets, each with its own atmospheric conditions. Venus, shrouded in thick clouds of sulfuric acid, experiences hurricane-force winds. Mars, with its rusty surface, battles dust storms that engulf the entire planet. Jupiter's Great Red Spot - a colossal storm - has raged for centuries. Conjunctions and Oppositions Planets engage in a cosmic ballet. 
Conjunctions occur when two planets appear close together in the sky, as if sharing a celestial embrace. Oppositions, on the other hand, position a planet directly opposite the Sun, making it visible all night. Witnessing Mars during opposition feels like meeting an old friend. Lunar Weather Phases of the Moon The Moon, Earth's faithful companion, cycles through its phases. New Moon, First Quarter, Full Moon - the lunar weather changes predictably. During a lunar eclipse, our planet casts a shadow on the Moon, turning it coppery red. It's a cosmic reminder of our place in the grand celestial drama. Tides: The Ocean's Cosmic Response The Moon's gravitational pull orchestrates tides on Earth. High tides and low tides ebb and flow, responding to lunar cues. The celestial dance between Earth, Moon, and Sun shapes our oceans, affecting coastlines and marine life. Celestial Events Comets: Cosmic Visitors Comets, celestial vagabonds, journey through our solar system. Their icy cores release gas and dust, forming magnificent tails. Halley's Comet, a recurring visitor, graces our skies once every 76 years. Its return is a cosmic homecoming. Supernovae: Stellar Explosions When massive stars reach the end of their lives, they explode in brilliant supernovae. These cosmic fireworks outshine entire galaxies. Witnessing a supernova - a rare event - is like glimpsing the universe's raw power. Conclusion As we gaze upward, let's remember that the sky is not merely a backdrop but a living, breathing entity. Its weather - both familiar and otherworldly - shapes our cosmic experience. So, next time you look up, consider the celestial forecast: a blend of stardust, wonder, and infinite possibilities. In the words of Carl Sagan, 'The cosmos is within us. We are made of star-stuff.' Cosmic Mysteries Dark Matter and Dark Energy The sky harbors secrets beyond our comprehension. Among them are dark matter and dark energy. Dark matter, invisible and elusive, exerts gravitational influence on galaxies, holding them together. Imagine it as the cosmic glue binding the universe. Dark energy, on the other hand, accelerates the universe's expansion, pushing galaxies apart. These cosmic enigmas remain shrouded in mystery, awaiting discovery. Auroras: Celestial Light Shows When charged particles from the Sun collide with Earth's magnetic field, they create auroras - the ethereal dance of light near the poles. The Northern Lights (Aurora Borealis) and Southern Lights (Aurora Australis) paint the night sky with hues of green, pink, and purple. These celestial ballets remind us of our interconnectedness with the solar system. Celestial Timekeeping Stellar Clocks The sky serves as humanity's oldest timekeeper. Ancient civilizations relied on celestial events for calendars. The sidereal day, based on Earth's rotation relative to distant stars, is approximately 23 hours, 56 minutes, and 4 seconds. Constellations rise and set, marking the passage of time - a cosmic heartbeat. Eclipses: Celestial Alignments Solar and lunar eclipses are cosmic alignments. During a solar eclipse, the Moon obscures the Sun, casting a shadow on Earth. The eerie twilight and the diamond ring effect evoke awe. Lunar eclipses, when Earth's shadow engulfs the Moon, transform it into a reddish orb - an astronomical spectacle witnessed by civilizations across millennia. Cosmic Harmony Music of the Spheres Ancient philosophers believed in the 'music of the spheres.' They imagined celestial bodies - planets, stars, and moons - emitting harmonious vibrations. 
Each celestial note contributed to a cosmic symphony. While we no longer hear this celestial music, its metaphorical resonance persists - a reminder that the universe hums with hidden melodies. Galactic Weather Patterns Galaxies, like weather systems, evolve. Spiral galaxies, with their graceful arms, resemble cosmic hurricanes. Elliptical galaxies, shaped like celestial footballs, harbor dormant black holes at their cores. Colliding galaxies create celestial tempests, birthing new stars. The cosmic weather forecast predicts galactic collisions, stellar births, and cosmic winds. Conclusion: Our Cosmic Home As we conclude our cosmic odyssey, remember that the sky is not an abstract canvas - it's our celestial home. Whether you're stargazing from a mountaintop or contemplating the Moon's craters, you participate in the grand cosmic narrative. The sky whispers tales of creation, destruction, and eternity. So, dear reader, look up. Embrace the celestial weather - the storms and serenades. For in the vastness of space, we find wonder, humility, and a shared cosmic kinship. As Carl Sagan eloquently put it, 'We are a way for the cosmos to know itself.' Introduction The universe is a symphony, and planets are its celestial notes. These enigmatic orbs dance around stars, weaving tales of creation, destruction, and cosmic balance. In this essay, we embark on a cosmic journey to explore the eight planets of our solar system and their profound significance. Mercury: The Swift Messenger Mercury, the swiftest planet, orbits closest to the Sun. Its surface is a rugged landscape of craters and cliffs, baked by scorching temperatures during the day and chilled at night. Named after the Roman messenger god, Mercury shuttles between extremes, delivering cosmic messages across the solar system. Venus: Earth's Fiery Twin Venus, Earth's twin sister, hides behind thick clouds of sulfuric acid. Its surface resembles a volcanic inferno, with temperatures hot enough to melt lead. Yet, its beauty lies in its radiant glow - the Morning and Evening Star - illuminating our dawn and dusk. Earth: Our Blue Gem Earth, our precious home, teems with life. Its oceans, forests, and deserts form a delicate biosphere. From the icy poles to the equatorial rainforests, Earth's diverse climates sustain a symphony of ecosystems. We are its guardians, entrusted with its care. Mars: The Red Planet's Mysteries Mars, the Red Planet, beckons explorers. Its rusty surface bears ancient river valleys and polar ice caps. Could Mars harbor hidden reservoirs of life? Robotic rovers traverse its deserts, seeking answers beneath its crimson skies. Jupiter: King of the Gas Giants Jupiter, the colossal gas giant, boasts a mesmerizing tapestry of bands and storms. Its Great Red Spot - a tempest larger than Earth - has raged for centuries. Jupiter's gravitational pull shapes the solar system, protecting inner planets from cosmic debris. Saturn: Jewel of the Rings Saturn, adorned with majestic rings, is a cosmic jewel. These icy hoops, composed of countless particles, create a celestial ballet. Saturn's moons - Titan, Enceladus, and others - beckon us to explore their icy landscapes. Uranus: The Original Ice Giant Uranus, tipped on its side, spins like a cosmic top. Its icy blue hue conceals turbulent storms. Uranus remains a mystery, awaiting further study by future missions. Neptune: The Farthest Wanderer Neptune, shrouded in azure clouds, is the outermost planet. 
Its winds whip at supersonic speeds, and its icy heart harbors storms that rival Jupiter's. Voyager 2, our interstellar traveler, captured Neptune's beauty as it sailed past. Conclusion: Cosmic Harmony Planets are cosmic harmonizers. Their gravitational dances sculpt orbits, stir tides, and guide comets. They remind us of our place in the grand cosmic orchestra. As we gaze at the night sky, let us cherish these celestial companions - the guardians of harmony. In the words of Carl Sagan, 'We are made of star-stuff.' Our existence echoes the cosmic rhythm, and planets are our celestial partners in this cosmic waltz. Pluto, once considered our ninth planet, now holds the title of a dwarf planet. The International Astronomical Union (IAU) made this reclassification in 2006. Pluto didn't meet one of the three criteria the IAU uses to define a full-sized planet: it has not cleared its neighboring region of other objects. Despite its demotion, Pluto remains a fascinating member of the Kuiper belt, a ring of bodies beyond Neptune's orbit. It is the ninth-largest and tenth-most-massive known object to directly orbit the Sun. Although smaller than Earth's moon, Pluto's icy and rocky composition continues to intrigue astronomers and stargazers alike. NASA's New Horizons mission is a remarkable endeavor that has expanded our understanding of the outer reaches of our solar system. Let's delve into the details of this pioneering spacecraft: Objective: New Horizons was designed to study the dwarf planet Pluto, its moons, and other objects in the Kuiper Belt. Launch Date: On January 19, 2006, New Horizons embarked on its epic journey. Spacecraft Mass: Weighing 1,054 pounds (478 kilograms), it carried a suite of scientific instruments. Mission Design and Management: The mission was led by NASA in collaboration with the Johns Hopkins University Applied Physics Laboratory (APL). Historic Flyby: On July 14, 2015, New Horizons made history by becoming the first spacecraft to explore Pluto up close. It captured stunning images of Pluto's diverse geological features, including its icy plains, rugged mountains, and frozen canyons. Moons of Pluto: During the flyby, New Horizons also studied Pluto's five moons, including the intriguing Charon. Arrokoth Flyby: In early 2019, New Horizons achieved another milestone by flying past Arrokoth (2014 MU69). Arrokoth is a Kuiper Belt Object, making it the most distant object ever explored up close. Kuiper Belt: This region extends from about 30 AU (near Neptune's orbit) to about 50 AU from the Sun. New Horizons ventured into this uncharted territory. New Horizons carried an impressive array of instruments, including: Ralph: A visible and infrared imager/spectrometer. Alice: An ultraviolet imaging spectrometer. Radio-Science Experiment (REX): Studied radio signals. Long-Range Reconnaissance Imager (LORRI): Captured high-resolution images. Solar Wind and Plasma Spectrometer (SWAP): Analyzed solar wind. Pluto Energetic Particle Spectrometer Science Investigation (PEPSSI): Studied particles around Pluto. Student Dust Counter (SDC): Measured dust impacts. New Horizons provided insights into Pluto's atmosphere, surface, and geology. It revealed icy mountains, glaciers, and mysterious dark regions. The spacecraft also observed Jupiter's moons (Io, Europa, and Ganymede) during its long journey. As of 2023, New Horizons continues to explore the outer solar system, contributing to our understanding of distant bodies. 
In summary, New Horizons has been a trailblazer, revealing the secrets of Pluto and venturing into the cosmic frontier. Its legacy inspires future missions and fuels our curiosity about the cosmos. ", + "8192": "Once upon a time, in a quaint little village nestled amidst rolling hills, there existed an old teapot. But this was no ordinary teapot; it was a magical one. Its handle curved just so, and its spout seemed to whisper secrets to the wind. The villagers called it 'Elara,' and they believed it held the power to grant wishes. Elara sat on the windowsill of Mrs. Abernathy's cozy cottage. Mrs. Abernathy was a kind-hearted woman with twinkling eyes and a penchant for herbal teas. She'd inherited the teapot from her grandmother, who, in turn, had received it from a mysterious traveler. One chilly evening, as the sun dipped below the horizon, Mrs. Abernathy brewed her favorite chamomile tea. She poured the fragrant liquid into Elara, and to her astonishment, the teapot began to glow. The room filled with a soft, golden light, and Mrs. Abernathy felt a tingle in her fingertips. 'Make a wish,' whispered Elara, its spout quivering. Mrs. Abernathy hesitated. She'd heard tales of wishes gone awry - of greedy desires leading to unintended consequences. But her heart yearned for something simple: a garden filled with blooming roses. So, she closed her eyes and wished for just that. The next morning, Mrs. Abernathy stepped outside, and her breath caught. The air smelled of roses - sweet and heady. But when she looked around, she gasped. Her modest garden had transformed into a riot of colors. Roses of every hue - crimson, ivory, apricot - bloomed in profusion. They climbed the walls, twined around the picket fence, and even spilled onto the cobblestone path. Word spread throughout the village, and soon everyone wanted a turn with Elara. The baker wished for the perfect sourdough loaf, and it appeared in his oven. The blacksmith wished for strength, and his arms bulged with newfound muscle. The schoolteacher wished for wisdom, and her lectures became captivating tales. But as wishes multiplied, so did the consequences. The baker's sourdough grew sentient and demanded to be called 'Doughbert.' The blacksmith's strength made him accidentally crush his anvil. And the schoolteacher's wisdom led her to question the very fabric of reality. Mrs. Abernathy watched with a mix of amusement and concern. Elara seemed to thrive on granting wishes, but its porcelain surface bore faint cracks. Was it growing weaker? One day, a young girl named Lily approached Elara. Her eyes sparkled with innocence, and she clutched a dandelion in her hand. 'Teapot,' she said, 'I wish for a friend.' Elara hesitated. It sensed the purity of Lily's heart, but it also knew the weight of loneliness. With a shudder, it granted the wish. And so, Lily's dandelion transformed into a giggling sprite named Petal. They danced through meadows, shared secrets, and became inseparable. Elara's cracks deepened, but it didn't mind. As seasons passed, Mrs. Abernathy sat by the window, watching Elara fade. Yet, she felt no regret. For in granting wishes, the teapot had found purpose. And perhaps, just perhaps, it had one final wish left - to be remembered. And so, when Mrs. Abernathy's time came, she whispered to Elara, 'Thank you.' The teapot glowed one last time, and Mrs. Abernathy drifted away, leaving behind a garden of roses and a village full of stories. 
And that, my dear reader, is how the enchanted teapot became a legend - a vessel of magic, love, and wishes granted with a fragile heart. As the seasons changed, so did the village. The once-sleepy hamlet now buzzed with visitors from distant lands. They came seeking Elara, the legendary teapot that granted wishes. Some sought riches, others fame, but most yearned for something deeper - a connection to the mystical. Among the newcomers was a weary traveler named Ezra. His cloak was tattered, and his boots bore the marks of countless miles. He'd heard whispers of Elara's magic and hoped it could mend his broken heart. For Ezra had lost his beloved, and grief weighed upon him like an anchor. Mrs. Abernathy, now an old woman with silver hair, welcomed Ezra into her cottage. Elara sat on the windowsill, its porcelain surface etched with memories. Mrs. Abernathy poured chamomile tea into the teapot, and it glowed faintly, as if recognizing an old friend. 'Make a wish,' Mrs. Abernathy said, her voice soft. Ezra hesitated. His wish was simple yet profound: to see his love once more, if only in a dream. He closed his eyes and whispered, 'I wish for a single night with her.' Elara trembled, its spout quivering. It understood the ache of lost love - the longing that transcended time. And so, it granted Ezra's wish. That night, as the moon hung low in the sky, Ezra lay on Mrs. Abernathy's creaky bed. Elara sat beside him, its glow illuminating the room. He drifted into slumber, and there, in the realm between wakefulness and dreams, he found himself in a moonlit garden. His love, Isolde, stood before him. Her eyes were the color of forget-me-nots, and her laughter echoed like wind chimes. They danced beneath a silver canopy, twirling through memories - their first kiss, stolen moments by the river, promises whispered under ancient oaks. But dreams are fragile, and dawn approached. Isolde's form wavered, and Ezra clung to her. 'Stay,' he pleaded. 'Just a little longer.' Isolde smiled, her touch like a butterfly's kiss. 'Time bends here,' she said. 'But you must wake, my love.' As the sun peeked over the horizon, Ezra opened his eyes. Elara sat on the windowsill, its glow fading. Mrs. Abernathy watched him, her gaze knowing. 'Did you see her?' she asked. Ezra nodded, tears glistening. 'She was real, Mrs. Abernathy. I held her again.' The village marveled at Ezra's tale - the man who danced with a ghost. They flocked to Elara, each with their wishes. The blacksmith wished for forgiveness, the baker for inspiration, and the schoolteacher for courage. Elara obliged, its cracks deepening, but it never complained. One day, as winter painted the landscape white, Mrs. Abernathy grew frail. She called Ezra to her bedside. 'Elara's magic wanes,' she whispered. 'But it has one final wish.' Ezra knelt beside her. 'What is it?' 'Take Elara beyond the hills,' Mrs. Abernathy said. 'To the ancient oak where Isolde and I carved our initials. There, bury the teapot. It will become part of the earth, and its magic will seep into the roots.' And so, on a frost-kissed morning, Ezra carried Elara to the oak. He dug a small hole, placed the teapot inside, and covered it with soil. As he patted the ground, he felt a tremor - a farewell. The next spring, the oak bloomed with roses - crimson, ivory, apricot. And in its shade, a dandelion sprouted. Its petals glowed like moonlight, and when the wind whispered, it carried echoes of laughter. Ezra knew then that Elara's wish had come true. 
It had become part of the land, woven into the fabric of stories. And perhaps, just perhaps, it still listened, granting silent wishes to those who believed. And so, the legend of Elara lived on - a teapot turned earth, a vessel of love, and a bridge between worlds. In the heart of the Whispering Forest, where ancient trees leaned close and their leaves murmured secrets, lived a young girl named Evelyn. She had eyes the color of moss and hair that tangled like wild vines. Evelyn was no ordinary child; she could hear the forest's whispers - the soft rustle of leaves, the creaking of branches, and the laughter of unseen creatures. The villagers feared the Whispering Forest. They said it was cursed - a place where time flowed differently, where shadows danced with mischief, and where lost souls wandered forever. But Evelyn felt drawn to its heart. She believed the forest held answers - about her missing parents, about the world beyond the village. One moonlit night, when the forest beckoned with silver fingers, Evelyn slipped away from her tiny cottage. She wore a cloak spun from spider silk and carried a lantern that glowed like a captured star. The trees leaned in, their bark etched with ancient runes. They whispered her name - Evelyn, Evelyn - as if they knew her purpose. Deeper she ventured, past gnarled roots and dew-kissed ferns. The air smelled of moss and memories. The lantern's light flickered, casting eerie shadows on the forest floor. And then, she heard it - the melody of the Whispering Forest. It was a haunting tune, sung by unseen lips, and it tugged at her heart. 'Who are you?' Evelyn whispered. The forest answered - a chorus of voices, overlapping and harmonizing. 'We are the echoes of forgotten dreams, the guardians of lost paths. Seek what you desire, but beware the price.' Evelyn pressed on. She reached a clearing where moonflowers bloomed - a sea of pale petals that glowed like fallen stars. In their midst stood a stone pedestal, and atop it rested a silver key. It was unlike any key she'd seen - twisted and delicate, with a single emerald set in its bow. The whispers intensified. 'Take the key,' they urged. 'Unlock the door to your destiny.' Evelyn hesitated. What door? What destiny? She thought of her parents - their laughter, their scent of pine and adventure. They'd vanished when she was a baby, leaving only a crumpled map with cryptic symbols. With trembling fingers, she picked up the key. It felt warm, alive. And then, she saw it - a door, half-hidden behind an ancient oak. Its wood was etched with constellations, and its handle bore the same emerald as the key. Evelyn inserted the key into the lock. The door groaned open, revealing a tunnel - a ribbon of darkness that wound deeper into the forest. The whispers grew urgent. 'Step through, Evelyn. Find your truth.' She stepped into the tunnel, and the world shifted. Time blurred, and she glimpsed her parents - laughing, dancing, fading like smoke. The tunnel led to a chamber - a celestial cavern where stars swirled in liquid patterns. And there, on a stone pedestal, lay a crystal vial. The whispers crescendoed. 'Drink,' they urged. 'Remember.' Evelyn uncorked the vial. Memories flooded her - the scent of pine, her parents' laughter, the taste of adventure. Tears blurred her vision. She drank, and the forest embraced her - a cocoon of whispers, of love, of belonging. When Evelyn emerged, the Whispering Forest had changed. It no longer whispered of curses but sang of hope. 
She carried her parents' memories - their legacy - and vowed to protect the forest's secrets. And so, Evelyn became the new guardian. She tended the moonflowers, listened to the trees, and sang the haunting melody. The villagers no longer feared the forest; they sought its solace, its magic. And every night, as the moon rose, Evelyn stood by the ancient oak. She whispered her parents' names, and the forest whispered back - a lullaby woven from stardust and love. Beyond the Whispering Forest, where the moonflowers bloomed and the stars whispered secrets, lay a forgotten path. It was a narrow trail, overgrown with moss and guarded by ancient stones. Few dared to tread there, for it led to the Compass Grove. Lysander, a young cartographer with ink-stained fingers and a heart full of wanderlust, stumbled upon this path one misty morning. His boots sank into damp earth, and the air smelled of pine and possibility. He carried a tattered map - a relic passed down through generations. Its edges bore cryptic symbols, and its center held a blank space - an uncharted territory. The Compass Grove was said to house a mystical compass - the Wayfinder's Compass - forged by the first explorers. It was no ordinary instrument; it pointed not to north, but to one's true desire. Legends whispered that whoever held the compass could navigate not only the physical world but also the labyrinth of their own heart. Lysander's pulse quickened. He yearned for adventure - to map uncharted lands, to unravel mysteries. His parents had vanished during an expedition, leaving behind a single clue: the blank space on the map. Perhaps the Compass Grove held answers. As he pushed through brambles and ferns, the forest seemed to guide him. Sunlight filtered through leaves, casting dappled patterns on the ground. And then, he saw it - a circle of ancient stones, their surfaces etched with symbols. At the center stood a pedestal, and atop it rested the Wayfinder's Compass. Lysander's breath caught. The compass was unlike any he'd seen. Its needle shimmered like a captured star, and its dial bore not cardinal directions but enigmatic words: Dreams, Regret, Destiny, and Hope. He touched the compass, and it hummed - a vibration that resonated in his bones. The whispers began - the voices of long-lost explorers, of forgotten dreams. 'Choose,' they urged. 'Choose your path.' Lysander hesitated. Dreams? Regret? Destiny? Hope? Each word held a promise, a peril. He thought of his parents - their laughter, their courage. He thought of the blank space on the map - the uncharted territory that beckoned. And so, he turned the dial to Dreams. The needle quivered, then settled - a path leading deeper into the forest. Lysander followed, lantern in hand, heart pounding. The compass guided him past silver streams and ancient oaks. It led him to a hidden waterfall - a curtain of moonlight that shimmered like stardust. There, he glimpsed a figure - a woman with eyes like forgotten constellations. She wore a cloak spun from spider silk, and her hair flowed like a river. 'Lysander,' she said, her voice a melody. 'You seek dreams.' He nodded. 'I seek answers. About my parents.' The woman touched his forehead, and memories flooded him - the scent of pine, his parents' laughter, the taste of adventure. 'Dreams are maps,' she said. 'They guide us beyond what we see.' Lysander understood. Dreams were compasses of the soul. His parents had followed theirs, and now he would follow his. He stepped through the waterfall, and the world shifted. 
He found himself on a cliff overlooking a vast sea - a sea of blank parchment. Islands floated in the distance, waiting to be charted. Lysander unrolled his map - the one with the blank space - and dipped his quill. He drew coastlines, marked mountains, and named each land. And as he mapped, the compass glowed - a beacon of dreams fulfilled. Lysander knew then that he was not merely a cartographer; he was a dreamweaver. His parents' legacy flowed through him - their courage, their laughter, their love. And so, Lysander sailed the uncharted seas, guided by the Wayfinder's Compass. He discovered islands of forgotten myths, forests of whispered tales, and cities where stars danced in the streets. He wrote his own story - a cartography of dreams. And in the Compass Grove, the ancient stones whispered his name - Lysander, Lysander - as if they knew he'd found his true north. In the heart of the city, where cobblestone streets wound like forgotten memories, stood an abandoned mansion. Its windows were boarded up, and ivy clung to its crumbling walls. But within those decaying walls lay a secret - a clockwork garden. Evelyn, a curious girl with eyes like rain-kissed petals, discovered the mansion one rainy afternoon. She wore mismatched socks and carried a notebook filled with sketches - a testament to her love for hidden wonders. The mansion's gate creaked open, and Evelyn stepped into a world frozen in time. The clockwork garden was unlike any other. Its flowers were made of gears and springs, their petals unfolding with precise clicks. The roses ticked, the daffodils whirred, and the tulips chimed. And at the center stood a colossal mechanical tree - its branches reaching toward the sky, its leaves spinning like miniature windmills. Evelyn gasped. She'd read about clockwork wonders - the automatons that danced at royal balls, the pocket watches that whispered secrets. But this garden was alive - a symphony of metal and magic. As she explored, she noticed a silver key embedded in the tree's trunk. It gleamed, beckoning her. Evelyn hesitated. What did the key unlock? And why had the clockwork garden been abandoned? The flowers seemed to whisper. 'Unlock the tree,' they urged. 'Discover its heart.' Evelyn turned the key. The tree shuddered, and its branches parted, revealing a hidden chamber. Inside, a mechanical heart pulsed - a delicate contraption of brass and crystal. It hummed, resonating with the rhythm of forgotten time. And then, she heard it - the voice of the tree. 'I am Chronos,' it said. 'Guardian of moments.' Evelyn's heart raced. 'Moments?' 'Every petal, every leaf,' Chronos explained. 'They hold memories - the laughter of lovers, the tears of parting, the whispers of dreams. But time has fractured. The clockwork garden is frozen, and I am fading.' Evelyn understood. The mansion's former owner - a clockmaker named Lysander - had built this garden to capture fleeting moments. But Lysander had vanished, leaving Chronos incomplete. 'I can mend you,' Evelyn said. 'But why was the garden abandoned?' Chronos sighed - a sound like winding gears. 'Lysander sought eternity. He believed that by freezing time, he could preserve love, prevent loss. But he forgot that life thrives in impermanence.' Evelyn touched the mechanical heart. 'Can we fix it?' Chronos nodded. 'You must find Lysander's final creation - the Celestial Gear. It lies beyond the city, where the river meets the stars.' And so, Evelyn embarked on her quest. She followed the river, past moonlit bridges and forgotten docks. 
The Celestial Gear awaited - a constellation of interlocking wheels, its center a pulsing light. As she placed the gear in Chronos's heart, the clockwork garden stirred. Flowers bloomed, petals unfurling with joy. The mechanical tree's leaves spun faster, and time flowed once more. But Chronos grew weaker. 'I am bound to this place,' it said. 'My purpose fulfilled.' Evelyn wept. 'Can't you come with me?' Chronos smiled - a clockwork smile. 'I am part of the garden now. But you, dear Evelyn, carry its memory.' And so, she returned to the mansion, where the clockwork garden thrived. She sketched its wonders, capturing gears and petals on paper. And when she closed her eyes, she heard the whispers - the laughter of lovers, the tears of parting, the echoes of dreams. Evelyn became the new guardian. She tended the flowers, wound the tree, and listened to Chronos's fading heartbeat. And every night, as the stars wheeled overhead, she whispered her thanks. For in the heart of the clockwork garden, time danced - a fragile waltz of moments, preserved and cherished. In the heart of the Astronomer's Quarter, where cobblestone streets wound like celestial paths, stood an ancient observatory. Its domed roof bore the scars of countless meteor showers, and its telescopes whispered secrets to the night sky. But within those hallowed walls lay a mystery - a forgotten constellation. Aria, a young stargazer with eyes like distant galaxies, discovered the observatory one moonless night. She wore a cloak spun from stardust and carried a pocket-sized atlas - a testament to her love for the heavens. The observatory's door creaked open, and Aria stepped into a world woven with cosmic threads. The forgotten constellation was unlike any other. Its stars were elusive, their positions shifting with each passing century. Astronomers had once mapped it - a celestial tapestry of myth and memory - but over time, its name faded, its stories lost. As Aria explored, she noticed a silver quill resting on an ancient star chart. Its nib gleamed, beckoning her. Aria hesitated. What secrets did the quill hold? And why had the forgotten constellation slipped from memory? The stars seemed to whisper. 'Write,' they urged. 'Illuminate the night.' Aria dipped the quill in ink. The constellations above shifted - a celestial dance awaiting completion. She traced the forgotten lines - the Hunter's Bow, the Weaver's Loom, the Lost Lyre. And then, she saw it - a gap in the sky, a void where a constellation once blazed. The quill hummed - a vibration that resonated in her bones. The whispers intensified. 'Remember,' they urged. 'Remember the story.' And so, Aria wrote - a tale woven from stardust and longing. She penned the forgotten constellation's name: Lyra's Veil. Its stars had once guided lovers across oceans, inspired poets to verses, and cradled dreams in their luminous arms. But Lyra's Veil had vanished - a casualty of time's relentless march. Its stories faded, its purpose lost. Aria vowed to restore it - to stitch the celestial fabric, thread by thread. She climbed to the observatory's rooftop, where telescopes pointed toward infinity. Aria gazed at the sky, her breath mingling with the Milky Way. And there, in the gap, she saw it - the faint glimmer of Lyra's Veil. The quill guided her. She drew the missing lines - the Weaver's Loom reconnected, the Lost Lyre's melody restored. And as she wrote, the stars responded. Lyra's Veil emerged - a constellation reborn. But Aria felt a pull - a cosmic yearning. 
She touched the quill to her heart, and memories flooded her - the scent of stardust, her grandmother's bedtime stories, the taste of wonder. 'Guard it,' whispered the stars. 'Guard Lyra's Veil.' And so, Aria became the new guardian. She tended the observatory, charted the skies, and whispered the forgotten stories. The astronomers marveled - the gap was gone, and Lyra's Veil blazed once more. But Aria knew her duty. She would write new tales - of love, of courage, of dreams stitched together. And every night, as the constellations wheeled overhead, she whispered her thanks. For in the heart of the forgotten constellation, time danced - a fragile waltz of memory, preserved and cherished. In the heart of the bustling city, where skyscrapers touched the clouds and neon signs flickered like distant stars, lived a forgotten runner named Evelyn. She wasn't famous like the sprinters on billboards or the marathon champions with their gleaming medals. No, Evelyn was an ordinary woman who ran for the sheer joy of it. Every morning, before the sun peeked over the horizon, Evelyn laced up her worn-out sneakers. She followed the same route - a loop around the park, past the fountain where pigeons bathed, and along the riverbank where willow trees whispered secrets. Her pace was steady, her breaths rhythmic. She ran not to win races but to escape the noise of life - to find solace in the rhythm of her footsteps. But the city had forgotten Evelyn. The sports channels didn't broadcast her runs, and the local newspapers didn't write about her achievements. She was a lone figure - a silhouette against the dawn, chasing dreams that no one else cared about. One chilly morning, as Evelyn jogged along the river, she noticed a poster taped to a lamppost. It announced the city's annual marathon - the grand event that drew elite athletes from around the world. Evelyn's heart skipped a beat. She'd never run a marathon, but the idea tugged at her like a distant constellation. She tore off the poster and studied it. The race would wind through the city's streets, past cheering crowds and historic landmarks. The finish line was the grand stadium - the same stadium where she'd watched her heroes cross the tape, their names echoing through the loudspeakers. Evelyn hesitated. She wasn't a professional runner. She didn't have a coach or a team. But something stirred within her - a longing to be part of the marathon, to leave her mark on the city she loved. And so, she trained. She woke earlier, ran farther, and pushed her limits. She practiced pacing, fueled by oatmeal and determination. The other runners didn't notice her - a middle-aged woman with graying hair - but Evelyn didn't mind. She was a comet streaking through the pre-dawn darkness, fueled by her own quiet fire. On marathon day, the city buzzed with excitement. The streets were lined with spectators - families with homemade signs, old couples in folding chairs, children waving tiny flags. The elite runners surged ahead, their strides effortless. But Evelyn was in the middle of the pack - a forgotten runner among thousands. As she crossed each mile marker, Evelyn felt a surge of pride. She wasn't breaking records, but she was breaking barriers - the ones she'd built around herself. The cheers of the crowd fueled her - their encouragement like solar winds pushing her forward. And then, at mile 20, exhaustion hit. Evelyn's legs wobbled, her breaths came in ragged gasps. She glanced at the grand stadium - the finish line shimmering like a distant galaxy. 
But her body rebelled. She wanted to collapse, to fade into anonymity. And that's when she saw him - a young boy with a crumpled sign. It read, 'Go, Evelyn! You're not forgotten.' Tears blurred her vision. She pushed through the pain, her heartbeat a metronome of determination. As Evelyn crossed the finish line, the crowd erupted. The loudspeakers blared her name - Evelyn, Evelyn - and the forgotten runner became a star. She collapsed into the arms of a volunteer, her legs trembling. But she'd done it. She'd run the marathon - the one that mattered to her. The newspapers wrote about her - the woman who defied odds, who ran not for glory but for love. And the city remembered Evelyn - the forgotten runner who'd become a constellation, lighting the way for others. Lysander stood at the finish line of the marathon, his chest heaving, sweat-soaked shirt clinging to his skin. The stadium roared - a symphony of applause and encouragement. But amidst the cheers, he felt a void - an ache that no medal could fill. He'd run the race - the one that mattered to him. Yet, as he caught his breath, Lysander wondered about the blank space on his map. The uncharted territory - the reason his parents had vanished - still haunted him. A shadow fell across the track. It was Evelyn, the forgotten runner. Her eyes sparkled with determination, and her worn-out sneakers bore the marks of countless miles. She'd finished the marathon too, her name echoing through the loudspeakers. 'Evelyn,' Lysander said, his voice hoarse. 'Why do we run?' She leaned against the railing, gazing at the city beyond. 'For the same reason we map,' she replied. 'To find what's lost.' Lysander nodded. 'The Compass Grove,' he said. 'The Wayfinder's Compass.' Evelyn's eyes widened. 'You know of it?' He traced the blank space on his map - the gap where the forgotten constellation should be. 'My parents sought it,' Lysander confessed. 'They believed it held answers - about time, about destiny.' Evelyn's fingers brushed the silver quill in her pocket. 'And did they find it?' He shook his head. 'They vanished. But I won't stop searching.' Together, they left the stadium - the forgotten runner and the cartographer. They followed the same path - the one that led beyond the city, into the Whispering Forest. The compass guided them - the needle pointing not to north, but to dreams. As they reached the ancient stones of the Compass Grove, Evelyn gasped. 'Look,' she said, her voice hushed. There, etched into the stones, were symbols - the Weaver's Loom, the Lost Lyre, and the Hunter's Bow. And at the center stood the pedestal - the Wayfinder's Compass. Lysander touched it - the needle quivering. 'What do we seek?' he asked. Evelyn's eyes held galaxies. 'Not just answers,' she said. 'But connection - to the forgotten, to each other.' And so, they turned the dial - to Hope. The compass hummed, and the forest whispered. A path opened - a ribbon of moonlight leading deeper. They stepped through, and the world shifted. Stars swirled - a celestial dance. And there, in the gap, they saw it - the forgotten constellation. Lyra's Veil blazed - a tapestry of memories, stitched by stardust. Its stars guided lovers, inspired poets, and cradled dreams. Lysander and Evelyn held hands - the cartographer and the runner. They traced the lines - the Weaver's Loom reconnected, the Lost Lyre's melody restored. And as they gazed at Lyra's Veil, they felt it - a cosmic yearning. Not for fame or medals, but for eternity - the kind woven into forgotten constellations. 
Together, they whispered their thanks - to the stars, to the forest, to each other. In the small town of Maplewood, basketball was more than a game - it was a way of life. The local high school gym, with its creaky wooden floors and flickering lights, held memories etched into the hearts of generations. Tommy Reynolds, a lanky teenager with dreams as big as the full moon, had grown up shooting hoops in that gym. His father, a former basketball star, had taught him the art of the game - the perfect arc of a jump shot, the rhythm of dribbling, and the magic of teamwork. But Tommy wasn't like his father. He lacked the height and the natural talent. Still, he practiced tirelessly, his sneakers squeaking on the polished floor. He'd stare at the faded championship banners hanging from the rafters - the ones his father had helped win - and imagine his own name there someday. Senior year arrived, and Tommy made the varsity team. He wasn't a star player, but he hustled, diving for loose balls and setting screens. The crowd cheered louder for the flashy slam dunks, but Tommy's heart beat for the fundamentals - the bounce pass, the defensive stance, the pick-and-roll. The state championship game loomed - a David-and-Goliath matchup against the undefeated Oakwood Tigers. They had a towering center, a lightning-fast point guard, and a reputation for crushing opponents. Maplewood was the underdog, the team with heart but not much else. As the final seconds ticked away, the score was tied. Tommy stood at center court, sweat dripping down his face. The gym seemed to hold its breath. He glanced at the banners - the ghosts of champions past urging him on. The ball found its way to Tommy. He dribbled, eyes scanning the court. His father's voice echoed in his mind: 'Trust your instincts, son.' He drove toward the basket, the Tigers' defense closing in. But instead of taking the shot, Tommy passed - the perfect bounce pass to his teammate, Danny. Danny leaped, releasing the ball just as the buzzer sounded. The gym erupted. The ball swirled through the net - a miracle shot that defied physics. Maplewood had won - the underdogs had toppled the giants. Tommy's teammates lifted him on their shoulders. The crowd chanted his name. But as he glanced at the banners, he knew the truth. It wasn't just his shot - it was the culmination of every bounce pass, every defensive stance, every pick-and-roll. His father hugged him - a rare display of emotion. 'You did it, Tommy,' he whispered. 'You made your mark.' And there, in the glow of victory, Tommy realized that sometimes the greatest miracles happen at center court - not in the spotlight, but in the quiet moments of practice, persistence, and heart." +} diff --git a/onnxruntime/python/tools/transformers/models/llama/quant_kv_dataloader.py b/onnxruntime/python/tools/transformers/models/llama/quant_kv_dataloader.py index e8b563261001b..33084aec214c2 100644 --- a/onnxruntime/python/tools/transformers/models/llama/quant_kv_dataloader.py +++ b/onnxruntime/python/tools/transformers/models/llama/quant_kv_dataloader.py @@ -1,3 +1,8 @@ +# ------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. See License.txt in the project root for +# license information. 
+# -------------------------------------------------------------------------- import argparse import numpy as np diff --git a/onnxruntime/python/torch_cpp_extensions/aten_op_executor/aten_op_executor.cc b/onnxruntime/python/torch_cpp_extensions/aten_op_executor/aten_op_executor.cc index e8be98cbfc0e4..4148e63d58619 100644 --- a/onnxruntime/python/torch_cpp_extensions/aten_op_executor/aten_op_executor.cc +++ b/onnxruntime/python/torch_cpp_extensions/aten_op_executor/aten_op_executor.cc @@ -57,7 +57,7 @@ struct ATenOperator { c10::IValue i_value; // Create the torch tensor from this DLPack no matter we need it or not below, // so that the dlpack's deleter will be triggered when torch tensor is out of scope. - at::Tensor tensor = at::fromDLPack(dlpack); + at::Tensor tensor = at::fromDLPack(const_cast<DLManagedTensor*>(dlpack)); switch (elem_kinds[index]) { case c10::TypeKind::TensorType: { i_value = is_optional ? c10::IValue(c10::optional<at::Tensor>(tensor)) : c10::IValue(tensor); diff --git a/onnxruntime/test/common/cuda_op_test_utils.cc b/onnxruntime/test/common/cuda_op_test_utils.cc new file mode 100644 index 0000000000000..bab4e9a60e2ed --- /dev/null +++ b/onnxruntime/test/common/cuda_op_test_utils.cc @@ -0,0 +1,36 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#ifdef USE_CUDA +#include "cuda_runtime_api.h" +#endif + +namespace onnxruntime { +namespace test { + +int GetCudaArchitecture() { + // This will cache the result so we only call cudaGetDeviceProperties once. + // Usually, we test on a single GPU or multiple GPUs of same architecture, so it's fine to cache the result. + static int cuda_arch = -1; + +#ifdef USE_CUDA + if (cuda_arch == -1) { + int current_device_id = 0; + cudaGetDevice(&current_device_id); + // must wait GPU idle, otherwise cudaGetDeviceProperties might fail + cudaDeviceSynchronize(); + cudaDeviceProp prop; + + // When cudaGetDeviceProperties fails, just return -1 and no error is raised. + // If cuda device has issue, test will fail anyway so no need to raise error here. + if (cudaSuccess == cudaGetDeviceProperties(&prop, current_device_id)) { + cuda_arch = prop.major * 100 + prop.minor * 10; + } + } +#endif + + return cuda_arch; +} + +} // namespace test +} // namespace onnxruntime diff --git a/onnxruntime/test/common/cuda_op_test_utils.h b/onnxruntime/test/common/cuda_op_test_utils.h index 043e3059c38d7..6f3e460628566 100644 --- a/onnxruntime/test/common/cuda_op_test_utils.h +++ b/onnxruntime/test/common/cuda_op_test_utils.h @@ -4,37 +4,20 @@ #pragma once #include "test/util/include/default_providers.h" -#ifdef USE_CUDA -#include "cuda_runtime_api.h" -#endif namespace onnxruntime { namespace test { +// CUDA architecture of the current device like 100 * major + 10 * minor. +// Please call this function after CUDA EP is enabled.
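A minimal sketch of how the cached architecture helper is intended to be consumed by a test. The test name and the 800 (SM80/A100) threshold are made up for illustration; only GetCudaArchitecture() and HasCudaEnvironment() come from the code above.

#include "gtest/gtest.h"
#include "test/common/cuda_op_test_utils.h"

namespace onnxruntime {
namespace test {

TEST(ExampleCudaTest, RequiresSm80) {
  // HasCudaEnvironment(0) only verifies that the CUDA EP is present; a non-zero
  // minimum is additionally compared against the cached GetCudaArchitecture() value.
  if (!HasCudaEnvironment(800)) {
    GTEST_SKIP() << "Requires CUDA EP on compute capability 8.0 or newer";
  }
  // ... architecture-specific checks would go here ...
}

}  // namespace test
}  // namespace onnxruntime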
+int GetCudaArchitecture(); + inline bool HasCudaEnvironment(int min_cuda_architecture) { if (DefaultCudaExecutionProvider().get() == nullptr) { return false; } - if (min_cuda_architecture == 0) { - return true; - } - - int cuda_architecture = 0; - -#ifdef USE_CUDA - int currentCudaDevice = 0; - cudaGetDevice(¤tCudaDevice); - cudaDeviceSynchronize(); - cudaDeviceProp prop; - if (cudaSuccess != cudaGetDeviceProperties(&prop, currentCudaDevice)) { - return false; - } - - cuda_architecture = prop.major * 100 + prop.minor * 10; -#endif - - return cuda_architecture >= min_cuda_architecture; + return GetCudaArchitecture() >= min_cuda_architecture; } inline bool NeedSkipIfCudaArchLowerThan(int min_cuda_architecture) { diff --git a/onnxruntime/test/common/trt_op_test_utils.h b/onnxruntime/test/common/trt_op_test_utils.h new file mode 100644 index 0000000000000..a0b0b9bb1931f --- /dev/null +++ b/onnxruntime/test/common/trt_op_test_utils.h @@ -0,0 +1,33 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#pragma once + +#include "test/common/cuda_op_test_utils.h" + +namespace onnxruntime { +namespace test { + +// TensorRT EP Segmentation fault on A100: https://github.com/microsoft/onnxruntime/issues/19530 +inline const std::unordered_set ExcludeTrtOnA100() { + // Note: GetCudaArchitecture need USE_CUDA to be defined. Currently, it is defined when TRT EP is enabled. + // If we want to make TRT EP independent of CUDA EP, we need to change the implementation of GetCudaArchitecture. + if (DefaultTensorrtExecutionProvider() != nullptr && GetCudaArchitecture() == 800) { + return {kTensorrtExecutionProvider}; + } + + return {}; +} + +// Add TensorRT EP to an excluded provider list when running on A100 +inline const std::unordered_set& ExcludeTrtOnA100(std::unordered_set& excluded_providers) { + if (DefaultTensorrtExecutionProvider() != nullptr && GetCudaArchitecture() == 800) { + excluded_providers.insert(kTensorrtExecutionProvider); + return excluded_providers; + } + + return excluded_providers; +} + +} // namespace test +} // namespace onnxruntime diff --git a/onnxruntime/test/contrib_ops/activation_op_test.cc b/onnxruntime/test/contrib_ops/activation_op_test.cc index 2a56991ec5af4..061fffa572be2 100644 --- a/onnxruntime/test/contrib_ops/activation_op_test.cc +++ b/onnxruntime/test/contrib_ops/activation_op_test.cc @@ -50,11 +50,15 @@ TEST_F(ActivationOpTest, ParametricSoftplus) { {{"alpha", alpha}, {"beta", beta}}, {}, false); // Disable TensorRT due to result mismatch } +// [TODO] Temporarily ignore this test for OpenVINO +// Fails due to accuracy mismatch +#if !defined(USE_OPENVINO) TEST_F(ActivationOpTest, Gelu) { TestActivationOp( "Gelu", input_values, [](float x) { return x * 0.5f * (1.0f + std::erf(x * static_cast(M_SQRT1_2))); }, {}, {}, false, 1, kMSDomain); } +#endif #if defined(USE_DNNL) std::vector expected_output_bfloat16(const std::vector& input_data) { diff --git a/onnxruntime/test/contrib_ops/attention_op_test.cc b/onnxruntime/test/contrib_ops/attention_op_test.cc index b652e0723f5aa..a8e2fccdd0462 100644 --- a/onnxruntime/test/contrib_ops/attention_op_test.cc +++ b/onnxruntime/test/contrib_ops/attention_op_test.cc @@ -227,6 +227,12 @@ static void RunAttentionTest( tester.AddOptionalInputEdge(); } + if (use_float16) { + tester.SetOutputTolerance(0.005f); + } else { + tester.SetOutputTolerance(0.001f, 0.001f); + } + if (enable_cuda) { std::vector> execution_providers; execution_providers.push_back(DefaultCudaExecutionProvider()); @@ 
-254,6 +260,9 @@ static void RunAttentionTest( if (enable_dml) { std::vector> execution_providers; execution_providers.push_back(DefaultDmlExecutionProvider()); + if (use_float16) { + tester.SetOutputTolerance(0.02f); + } tester.Run(OpTester::ExpectResult::kExpectSuccess, "", {}, nullptr, &execution_providers); } } @@ -2013,13 +2022,6 @@ TEST(AttentionTest, AttentionMaskIndexOutOfRange) { #if !defined(__wasm__) // TODO: fix in web assembly TEST(AttentionTest, AttentionPastState_dynamic) { - // ORT enables TF32 in GEMM for A100. TF32 will cause precsion loss and fail this test. - // Do not run this test unless TF32 is disabled explicitly. - if (HasCudaEnvironment(800) && ParseEnvironmentVariableWithDefault("NVIDIA_TF32_OVERRIDE", 1) != 0) { - GTEST_SKIP() << "Skipping AttentionPastState_dynamic in A100 since TF32 is enabled"; - return; - } - // create rand inputs RandomValueGenerator random{}; @@ -2101,13 +2103,6 @@ static void RunModelWithRandomInput( std::vector& mask_index_data, std::string& onnx_model, bool is_float16) { - // ORT enables TF32 in GEMM for A100. TF32 will cause precsion loss and fail this test. - // Do not run this test unless TF32 is disabled explicitly. - if (HasCudaEnvironment(800) && ParseEnvironmentVariableWithDefault("NVIDIA_TF32_OVERRIDE", 1) != 0) { - GTEST_SKIP() << "Skipping RunModelWithRandomInput in A100 since TF32 is enabled"; - return; - } - RandomValueGenerator random{234}; constexpr int hidden_size = 768; diff --git a/onnxruntime/test/contrib_ops/beam_search_test.cc b/onnxruntime/test/contrib_ops/beam_search_test.cc index 156ed3799fc22..6ce9f5de68f11 100644 --- a/onnxruntime/test/contrib_ops/beam_search_test.cc +++ b/onnxruntime/test/contrib_ops/beam_search_test.cc @@ -8,6 +8,10 @@ #include "core/session/onnxruntime_cxx_api.h" #include "test/common/cuda_op_test_utils.h" +#ifdef USE_CUDA +#include "core/providers/cuda/cuda_provider_options.h" +#endif + extern std::unique_ptr ort_env; namespace onnxruntime { @@ -70,7 +74,9 @@ TEST(BeamSearchTest, GptBeamSearchFp32) { Ort::SessionOptions session_options; #ifdef USE_CUDA - Ort::ThrowOnError(OrtSessionOptionsAppendExecutionProvider_CUDA(session_options, 0)); + OrtCUDAProviderOptionsV2 cuda_options; + cuda_options.use_tf32 = false; + session_options.AppendExecutionProvider_CUDA_V2(cuda_options); #endif #ifdef USE_ROCM @@ -161,7 +167,9 @@ TEST(BeamSearchTest, GptBeamSearchFp16) { if (enable_cuda || enable_rocm) { Ort::SessionOptions session_options; #ifdef USE_CUDA - Ort::ThrowOnError(OrtSessionOptionsAppendExecutionProvider_CUDA(session_options, 0)); + OrtCUDAProviderOptionsV2 cuda_options; + cuda_options.use_tf32 = false; + session_options.AppendExecutionProvider_CUDA_V2(cuda_options); #endif #ifdef USE_ROCM @@ -254,7 +262,9 @@ TEST(BeamSearchTest, GptBeamSearchWithInitDecoderFp16) { if (enable_cuda || enable_rocm) { Ort::SessionOptions session_options; #ifdef USE_CUDA - Ort::ThrowOnError(OrtSessionOptionsAppendExecutionProvider_CUDA(session_options, 0)); + OrtCUDAProviderOptionsV2 cuda_options; + cuda_options.use_tf32 = false; + session_options.AppendExecutionProvider_CUDA_V2(cuda_options); #endif #ifdef USE_ROCM @@ -346,7 +356,9 @@ TEST(BeamSearchTest, GptBeamSearchFp16_VocabPadded) { if (enable_cuda || enable_rocm) { Ort::SessionOptions session_options; #ifdef USE_CUDA - Ort::ThrowOnError(OrtSessionOptionsAppendExecutionProvider_CUDA(session_options, 0)); + OrtCUDAProviderOptionsV2 cuda_options; + cuda_options.use_tf32 = false; + session_options.AppendExecutionProvider_CUDA_V2(cuda_options); #endif #ifdef 
USE_ROCM diff --git a/onnxruntime/test/contrib_ops/decoder_attention_op_test.cc b/onnxruntime/test/contrib_ops/decoder_attention_op_test.cc index 88a2bdf6a4849..8a37ef921fd2b 100644 --- a/onnxruntime/test/contrib_ops/decoder_attention_op_test.cc +++ b/onnxruntime/test/contrib_ops/decoder_attention_op_test.cc @@ -31,10 +31,8 @@ static void RunAttentionTest( const std::vector* new_value_cache = nullptr, const std::vector* key_cache = nullptr, const std::vector* value_cache = nullptr, - const std::initializer_list* key_padding_mask_data = nullptr, - bool use_float16 = false) { - int min_cuda_architecture = use_float16 ? 530 : 0; - bool enable_cuda = HasCudaEnvironment(min_cuda_architecture); + const std::initializer_list* key_padding_mask_data = nullptr) { + bool enable_cuda = HasCudaEnvironment(0); bool enable_rocm = (nullptr != DefaultRocmExecutionProvider().get()); bool enable_cpu = false; @@ -99,6 +97,7 @@ static void RunAttentionTest( tester.AddOutput("new_key_cache", output_cache_dims, *new_key_cache); tester.AddOutput("new_value_cache", output_cache_dims, *new_value_cache); } + tester.SetOutputTolerance(0.001f, 0.001f); std::vector> execution_providers; if (enable_cuda) { diff --git a/onnxruntime/test/contrib_ops/decoder_masked_multihead_attention_op_test.cc b/onnxruntime/test/contrib_ops/decoder_masked_multihead_attention_op_test.cc index acaae2dcd9712..17c9e8592f64e 100644 --- a/onnxruntime/test/contrib_ops/decoder_masked_multihead_attention_op_test.cc +++ b/onnxruntime/test/contrib_ops/decoder_masked_multihead_attention_op_test.cc @@ -754,9 +754,10 @@ TEST(DecoderMaskedSelfAttentionTest, Test_fp32) { // Output(s) tester.AddOutput("output", input_dims, output); - tester.AddOutput("present", past_dims, present); + tester.SetOutputTolerance(0.001f, 0.001f); + // Run - Regular kernel execution path { std::vector> execution_providers; @@ -897,9 +898,10 @@ TEST(DecoderMaskedSelfAttentionTest, Test_fp16) { // Output(s) tester.AddOutput("output", input_dims, output); - tester.AddOutput("present", past_dims, present); + tester.SetOutputTolerance(0.005f); + // Run - Regular kernel execution path { std::vector> execution_providers; diff --git a/onnxruntime/test/contrib_ops/fft_op_test.cc b/onnxruntime/test/contrib_ops/fft_op_test.cc index 56a6466c760f6..7a6b6cca6425a 100644 --- a/onnxruntime/test/contrib_ops/fft_op_test.cc +++ b/onnxruntime/test/contrib_ops/fft_op_test.cc @@ -25,6 +25,7 @@ TEST(ContribOpTest, Rfft) { // Target values conputed using PyTorch torch.fft.rfft(X, dim=-1, norm="backward") test.AddInput("X", {4, 4}, {0.8129f, 1.3108f, -0.8790f, -1.2046f, 0.1661f, -0.9831f, 0.5879f, 0.4918f, 1.2506f, 0.7244f, -2.6260f, -1.1268f, -1.6885f, 1.0439f, -0.2595f, 1.8780f}); test.AddOutput("Y", {4, 3, 2}, {0.0400f, 0.0000f, 1.6919f, -2.5154f, -0.1722f, 0.0000f, 0.2627f, 0.0000f, -0.4218f, 1.4748f, 1.2454f, 0.0000f, -1.7779f, 0.0000f, 3.8766f, -1.8512f, -0.9730f, 0.0000f, 0.9740f, 0.0000f, -1.4290f, 0.8341f, -4.8699f, 0.0000f}); + test.SetOutputTolerance(0.0001f); test.Run(OpTester::ExpectResult::kExpectSuccess, "", {}, nullptr, &execution_providers); } @@ -45,6 +46,7 @@ TEST(ContribOpTest, Irfft) { test.AddAttribute("normalized", static_cast(0)); test.AddInput("X", {4, 3, 2}, {0.0400f, 0.0000f, 1.6919f, -2.5154f, -0.1722f, 0.0000f, 0.2627f, 0.0000f, -0.4218f, 1.4748f, 1.2454f, 0.0000f, -1.7779f, 0.0000f, 3.8766f, -1.8512f, -0.9730f, 0.0000f, 0.9740f, 0.0000f, -1.4290f, 0.8341f, -4.8699f, 0.0000f}); test.AddOutput("Y", {4, 4}, {0.8129f, 1.3108f, -0.8790f, -1.2046f, 0.1661f, -0.9831f, 0.5879f, 
0.4918f, 1.2506f, 0.7244f, -2.6260f, -1.1268f, -1.6885f, 1.0439f, -0.2595f, 1.8780f}); + test.SetOutputTolerance(0.0001f); test.Run(OpTester::ExpectResult::kExpectSuccess, "", {}, nullptr, &execution_providers); } } // namespace test diff --git a/onnxruntime/test/contrib_ops/gemm_fastgelu_op_test.cc b/onnxruntime/test/contrib_ops/gemm_fastgelu_op_test.cc index a24f3b6b441e1..d9d2681dd3b3f 100644 --- a/onnxruntime/test/contrib_ops/gemm_fastgelu_op_test.cc +++ b/onnxruntime/test/contrib_ops/gemm_fastgelu_op_test.cc @@ -50,6 +50,8 @@ static void RunGemmFastGeluGpuTest(const std::vector& input_data, const s tester.AddOutput("Y", output_dims, output_data); } + tester.SetOutputTolerance(use_float16 ? 0.005f : 0.0025f); + tester.Config(run_with_tunable_op) .RunWithConfig(); } @@ -154,7 +156,7 @@ TEST(GemmFastGeluTest, GemmFastGeluWithoutBiasFloat16) { RunGemmFastGeluGpuTest(input_data, weight_data, bias_data, output_data, input_dims, weight_dims, bias_dims, output_dims, - false); + false, true); } TEST(GemmFastGeluTest, GemmFastGeluWithBiasFloat16) { @@ -189,7 +191,7 @@ TEST(GemmFastGeluTest, GemmFastGeluWithBiasFloat16) { RunGemmFastGeluGpuTest(input_data, weight_data, bias_data, output_data, input_dims, weight_dims, bias_dims, output_dims, - true); + true, true); } TEST(GemmFastGeluTest, GemmFastGeluWithBias_bfloat16) { diff --git a/onnxruntime/test/contrib_ops/greedy_search_test.cc b/onnxruntime/test/contrib_ops/greedy_search_test.cc index 1baf50c1ba616..73da82d4bb039 100644 --- a/onnxruntime/test/contrib_ops/greedy_search_test.cc +++ b/onnxruntime/test/contrib_ops/greedy_search_test.cc @@ -8,6 +8,10 @@ #include "core/session/onnxruntime_cxx_api.h" #include "test/common/cuda_op_test_utils.h" +#ifdef USE_CUDA +#include "core/providers/cuda/cuda_provider_options.h" +#endif + extern std::unique_ptr ort_env; namespace onnxruntime { @@ -64,9 +68,13 @@ TEST(GreedySearchTest, GptGreedySearchFp16_VocabPadded) { if (is_cuda || is_rocm) { Ort::SessionOptions session_options; +#ifdef USE_CUDA if (is_cuda) { - Ort::ThrowOnError(OrtSessionOptionsAppendExecutionProvider_CUDA(session_options, 0)); + OrtCUDAProviderOptionsV2 cuda_options; + cuda_options.use_tf32 = false; + session_options.AppendExecutionProvider_CUDA_V2(cuda_options); } +#endif if (is_rocm) { Ort::ThrowOnError(OrtSessionOptionsAppendExecutionProvider_ROCM(session_options, 0)); } @@ -145,9 +153,13 @@ TEST(GreedySearchTest, GptGreedySearchFp32) { if (is_cuda || is_rocm) { Ort::SessionOptions session_options; +#ifdef USE_CUDA if (is_cuda) { - Ort::ThrowOnError(OrtSessionOptionsAppendExecutionProvider_CUDA(session_options, 0)); + OrtCUDAProviderOptionsV2 cuda_options; + cuda_options.use_tf32 = false; + session_options.AppendExecutionProvider_CUDA_V2(cuda_options); } +#endif if (is_rocm) { Ort::ThrowOnError(OrtSessionOptionsAppendExecutionProvider_ROCM(session_options, 0)); } diff --git a/onnxruntime/test/contrib_ops/gridsample_test.cc b/onnxruntime/test/contrib_ops/gridsample_test.cc index 46ed04301a9e8..d970178e29ab8 100644 --- a/onnxruntime/test/contrib_ops/gridsample_test.cc +++ b/onnxruntime/test/contrib_ops/gridsample_test.cc @@ -126,6 +126,7 @@ TEST(GridsampleContribOpTest, gridsample_mode_bicubic) { 0.5000f, 0.5000f, 1.0000f, 1.0000f}); test.AddAttribute("mode", "bicubic"); test.AddOutput("Y", {1, 1, 2, 4}, {-0.1406f, 0.3828f, 1.7556f, 2.9688f, 2.9688f, 1.7556f, 5.1445f, 1.3906f}); + test.SetOutputTolerance(0.0001f); test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kCudaNHWCExecutionProvider}); } diff --git 
a/onnxruntime/test/contrib_ops/layer_norm_op_test.cc b/onnxruntime/test/contrib_ops/layer_norm_op_test.cc index 98fb62e435f31..655c4951f262d 100644 --- a/onnxruntime/test/contrib_ops/layer_norm_op_test.cc +++ b/onnxruntime/test/contrib_ops/layer_norm_op_test.cc @@ -160,6 +160,7 @@ TEST(LayerNormTest, LayerNorm_Scale_Bias) { test.AddInput("gamma", {2}, {-0.6953f, 5.1824f}); test.AddInput("bias", {2}, {0.6435f, -0.3964f}); test.AddOutput("output", dims, {-0.0516f, -5.5776f, -0.0518f, -5.5788f, -0.0518f, -5.5788f}); + test.SetOutputTolerance(0.0001f); test.Run(); } @@ -172,6 +173,8 @@ TEST(LayerNormTest, LayerNorm_Scale_Bias_Float16Input) { test.AddInput("gamma", {2}, {-0.6953f, 5.1824f}); test.AddInput("bias", {2}, {0.6435f, -0.3964f}); test.AddOutput("output", dims, {-0.0516f, -5.5776f, -0.0518f, -5.5788f, -0.0518f, -5.5788f}); + test.SetOutputTolerance(0.0001f); + // TRT, DNNL, OpenVINO and NNAPI, CoreML don't support this combination of datatypes test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider, kDnnlExecutionProvider, kQnnExecutionProvider, @@ -228,6 +231,9 @@ TEST(LayerNormTest, LayerNorm17_double) { test.AddInput("x", dims, {1.0, 2.0, 3.0, 4.0, 5.0, 6.0}); test.AddInput("gamma", {3}, {1.0, 1.0, 1.0}); test.AddOutput("output", dims, {-1.2247, 0.0, 1.2247, -1.2247, 0.0, 1.2247}); + + test.SetOutputTolerance(0.0001f); + // DNNL does not support double test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kDnnlExecutionProvider}); } diff --git a/onnxruntime/test/contrib_ops/matmul_integer_to_float_test.cc b/onnxruntime/test/contrib_ops/matmul_integer_to_float_test.cc index 72a5ba4dcefbf..8d7629b5fda1c 100644 --- a/onnxruntime/test/contrib_ops/matmul_integer_to_float_test.cc +++ b/onnxruntime/test/contrib_ops/matmul_integer_to_float_test.cc @@ -127,7 +127,7 @@ void TestMatMulIntegerToFloat(bool is_matrix_b_constant, if (std::is_same_v) { test.AddOutput("Y", {M, N}, Y_data); - test.SetOutputAbsErr("Y", 0.0001f); + test.SetOutputAbsErr("Y", 0.001f); test.SetOutputRelErr("Y", 0.02f); } else { test.AddOutput("Y", {M, N}, ToFloat16(Y_data)); diff --git a/onnxruntime/test/contrib_ops/moe_test.cc b/onnxruntime/test/contrib_ops/moe_test.cc index ebb0261deefa5..263ace25ddfe0 100644 --- a/onnxruntime/test/contrib_ops/moe_test.cc +++ b/onnxruntime/test/contrib_ops/moe_test.cc @@ -14,6 +14,7 @@ static void RunMoETest( const std::vector& router_probs, const std::vector& fc1_experts_weights, const std::vector& fc2_experts_weights, + const std::vector& fc3_experts_weights, const std::vector& fc1_experts_bias, const std::vector& fc2_experts_bias, const std::vector& output_data, @@ -22,19 +23,23 @@ static void RunMoETest( int hidden_size, int inter_size, std::string activation_type, + int normalize_routing_weights = 0, + int top_k = 1, bool use_float16 = false) { int min_cuda_architecture = use_float16 ? 
530 : 0; bool enable_cuda = HasCudaEnvironment(min_cuda_architecture); if (enable_cuda) { OpTester tester("MoE", 1, onnxruntime::kMSDomain); - tester.AddAttribute("k", static_cast(1)); + tester.AddAttribute("k", static_cast(top_k)); tester.AddAttribute("activation_type", activation_type); + tester.AddAttribute("normalize_routing_weights", static_cast(normalize_routing_weights)); std::vector input_dims = {num_rows, hidden_size}; std::vector router_probs_dims = {num_rows, num_experts}; std::vector fc1_experts_weights_dims = {num_experts, hidden_size, inter_size}; std::vector fc2_experts_weights_dims = {num_experts, inter_size, hidden_size}; + std::vector fc3_experts_weights_dims = fc1_experts_weights_dims; std::vector fc1_experts_bias_dims = {num_experts, inter_size}; std::vector fc2_experts_bias_dims = {num_experts, hidden_size}; std::vector output_dims = {num_rows, hidden_size}; @@ -43,18 +48,42 @@ static void RunMoETest( tester.AddInput("input", input_dims, ToFloat16(input)); tester.AddInput("router_probs", router_probs_dims, ToFloat16(router_probs)); tester.AddInput("fc1_experts_weights", fc1_experts_weights_dims, ToFloat16(fc1_experts_weights)); + if (!fc1_experts_bias.empty()) { + tester.AddInput("fc1_experts_bias", fc1_experts_bias_dims, ToFloat16(fc1_experts_bias)); + } else { + tester.AddOptionalInputEdge(); + } tester.AddInput("fc2_experts_weights", fc2_experts_weights_dims, ToFloat16(fc2_experts_weights)); - tester.AddInput("fc1_experts_bias", fc1_experts_bias_dims, ToFloat16(fc1_experts_bias)); - tester.AddInput("fc2_experts_bias", fc2_experts_bias_dims, ToFloat16(fc2_experts_bias)); + if (!fc2_experts_bias.empty()) { + tester.AddInput("fc2_experts_bias", fc2_experts_bias_dims, ToFloat16(fc2_experts_bias)); + } else { + tester.AddOptionalInputEdge(); + } + if (!fc3_experts_weights.empty()) { + tester.AddInput("fc3_experts_weights", fc3_experts_weights_dims, ToFloat16(fc3_experts_weights)); + } tester.AddOutput("output", output_dims, ToFloat16(output_data)); + tester.SetOutputTolerance(0.005f); } else { tester.AddInput("input", input_dims, input); tester.AddInput("router_probs", router_probs_dims, router_probs); tester.AddInput("fc1_experts_weights", fc1_experts_weights_dims, fc1_experts_weights); + if (!fc1_experts_bias.empty()) { + tester.AddInput("fc1_experts_bias", fc1_experts_bias_dims, fc1_experts_bias); + } else { + tester.AddOptionalInputEdge(); + } tester.AddInput("fc2_experts_weights", fc2_experts_weights_dims, fc2_experts_weights); - tester.AddInput("fc1_experts_bias", fc1_experts_bias_dims, fc1_experts_bias); - tester.AddInput("fc2_experts_bias", fc2_experts_bias_dims, fc2_experts_bias); + if (!fc2_experts_bias.empty()) { + tester.AddInput("fc2_experts_bias", fc2_experts_bias_dims, fc2_experts_bias); + } else { + tester.AddOptionalInputEdge(); + } + if (!fc3_experts_weights.empty()) { + tester.AddInput("fc3_experts_weights", fc3_experts_weights_dims, fc3_experts_weights); + } tester.AddOutput("output", output_dims, output_data); + tester.SetOutputTolerance(0.001f); } std::vector> execution_providers; @@ -231,6 +260,7 @@ TEST(MoETest, MoETest_Gelu) { router_probs, fc1_experts_weights, fc2_experts_weights, + {}, fc1_experts_bias, fc2_experts_bias, output, @@ -409,6 +439,7 @@ TEST(MoETest, MoETest_Relu) { router_probs, fc1_experts_weights, fc2_experts_weights, + {}, fc1_experts_bias, fc2_experts_bias, output, @@ -419,5 +450,143 @@ TEST(MoETest, MoETest_Relu) { "relu"); } +TEST(MoETest, MoETest_Mixtral) { + int num_rows = 6; + int num_experts = 8; + int hidden_size = 4; + 
int inter_size = 8; + + const std::vector input = { + 0.9212995f, 0.5282444f, -0.008228387f, -1.449332f, -0.6051824f, -0.17924511f, 0.1995587f, -1.2461947f, + 0.86708033f, 0.19191018f, 1.1600108f, -0.008815222f, 0.8504777f, -0.84964496f, -1.4019964f, 0.17225051f, + 0.35569248f, 1.2056456f, 1.3690308f, -0.69495815f, 1.4324434f, 0.22761835f, -1.1286871f, 1.124213f}; + const std::vector router_probs = { + -0.09331456f, -0.47121337f, 0.07311103f, 0.47643483f, 0.21135253f, -0.72226393f, -0.048502743f, 0.39447474f, + -0.9014899f, -0.36629856f, -0.23088816f, -0.099606544f, -0.45191774f, -0.30394578f, 0.6266495f, 0.67937183f, + 0.27117345f, -0.36059442f, 0.81510246f, 0.61359257f, 0.07649982f, -0.44949868f, -0.54758865f, 0.4736983f, + 0.21584567f, 0.21296778f, 0.093342215f, -0.09353682f, 0.61422515f, 0.19574627f, 0.0063361377f, -0.2465148f, + 0.15675665f, -0.4546509f, 0.24447554f, 0.5921611f, -0.18192923f, -0.66116416f, -0.40265432f, 0.33475468f, + 1.2906091f, 0.4709078f, 0.16256471f, 0.19308007f, 0.97568524f, 0.25876164f, -0.7964541f, -1.0319631f}; + const std::vector fc1_experts_weights = { + 0.3860137f, 0.077925384f, 0.13434184f, 0.28902978f, 0.25391752f, -0.38351142f, 0.15813059f, 0.031481862f, + 0.083209574f, 0.4039817f, -0.13558972f, -0.21858627f, -0.30475253f, 0.41026944f, -0.008697987f, -0.3412701f, + -0.16235226f, 0.054659843f, 0.21042877f, 0.28863233f, -0.49495423f, 0.14401567f, 0.39130414f, 0.154176f, + 0.30897498f, -0.15768659f, 0.44641107f, 0.089463115f, -0.19318026f, 0.20710677f, -0.3552568f, -0.17219114f, + 0.41923493f, -0.4233985f, -0.41503525f, 0.19466156f, -0.08633667f, 0.45547962f, -0.054792404f, 0.26722562f, + -0.09923202f, 0.3460176f, -0.49708033f, -0.41033173f, 0.10443485f, -0.39646107f, -0.37424505f, 0.1757198f, + 0.43019837f, -0.13757241f, 0.14305532f, 0.37121457f, 0.2581259f, 0.12583363f, 0.45542932f, 0.16247797f, + 0.15579104f, -0.19166303f, -0.109221935f, -0.36702687f, 0.40365517f, -0.21506298f, -0.36697525f, -0.2703231f, + -0.49740213f, -0.3486371f, 0.24005288f, -0.0048963428f, 0.20468098f, -0.09111178f, -0.1485982f, -0.088219464f, + 0.33463532f, -0.49346995f, 0.42075223f, -0.38025302f, -0.245484f, -0.35191745f, 0.3086716f, -0.2423737f, + 0.37881732f, -0.40608948f, 0.26193494f, -0.4283861f, -0.10062629f, -0.32670784f, -0.16040438f, -0.15297079f, + 0.1822241f, 0.37285012f, 0.12654608f, -0.46767431f, -0.28775263f, 0.16585541f, -0.36678362f, -0.4759978f, + -0.34751755f, -0.3163945f, -0.3858195f, -0.38030273f, -0.06156373f, -0.04352224f, -0.4041785f, -0.335764f, + -0.10303855f, -0.4009425f, -0.1236487f, -0.40111196f, 0.23985302f, -0.118291676f, -0.26773083f, 0.121197104f, + 0.3702919f, -0.34168184f, 0.33743858f, 0.24873763f, -0.23140603f, -0.25351608f, 0.48291886f, 0.13780516f, + 0.25632292f, -0.49343884f, 0.08369112f, -0.37192065f, -0.05451995f, -0.44571918f, -0.24150735f, 0.27395487f, + -0.20423341f, -0.024149835f, 0.40208143f, -0.18211937f, -0.19767642f, -0.19397742f, -0.1510992f, 0.48074025f, + 0.18377024f, -0.18288034f, 0.08111167f, 0.12729281f, 0.27861303f, 0.0076527f, 0.36356348f, -0.24359548f, + -0.33313757f, -0.374829f, -0.08705664f, 0.23576546f, -0.39819986f, -0.09880793f, -0.012998581f, -0.36475456f, + -0.32685202f, 0.29657948f, -0.4631365f, -0.06320876f, 0.31600899f, 0.060619473f, 0.39029974f, 0.401151f, + 0.15562236f, 0.43565983f, -0.058149397f, 0.36150748f, 0.10750586f, -0.063970566f, -0.47026545f, -0.3035437f, + -0.38143605f, -0.4734699f, 0.31273925f, -0.43410504f, 0.07299572f, 0.47506f, 0.021913886f, -0.036100805f, + -0.31637233f, 0.37718338f, -0.046213806f, 
0.19239199f, 0.13676548f, 0.33592474f, -0.34048676f, -0.11097133f, + -0.41569126f, -0.01680845f, 0.31357706f, 0.0943895f, -0.24053341f, -0.018784225f, 0.40659577f, 0.08897692f, + 0.3793823f, -0.3271106f, 0.067666054f, -0.12331611f, -0.010209799f, -0.48908865f, 0.19195485f, -0.45211792f, + 0.48282713f, 0.4363466f, -0.40184838f, -0.025082052f, -0.31057972f, 0.14850605f, 0.39756012f, -0.25782883f, + 0.3181312f, 0.17685872f, -0.16694272f, -0.41516554f, -0.062004805f, -0.33060408f, -0.13665432f, -0.43781847f, + -0.298562f, 0.013283849f, 0.48130906f, -0.27970356f, 0.20347959f, -0.24402553f, -0.20528454f, -0.114435256f, + 0.12556863f, -0.4344011f, 0.2868948f, 0.19894183f, -0.12849897f, -0.18726158f, -0.4850099f, -0.4352169f, + -0.40527463f, 0.13625044f, -0.49707252f, -0.45698053f, 0.28196156f, 0.16826987f, -0.25944453f, 0.2801003f, + 0.21121234f, -0.04066527f, 0.45854944f, -0.17861038f, 0.18178529f, 0.17789757f, 0.34227383f, 0.26976448f, + 0.15789884f, 0.22840887f, 0.419321f, -0.14490443f, 0.39608955f, -0.4162954f, -0.47072983f, 0.41119635f}; + const std::vector fc2_experts_weights = { + 0.10833451f, 0.34020698f, -0.18258394f, -0.17842063f, -0.07365984f, -0.29177922f, -0.24102151f, 0.1077901f, + 0.2932343f, -0.35068116f, 0.1875877f, 0.07474385f, -0.20955177f, -0.27660736f, -0.14290786f, -0.09014153f, + -0.21085852f, -0.2378315f, 0.21457997f, 0.21074237f, -0.21087126f, 0.14320332f, -0.08389844f, 0.24034885f, + 0.31800103f, 0.12659892f, 0.20224877f, -0.2563875f, 0.11782206f, 0.29377612f, -0.27469966f, -0.18875091f, + 0.32136288f, 0.0788243f, -0.26413083f, 0.18453442f, 0.0776935f, -0.19561274f, 0.12608862f, 0.18579696f, + 0.045481127f, -0.17894714f, 0.27366453f, 0.13220324f, -0.3115706f, -0.016884197f, -0.3328494f, -0.062126897f, + 0.14841764f, 0.19741052f, 0.08211302f, -0.09362138f, -0.053040292f, -0.090344846f, 0.18264277f, 0.037823465f, + -0.16197139f, -0.20172869f, 0.064109616f, -0.062456656f, 0.30368346f, -0.12107184f, -0.12590908f, -0.10535928f, + 0.1978099f, 0.13119277f, 0.21948591f, -0.080250844f, -0.24614547f, 0.33202717f, 0.2645375f, -0.21193951f, + 0.17770219f, -0.04986229f, 0.33435768f, -0.0309231f, 0.16043694f, -0.0027341924f, -0.08339601f, -0.17402375f, + 0.2525901f, -0.0813988f, -0.2904943f, -0.14452116f, -0.27119386f, -0.2952116f, 0.0794895f, -0.11223866f, + 0.25427446f, 0.16967128f, 0.19531254f, -0.33598322f, -0.16714293f, -0.35097876f, -0.35189477f, 0.2900932f, + 0.26874313f, -0.1322388f, -0.330179f, 0.064027935f, 0.19688474f, -0.20129368f, 0.006225848f, 0.19252343f, + -0.35054854f, -0.31874785f, 0.32238203f, 0.29287276f, 0.03135616f, 0.015792634f, 0.20397249f, -0.3245995f, + 0.21416605f, 0.15667121f, -0.2058509f, 0.23639117f, -0.032677338f, 0.07826358f, -0.04589425f, -0.24935842f, + -0.20834164f, 0.069915086f, -0.26063374f, 0.13239416f, 0.33705652f, -0.26813045f, -0.17056243f, 0.29919288f, + 0.27704936f, -0.096224755f, 0.13250813f, 0.26709175f, -0.26995474f, 0.3261805f, -0.18062393f, -0.04732303f, + -0.02733084f, 0.050550338f, -0.2937818f, -0.19453493f, -0.34864828f, -0.20862648f, -0.19311349f, 0.17665526f, + -0.2894185f, -0.020016002f, 0.3409702f, -0.18320526f, 0.068286195f, 0.08490415f, 0.30223787f, -0.2386011f, + 0.09405743f, 0.123811804f, 0.31660154f, -0.11290163f, 0.07494662f, -0.24999082f, 0.2075398f, 0.07419645f, + 0.3327035f, -0.09647329f, 0.24138254f, -0.32546985f, 0.033594366f, 0.16555631f, 0.33516192f, -0.32619375f, + 0.20476541f, -0.07724f, 0.018923176f, -0.21126744f, 0.2744358f, -0.23979841f, -0.30413106f, -0.3485449f, + 0.2854276f, 0.14391156f, -0.24802732f, 
-0.21701548f, -0.122100174f, 0.054206114f, -0.21961808f, 0.13481297f, + -0.07907457f, 0.15763119f, -0.31156835f, 0.29488218f, 0.17039073f, 0.35125035f, -0.17721775f, -0.10516899f, + 0.072144486f, -0.038529005f, -0.058253434f, 0.13062657f, -0.3312356f, -0.15963489f, -0.20129326f, 0.014987925f, + 0.30869225f, 0.283981f, -0.057181682f, 0.15174268f, 0.22181617f, -0.19763571f, 0.28675067f, 0.0003976555f, + -0.34610963f, 0.2931936f, -0.26233214f, 0.19563977f, -0.16886877f, 0.022812065f, 0.080249704f, -0.2798801f, + 0.11531327f, 0.07107194f, -0.34746924f, -0.051920194f, -0.07264093f, 0.27581826f, 0.18536879f, 0.15684144f, + -0.26691115f, -0.22811417f, -0.1498502f, -0.176639f, -0.25876564f, -0.16051741f, -0.0048792143f, -0.08490091f, + 0.18136817f, 0.24729891f, 0.32358363f, -0.09566104f, 0.3074607f, -0.24191524f, -0.21220984f, -0.23039621f, + 0.21154472f, -0.19495378f, 0.002779711f, -0.34692943f, 0.055384878f, 0.25809082f, 0.16814983f, 0.19935164f, + 0.11652225f, 0.1115539f, -0.24407779f, 0.09392998f, 0.33556697f, 0.11422251f, 0.34336287f, -0.33113837f}; + const std::vector fc3_experts_weights = { + 0.45783097f, -0.2863351f, 0.011728346f, -0.43760604f, 0.15407985f, 0.07818556f, 0.0013856292f, -0.34319758f, + -0.16871625f, 0.12490183f, -0.34154075f, -0.31836903f, -0.46634215f, -0.43996066f, -0.1860516f, -0.2917009f, + -0.1772582f, -0.06599659f, -0.42419833f, 0.49980444f, -0.3283869f, -0.21543652f, -0.034647882f, -0.17114872f, + -0.4837973f, -0.362943f, -0.27533132f, 0.09443748f, -0.16642791f, -0.2993343f, -0.33881485f, -0.39464045f, + 0.31960344f, 0.007296145f, -0.45412838f, -0.024868786f, -0.16298121f, -0.44197202f, 0.07232875f, -0.32362783f, + 0.42969978f, -0.029854119f, -0.18451887f, -0.30145288f, 0.16885209f, -0.30068123f, -0.12948537f, 0.36494362f, + -0.049498677f, 0.12020564f, 0.42106473f, -0.30590254f, 0.31881082f, -0.078908324f, 0.20685762f, -0.22735089f, + -0.11194843f, 0.14011681f, 0.19477749f, -0.44788343f, 0.23084867f, 0.48367476f, -0.19044077f, -0.100233376f, + 0.4191656f, -0.4515314f, -0.3214385f, 0.016065598f, -0.4069137f, -0.17348295f, -0.43329984f, 0.33521235f, + -0.07843453f, -0.4865722f, -0.039011598f, -0.10605621f, 0.4192536f, 0.04063064f, 0.1984514f, 0.49294376f, + -0.056941032f, 0.18582922f, -0.16650558f, -0.17215621f, -0.20009357f, 0.46615022f, 0.47462142f, -0.0766145f, + -0.20405996f, -0.27452308f, -0.16176039f, -0.23940295f, 0.13248974f, 0.23036134f, 0.13154167f, 0.10377723f, + 0.0070211887f, 0.29162645f, 0.34465307f, -0.4058748f, -0.13989884f, -0.12305027f, -0.2541607f, 0.4767149f, + 0.4549045f, -0.108933926f, 0.2452516f, 0.054080307f, 0.33768386f, -0.45279485f, 0.1557768f, 0.17416143f, + -0.42602575f, -0.102350116f, 0.16022503f, 0.14813942f, 0.03982985f, -0.47012872f, -0.14555538f, 0.35645115f, + -0.1909796f, -0.20839584f, -0.28098184f, -0.23085594f, 0.022559166f, -0.23900753f, -0.19561106f, -0.24205637f, + 0.2573983f, -0.2947166f, 0.4568925f, 0.11514187f, 0.18671238f, -0.121082425f, 0.3909887f, -0.10985571f, + -0.19420451f, -0.3255307f, 0.4863913f, 0.007830441f, 0.4648854f, -0.24156213f, 0.22956276f, -0.09216207f, + -0.29428315f, 0.26062596f, 0.14955276f, -0.036366224f, -0.12957954f, 0.08501935f, -0.36796576f, 0.041123867f, + 0.06744653f, -0.0839923f, 0.17207885f, 0.006872058f, -0.21135789f, 0.3732242f, -0.2683524f, -0.45898575f, + -0.14543939f, 0.30806476f, 0.08574325f, 0.027492225f, -0.38164973f, -0.040038824f, -0.26947904f, -0.09740937f, + 0.26697665f, -0.43565083f, 0.1359719f, 0.12271714f, 0.0149876475f, -0.44011843f, 0.26128954f, -0.42487514f, + -0.24668545f, 
0.06113738f, -0.29119557f, 0.194273f, -0.24981815f, 0.3489496f, -0.47321397f, -0.31794417f, + -0.23641628f, 0.44169098f, -0.006898284f, 0.43446392f, -0.39553195f, 0.057907403f, -0.19339961f, -0.08160931f, + 0.4979084f, -0.11149913f, 0.35366338f, -0.16032219f, -0.48278677f, 0.08397317f, 0.4008311f, 0.30288273f, + 0.2546957f, -0.10675722f, 0.069722414f, 0.456497f, -0.19691509f, 0.49017924f, 0.41796166f, -0.2337895f, + -0.3635872f, -0.45445484f, -0.29122698f, -0.4339773f, 0.15762383f, 0.09782606f, -0.27986187f, -0.23860168f, + 0.38454843f, -0.07870716f, 0.15390605f, -0.15793777f, 0.48130733f, 0.288768f, 0.45969498f, -0.4193731f, + -0.3218134f, -0.29914904f, -0.3426242f, 0.06931591f, -0.2633695f, -0.25429398f, 0.25366426f, -0.27700734f, + 0.49418402f, -0.21919805f, 0.041192472f, -0.19817531f, -0.49578953f, 0.48185098f, -0.41920406f, -0.08335745f, + 0.19111753f, -0.07547706f, 0.049694f, 0.13012594f, 0.2617172f, -0.22612399f, 0.32247066f, -0.33702326f, + 0.20062232f, -0.09143996f, -0.063310504f, 0.1885702f, 0.11926836f, 0.3378734f, -0.45973647f, 0.48845494f}; + const std::vector output = { + 0.026516449f, 0.04061616f, 0.04403834f, -0.13644142f, 0.038774252f, 0.024002096f, -0.061423667f, 0.034824893f, + -0.022858473f, 0.04693405f, -0.0120724365f, -0.028846134f, -0.0168579f, -0.07958221f, 0.048179876f, 0.053492386f, + -0.026292695f, -0.009724421f, -0.026503641f, 0.031220898f, 0.04189077f, 0.11775493f, -0.037770163f, -0.0790936f}; + + RunMoETest(input, + router_probs, + fc1_experts_weights, + fc2_experts_weights, + fc3_experts_weights, + {}, + {}, + output, + num_rows, + num_experts, + hidden_size, + inter_size, + "silu", + 1, /*normalize_routing_weights*/ + 2 /*top_k*/); +} + } // namespace test } // namespace onnxruntime diff --git a/onnxruntime/test/contrib_ops/packed_attention_op_test.cc b/onnxruntime/test/contrib_ops/packed_attention_op_test.cc index 31ef62e69bb88..09baf8def05f6 100644 --- a/onnxruntime/test/contrib_ops/packed_attention_op_test.cc +++ b/onnxruntime/test/contrib_ops/packed_attention_op_test.cc @@ -433,8 +433,7 @@ static void RunModelWithRandomInput( std::vector token_offset_dims{batch_size, sequence_length}; std::vector cum_seq_len_dims{batch_size + 1}; - // TF32 in SM >= 80 is enabled by default, need larger threshold for float when TF32 is enabled. - float gpu_threshold = is_float16 ? 0.15f : (HasCudaEnvironment(800) ? 0.05f : 0.005f); + float gpu_threshold = is_float16 ? 0.15f : 0.005f; gpu_threshold *= sequence_length > 1024 ? 4.0f : 1.0f; // threshold should increase with sequence length bool enable_cuda = HasCudaEnvironment(is_float16 ? 
530 : 0); if (enable_cuda) { diff --git a/onnxruntime/test/contrib_ops/packed_multihead_attention_op_test.cc b/onnxruntime/test/contrib_ops/packed_multihead_attention_op_test.cc index 22253955566f2..5f811c8cf35f6 100644 --- a/onnxruntime/test/contrib_ops/packed_multihead_attention_op_test.cc +++ b/onnxruntime/test/contrib_ops/packed_multihead_attention_op_test.cc @@ -107,6 +107,7 @@ static void RunPackedMultiHeadAttentionTest( } tester.AddOutput("output", output_dims, ToFloat16(output_data)); + tester.SetOutputTolerance(0.005f); } else { if (is_packed_qkv) { tester.AddInput("query", packed_qkv_dims, query_data); @@ -131,6 +132,7 @@ static void RunPackedMultiHeadAttentionTest( } tester.AddOutput("output", output_dims, output_data); + tester.SetOutputTolerance(0.001f, 0.001f); } std::vector> execution_providers; diff --git a/onnxruntime/test/contrib_ops/quantize_attention_op_test.cc b/onnxruntime/test/contrib_ops/quantize_attention_op_test.cc index fd222583ac67f..54dd831fe2fc2 100644 --- a/onnxruntime/test/contrib_ops/quantize_attention_op_test.cc +++ b/onnxruntime/test/contrib_ops/quantize_attention_op_test.cc @@ -90,11 +90,13 @@ void RunQAttention(const std::vector& input_data, tester.AddInput("input_scale", {1}, ToFloat16({input_quant_params.scale})); tester.AddInput("weight_scale", {1}, ToFloat16({weight_quant_params.scale})); tester.AddOutput("output", output_dims, ToFloat16(output_data)); + tester.SetOutputTolerance(0.01f); } else { tester.AddInput("bias", bias_dims, bias_data); tester.AddInput("input_scale", {1}, {input_quant_params.scale}); tester.AddInput("weight_scale", {1}, {weight_quant_params.scale}); tester.AddOutput("output", output_dims, output_data); + tester.SetOutputTolerance(0.005f); } if (mask_index_data.size() > 0) { diff --git a/onnxruntime/test/contrib_ops/sampling_test.cc b/onnxruntime/test/contrib_ops/sampling_test.cc index 733bc9f01fd11..d987a1cae427d 100644 --- a/onnxruntime/test/contrib_ops/sampling_test.cc +++ b/onnxruntime/test/contrib_ops/sampling_test.cc @@ -8,6 +8,10 @@ #include "core/session/onnxruntime_cxx_api.h" #include "test/common/cuda_op_test_utils.h" +#ifdef USE_CUDA +#include "core/providers/cuda/cuda_provider_options.h" +#endif + extern std::unique_ptr ort_env; namespace onnxruntime { @@ -65,7 +69,10 @@ TEST(SamplingTest, Gpt2Sampling_GPU) { LOGS_DEFAULT(WARNING) << "Hardware NOT support current architecture"; return; } - Ort::ThrowOnError(OrtSessionOptionsAppendExecutionProvider_CUDA(session_options, 0)); + + OrtCUDAProviderOptionsV2 cuda_options; + cuda_options.use_tf32 = false; + session_options.AppendExecutionProvider_CUDA_V2(cuda_options); #else // USE_ROCM OrtROCMProviderOptions rocm_options; // TODO - verify the default settings diff --git a/onnxruntime/test/framework/inference_session_test.cc b/onnxruntime/test/framework/inference_session_test.cc index 97e5615c677d0..0c4a37bd48d4a 100644 --- a/onnxruntime/test/framework/inference_session_test.cc +++ b/onnxruntime/test/framework/inference_session_test.cc @@ -2944,6 +2944,11 @@ TEST(InferenceSessionTests, GlobalThreadPoolWithDenormalAsZero) { } // test inter thread pool with setting denormal as zero +#if !defined(__APPLE__) +// TODO (hasesh): Debug this test failure on MacOS 12 with XCode 14.2 +// It seemingly passes on MacOS 13 with XCode 15.x but we had to drop down to Mac OS 12 +// because at the time of writing this, Mac OS 13 images were making CI/Packaging pipelines +// very unstable. 
TEST(InferenceSessionTests, InterThreadPoolWithDenormalAsZero) { if constexpr (!SessionOptions::DEFAULT_USE_PER_SESSION_THREADS) { GTEST_SKIP() << "Skipping the test"; @@ -3001,6 +3006,7 @@ TEST(InferenceSessionTests, InterThreadPoolWithDenormalAsZero) { VerifyThreadPoolWithDenormalAsZero(session2.GetIntraOpThreadPoolToUse(), false); VerifyThreadPoolWithDenormalAsZero(session2.GetInterOpThreadPoolToUse(), false); } +#endif TEST(InferenceSessionTests, ModelWithAbsolutePathForExternalTensorData) { SessionOptions so; diff --git a/onnxruntime/test/framework/shape_inference_test.cc b/onnxruntime/test/framework/shape_inference_test.cc index bfabcd567803b..f5258760eb20d 100644 --- a/onnxruntime/test/framework/shape_inference_test.cc +++ b/onnxruntime/test/framework/shape_inference_test.cc @@ -5,13 +5,16 @@ #include #include "gtest/gtest.h" +#include "core/common/span_utils.h" #include "core/graph/model.h" +#include "core/session/onnxruntime_cxx_api.h" #include "test/framework/model_builder_utils.h" +#include "test/util/include/asserts.h" #include "test/util/include/test_utils.h" +#include "test/util/include/inference_session_wrapper.h" #include "test/test_environment.h" using namespace ONNX_NAMESPACE; -using namespace std; namespace onnxruntime { namespace test { @@ -22,7 +25,7 @@ class ShapeInferenceTest : public ::testing::Test { protected: onnxruntime::Model model_; int node_count_; - std::unordered_map> name_to_arg_; + std::unordered_map> name_to_arg_; public: ShapeInferenceTest() : model_("Test", false, DefaultLoggingManager().DefaultLogger()), node_count_(0) {} @@ -73,5 +76,91 @@ TEST_F(ShapeInferenceTest, BasicTest) { CheckShapeEquality(InputShape(node), OutputShape(node)); } +namespace { +struct MyCustomKernelWithOptionalInput { + MyCustomKernelWithOptionalInput(const OrtKernelInfo* /*info*/) { + } + + OrtStatusPtr ComputeV2(OrtKernelContext* /* context */) const { + return nullptr; + } +}; + +struct MyCustomOpWithOptionalInput : Ort::CustomOpBase { + explicit MyCustomOpWithOptionalInput(const char* provider) : provider_(provider) {} + + OrtStatusPtr CreateKernelV2(const OrtApi& /* api */, const OrtKernelInfo* info, void** kernel) const { + *kernel = new MyCustomKernelWithOptionalInput(info); + return nullptr; + }; + + const char* GetName() const { return "FooBar"; }; + const char* GetExecutionProviderType() const { return provider_; }; + + size_t GetInputTypeCount() const { return 3; }; + ONNXTensorElementDataType GetInputType(size_t /*index*/) const { return ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT; }; + OrtCustomOpInputOutputCharacteristic GetInputCharacteristic(size_t index) const { + // The second input (index == 1) is optional + if (index == 1) + return OrtCustomOpInputOutputCharacteristic::INPUT_OUTPUT_OPTIONAL; + + return OrtCustomOpInputOutputCharacteristic::INPUT_OUTPUT_REQUIRED; + } + + size_t GetOutputTypeCount() const { return 1; }; + ONNXTensorElementDataType GetOutputType(size_t /*index*/) const { return ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT; }; + OrtCustomOpInputOutputCharacteristic GetOutputCharacteristic(size_t /*index*/) const { + return OrtCustomOpInputOutputCharacteristic::INPUT_OUTPUT_REQUIRED; + } + + private: + const char* provider_; +}; + +const ORTCHAR_T* const OPTIONAL_INPUT_CUSTOM_OP_MODEL_URI_2 = ORT_TSTR("testdata/foo_bar_2.onnx"); + +} // namespace + +// CustomOps Output type inference function quits if it +// encounters the an output that is optional and absent. +// It quits without any errors or logging. 
We want to make sure +// that inference proceeds for all of the outputs when absent optional inputs are present +TEST(ShapeInferenceCustomOpTest, custom_op_optional_input_inference_test) { + MyCustomOpWithOptionalInput custom_op{onnxruntime::kCpuExecutionProvider}; + + const auto& env = GetEnvironment(); + + Ort::CustomOpDomain op_domain("test"); + op_domain.Add(&custom_op); + + std::initializer_list op_domains = {static_cast(op_domain)}; + + SessionOptions sess_opts; + sess_opts.inter_op_param.thread_pool_size = 1; + sess_opts.intra_op_param.thread_pool_size = 1; + + InferenceSessionWrapper session{sess_opts, env, OPTIONAL_INPUT_CUSTOM_OP_MODEL_URI_2}; + ASSERT_STATUS_OK(session.AddCustomOpDomains(AsSpan(op_domains))); + + ASSERT_STATUS_OK(session.Load()); + ASSERT_STATUS_OK(session.Initialize()); + + const onnxruntime::Model& model = session.GetModel(); + const auto& graph = model.MainGraph(); + const auto& nodes = graph.Nodes(); + for (const auto& node : nodes) { + if (node.OpType() == "FooBar") { + // check inferred shapes + const auto* node_arg = node.OutputDefs()[0]; + const auto* type_proto = node_arg->TypeAsProto(); + ASSERT_NE(nullptr, type_proto); + ASSERT_EQ(ONNX_NAMESPACE::TypeProto::ValueCase::kTensorType, type_proto->value_case()); + ASSERT_EQ(ONNX_NAMESPACE::TensorProto_DataType_FLOAT, type_proto->tensor_type().elem_type()); + } + } +} + } // namespace test } // namespace onnxruntime diff --git a/onnxruntime/test/onnx/main.cc b/onnxruntime/test/onnx/main.cc index 5a2104ffeb0da..0d55fd19b918a 100644 --- a/onnxruntime/test/onnx/main.cc +++ b/onnxruntime/test/onnx/main.cc @@ -25,6 +25,10 @@ #include "core/session/onnxruntime_session_options_config_keys.h" #include "nlohmann/json.hpp" +#ifdef USE_CUDA +#include "core/providers/cuda/cuda_provider_options.h" +#endif + using namespace onnxruntime; namespace { @@ -341,11 +345,6 @@ int real_main(int argc, char* argv[], Ort::Env& env) { logging_level = ORT_LOGGING_LEVEL_VERBOSE; } - if (concurrent_session_runs > 1 && repeat_count > 1) { - fprintf(stderr, "when you use '-r [repeat]', please set '-c' to 1\n"); - usage(); - return -1; - } argc -= optind; argv += optind; if (argc < 1) { @@ -406,12 +405,15 @@ int real_main(int argc, char* argv[], Ort::Env& env) { if (enable_tensorrt) { #ifdef USE_TENSORRT - OrtCUDAProviderOptions cuda_options; + Ort::ThrowOnError(OrtSessionOptionsAppendExecutionProvider_Tensorrt(sf, device_id)); +#ifdef USE_CUDA + OrtCUDAProviderOptionsV2 cuda_options; cuda_options.device_id = device_id; cuda_options.do_copy_in_default_stream = true; + cuda_options.use_tf32 = false; // TODO: Support arena configuration for users of test runner - Ort::ThrowOnError(OrtSessionOptionsAppendExecutionProvider_Tensorrt(sf, device_id)); - sf.AppendExecutionProvider_CUDA(cuda_options); + sf.AppendExecutionProvider_CUDA_V2(cuda_options); +#endif #else fprintf(stderr, "TensorRT is not supported in this build"); return -1; @@ -429,10 +431,11 @@ int real_main(int argc, char* argv[], Ort::Env& env) { } if (enable_cuda) { #ifdef USE_CUDA - OrtCUDAProviderOptions cuda_options; + OrtCUDAProviderOptionsV2 cuda_options; cuda_options.do_copy_in_default_stream = true; + cuda_options.use_tf32 = false; // TODO: Support arena configuration for users of test runner - sf.AppendExecutionProvider_CUDA(cuda_options); + sf.AppendExecutionProvider_CUDA_V2(cuda_options); #else fprintf(stderr, "CUDA is not supported in this build"); return -1; diff --git a/onnxruntime/test/optimizer/nhwc_transformer_test.cc 
b/onnxruntime/test/optimizer/nhwc_transformer_test.cc index c254d340cdcb8..e6f0a259805e5 100644 --- a/onnxruntime/test/optimizer/nhwc_transformer_test.cc +++ b/onnxruntime/test/optimizer/nhwc_transformer_test.cc @@ -518,7 +518,7 @@ TEST(NhwcTransformerTests, ConvMixTensorRanks) { #ifdef MLAS_F16VEC_INTRINSICS_SUPPORTED -std::vector randomfp16(const std::vector& shape, MLFloat16 min, MLFloat16 max) { +static std::vector ARangeOfFP16Values(const std::vector& shape, MLFloat16 min, MLFloat16 max) { std::vector val(detail::SizeFromDims(shape)); float start = min.ToFloat(); float end = max.ToFloat(); @@ -534,22 +534,22 @@ std::vector randomfp16(const std::vector& shape, MLFloat16 m return val; } -template <> -NodeArg* ModelTestBuilder::MakeInput(const std::vector& shape, MLFloat16 min, MLFloat16 max) { - return MakeInput(shape, randomfp16(shape, min, max)); +static NodeArg* MakeInputARangeFP16(ModelTestBuilder& builder, const std::vector& shape, + MLFloat16 min, MLFloat16 max) { + return builder.MakeInput(shape, ARangeOfFP16Values(shape, min, max)); } -template <> -NodeArg* ModelTestBuilder::MakeInitializer(const std::vector& shape, MLFloat16 min, MLFloat16 max) { - return MakeInitializer(shape, randomfp16(shape, min, max)); +static NodeArg* MakeInitializerARangeFP16(ModelTestBuilder& builder, const std::vector& shape, + MLFloat16 min, MLFloat16 max) { + return builder.MakeInitializer(shape, ARangeOfFP16Values(shape, min, max)); } TEST(NhwcTransformerTests, ConvFp16) { auto test_case = [&](const std::vector& input_shape, const std::vector& weights_shape) { auto build_test_case = [&](ModelTestBuilder& builder) { - auto* input_arg = builder.MakeInput(input_shape, MLFloat16(-1.5f), MLFloat16(1.5f)); + auto* input_arg = MakeInputARangeFP16(builder, input_shape, MLFloat16(-1.5f), MLFloat16(1.5f)); auto* output_arg = builder.MakeOutput(); - auto* weight_arg = builder.MakeInitializer(weights_shape, MLFloat16(-1.5f), MLFloat16(1.5f)); + auto* weight_arg = MakeInitializerARangeFP16(builder, weights_shape, MLFloat16(-1.5f), MLFloat16(1.5f)); builder.AddConvNode(input_arg, weight_arg, output_arg); }; @@ -575,10 +575,10 @@ TEST(NhwcTransformerTests, ConvFp16) { TEST(NhwcTransformerTests, ConvMaxPoolFp16) { auto test_case = [&](const std::vector& input_shape, const std::vector& weights_shape) { auto build_test_case = [&](ModelTestBuilder& builder) { - auto* input_arg = builder.MakeInput(input_shape, MLFloat16(-1.5f), MLFloat16(1.5f)); + auto* input_arg = MakeInputARangeFP16(builder, input_shape, MLFloat16(-1.5f), MLFloat16(1.5f)); auto* conv_output_arg = builder.MakeIntermediate(); auto* output_arg = builder.MakeOutput(); - auto* conv_weight_arg = builder.MakeInitializer(weights_shape, MLFloat16(-1.5f), MLFloat16(1.5f)); + auto* conv_weight_arg = MakeInitializerARangeFP16(builder, weights_shape, MLFloat16(-1.5f), MLFloat16(1.5f)); builder.AddConvNode(input_arg, conv_weight_arg, conv_output_arg); Node& pool_node = builder.AddNode("MaxPool", {conv_output_arg}, {output_arg}); @@ -609,13 +609,13 @@ TEST(NhwcTransformerTests, ConvMaxPoolFp16) { TEST(NhwcTransformerTests, ConvGlobalAveragePoolFp16) { auto build_test_case = [&](ModelTestBuilder& builder) { - auto* input_arg = builder.MakeInput({1, 23, 13, 13}, MLFloat16(-1.5f), MLFloat16(1.5f)); + auto* input_arg = MakeInputARangeFP16(builder, {1, 23, 13, 13}, MLFloat16(-1.5f), MLFloat16(1.5f)); auto* conv1_output_arg = builder.MakeIntermediate(); auto* conv2_output_arg = builder.MakeIntermediate(); auto* gavgpool1_output_arg = builder.MakeIntermediate(); auto* 
output_arg = builder.MakeOutput(); - auto* conv1_weight_arg = builder.MakeInitializer({30, 23, 3, 3}, MLFloat16(-1.5f), MLFloat16(1.5f)); - auto* conv2_weight_arg = builder.MakeInitializer({16, 30, 1, 1}, MLFloat16(-1.5f), MLFloat16(1.5f)); + auto* conv1_weight_arg = MakeInitializerARangeFP16(builder, {30, 23, 3, 3}, MLFloat16(-1.5f), MLFloat16(1.5f)); + auto* conv2_weight_arg = MakeInitializerARangeFP16(builder, {16, 30, 1, 1}, MLFloat16(-1.5f), MLFloat16(1.5f)); Node& conv1_node = builder.AddConvNode(input_arg, conv1_weight_arg, conv1_output_arg); conv1_node.AddAttribute("pads", std::vector{1, 1, 1, 1}); @@ -640,13 +640,13 @@ TEST(NhwcTransformerTests, ConvGlobalAveragePoolFp16) { TEST(NhwcTransformerTests, ConvAveragePoolFp16) { auto build_test_case = [&](ModelTestBuilder& builder) { - auto* input_arg = builder.MakeInput({1, 23, 13, 13}, MLFloat16(-1.5f), MLFloat16(1.5f)); + auto* input_arg = MakeInputARangeFP16(builder, {1, 23, 13, 13}, MLFloat16(-1.5f), MLFloat16(1.5f)); auto* conv1_output_arg = builder.MakeIntermediate(); auto* conv2_output_arg = builder.MakeIntermediate(); auto* avgpool1_output_arg = builder.MakeIntermediate(); auto* output_arg = builder.MakeOutput(); - auto* conv1_weight_arg = builder.MakeInitializer({30, 23, 3, 3}, MLFloat16(-1.5f), MLFloat16(1.5f)); - auto* conv2_weight_arg = builder.MakeInitializer({16, 30, 3, 3}, MLFloat16(-1.5f), MLFloat16(1.5f)); + auto* conv1_weight_arg = MakeInitializerARangeFP16(builder, {30, 23, 3, 3}, MLFloat16(-1.5f), MLFloat16(1.5f)); + auto* conv2_weight_arg = MakeInitializerARangeFP16(builder, {16, 30, 3, 3}, MLFloat16(-1.5f), MLFloat16(1.5f)); Node& conv1_node = builder.AddConvNode(input_arg, conv1_weight_arg, conv1_output_arg); conv1_node.AddAttribute("pads", std::vector{1, 1, 1, 1}); diff --git a/onnxruntime/test/perftest/ort_test_session.cc b/onnxruntime/test/perftest/ort_test_session.cc index 6e10763d8f293..9743ed18a6cc0 100644 --- a/onnxruntime/test/perftest/ort_test_session.cc +++ b/onnxruntime/test/perftest/ort_test_session.cc @@ -247,7 +247,7 @@ OnnxRuntimeTestSession::OnnxRuntimeTestSession(Ort::Env& env, std::random_device if (key == "device_type") { std::set ov_supported_device_types = {"CPU_FP32", "CPU_FP16", "GPU_FP32", "GPU.0_FP32", "GPU.1_FP32", "GPU_FP16", - "GPU.0_FP16", "GPU.1_FP16"}; + "GPU.0_FP16", "GPU.1_FP16", "NPU"}; if (ov_supported_device_types.find(value) != ov_supported_device_types.end()) { ov_options[key] = value; } else if (value.find("HETERO:") == 0) { @@ -260,7 +260,7 @@ OnnxRuntimeTestSession::OnnxRuntimeTestSession(Ort::Env& env, std::random_device ORT_THROW( "[ERROR] [OpenVINO] You have selected a wrong configuration value for the key 'device_type'. " "Select from 'CPU_FP32', 'CPU_FP16', 'GPU_FP32', 'GPU.0_FP32', 'GPU.1_FP32', 'GPU_FP16', " - "'GPU.0_FP16', 'GPU.1_FP16' or from" + "'GPU.0_FP16', 'GPU.1_FP16', 'NPU' or from" " HETERO/MULTI/AUTO options available. 
\n"); } } else if (key == "device_id") { diff --git a/onnxruntime/test/platform/apple/apple_package_test/apple_package_test.xcodeproj/project.pbxproj b/onnxruntime/test/platform/apple/apple_package_test/apple_package_test.xcodeproj/project.pbxproj index f0582d41734bd..eb7345be3770b 100644 --- a/onnxruntime/test/platform/apple/apple_package_test/apple_package_test.xcodeproj/project.pbxproj +++ b/onnxruntime/test/platform/apple/apple_package_test/apple_package_test.xcodeproj/project.pbxproj @@ -49,6 +49,7 @@ 229E595826586B4A006E41AE /* sigmoid.ort */ = {isa = PBXFileReference; lastKnownFileType = file; path = sigmoid.ort; sourceTree = ""; }; 22C1D8DE271A79AF002CEE67 /* ios_package_testUITests.xctest */ = {isa = PBXFileReference; explicitFileType = wrapper.cfbundle; includeInIndex = 0; path = ios_package_testUITests.xctest; sourceTree = BUILT_PRODUCTS_DIR; }; 22C1D8E9271A79FD002CEE67 /* ios_package_uitest_cpp_api.mm */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.objcpp; path = ios_package_uitest_cpp_api.mm; sourceTree = ""; }; + 513C65792B85789400E4EDFD /* ios_package_test.entitlements */ = {isa = PBXFileReference; lastKnownFileType = text.plist.entitlements; path = ios_package_test.entitlements; sourceTree = ""; }; 51C316B92B0881450033C70B /* macos_package_test.app */ = {isa = PBXFileReference; explicitFileType = wrapper.application; includeInIndex = 0; path = macos_package_test.app; sourceTree = BUILT_PRODUCTS_DIR; }; 51C316BB2B0881450033C70B /* AppDelegate.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = AppDelegate.h; sourceTree = ""; }; 51C316BC2B0881450033C70B /* AppDelegate.m */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.objc; path = AppDelegate.m; sourceTree = ""; }; @@ -117,6 +118,7 @@ 229E591E265869BF006E41AE /* ios_package_test */ = { isa = PBXGroup; children = ( + 513C65792B85789400E4EDFD /* ios_package_test.entitlements */, 229E591F265869BF006E41AE /* AppDelegate.h */, 229E5920265869BF006E41AE /* AppDelegate.m */, 229E5928265869BF006E41AE /* Main.storyboard */, @@ -521,8 +523,11 @@ buildSettings = { ASSETCATALOG_COMPILER_APPICON_NAME = AppIcon; ASSETCATALOG_COMPILER_GLOBAL_ACCENT_COLOR_NAME = AccentColor; - CODE_SIGN_STYLE = Automatic; + CODE_SIGNING_REQUIRED = NO; + CODE_SIGNING_STYLE = Automatic; + CODE_SIGN_ENTITLEMENTS = ios_package_test/ios_package_test.entitlements; INFOPLIST_FILE = ios_package_test/Info.plist; + IPHONEOS_DEPLOYMENT_TARGET = 14.0; LD_RUNPATH_SEARCH_PATHS = ( "$(inherited)", "@executable_path/Frameworks", @@ -530,9 +535,9 @@ PRODUCT_BUNDLE_IDENTIFIER = "ai.onnxruntime.tests.ios-package-test"; PRODUCT_NAME = "$(TARGET_NAME)"; SUPPORTED_PLATFORMS = "iphoneos iphonesimulator"; - SUPPORTS_MACCATALYST = NO; + SUPPORTS_MACCATALYST = YES; SUPPORTS_MAC_DESIGNED_FOR_IPHONE_IPAD = NO; - TARGETED_DEVICE_FAMILY = 1; + TARGETED_DEVICE_FAMILY = "1,2"; }; name = Debug; }; @@ -541,8 +546,11 @@ buildSettings = { ASSETCATALOG_COMPILER_APPICON_NAME = AppIcon; ASSETCATALOG_COMPILER_GLOBAL_ACCENT_COLOR_NAME = AccentColor; - CODE_SIGN_STYLE = Automatic; + CODE_SIGNING_REQUIRED = NO; + CODE_SIGNING_STYLE = Automatic; + CODE_SIGN_ENTITLEMENTS = ios_package_test/ios_package_test.entitlements; INFOPLIST_FILE = ios_package_test/Info.plist; + IPHONEOS_DEPLOYMENT_TARGET = 14.0; LD_RUNPATH_SEARCH_PATHS = ( "$(inherited)", "@executable_path/Frameworks", @@ -550,9 +558,9 @@ PRODUCT_BUNDLE_IDENTIFIER = "ai.onnxruntime.tests.ios-package-test"; PRODUCT_NAME = "$(TARGET_NAME)"; SUPPORTED_PLATFORMS = "iphoneos 
iphonesimulator"; - SUPPORTS_MACCATALYST = NO; + SUPPORTS_MACCATALYST = YES; SUPPORTS_MAC_DESIGNED_FOR_IPHONE_IPAD = NO; - TARGETED_DEVICE_FAMILY = 1; + TARGETED_DEVICE_FAMILY = "1,2"; }; name = Release; }; @@ -563,7 +571,7 @@ CODE_SIGN_STYLE = Automatic; CURRENT_PROJECT_VERSION = 1; GENERATE_INFOPLIST_FILE = YES; - IPHONEOS_DEPLOYMENT_TARGET = 13.0; + IPHONEOS_DEPLOYMENT_TARGET = 14.0; LD_RUNPATH_SEARCH_PATHS = ( "$(inherited)", "@executable_path/Frameworks", @@ -585,7 +593,7 @@ CODE_SIGN_STYLE = Automatic; CURRENT_PROJECT_VERSION = 1; GENERATE_INFOPLIST_FILE = YES; - IPHONEOS_DEPLOYMENT_TARGET = 13.0; + IPHONEOS_DEPLOYMENT_TARGET = 14.0; LD_RUNPATH_SEARCH_PATHS = ( "$(inherited)", "@executable_path/Frameworks", diff --git a/onnxruntime/test/platform/apple/apple_package_test/ios_package_test/ios_package_test.entitlements b/onnxruntime/test/platform/apple/apple_package_test/ios_package_test/ios_package_test.entitlements new file mode 100644 index 0000000000000..ee95ab7e582d4 --- /dev/null +++ b/onnxruntime/test/platform/apple/apple_package_test/ios_package_test/ios_package_test.entitlements @@ -0,0 +1,10 @@ + + + + + com.apple.security.app-sandbox + + com.apple.security.network.client + + + diff --git a/onnxruntime/test/providers/base_tester.cc b/onnxruntime/test/providers/base_tester.cc index e94f8c2673be3..8d84c689cd23e 100644 --- a/onnxruntime/test/providers/base_tester.cc +++ b/onnxruntime/test/providers/base_tester.cc @@ -120,6 +120,20 @@ void BaseTester::SetOutputRelErr(const char* name, float v) { it->validation_params.relative_error = optional(v); } +void BaseTester::SetOutputTolerance(float abs_error, float rel_error) { + for (auto& output : output_data_) { + if (output.def.Exists()) { + if (abs_error >= 0.0f) { + output.validation_params.absolute_error = optional(abs_error); + } + + if (rel_error >= 0.0f) { + output.validation_params.relative_error = optional(rel_error); + } + } + } +} + std::vector BaseTester::GetDimsForProto(gsl::span dims) { std::vector dims_for_proto{dims.begin(), dims.end()}; if (add_symbolic_dim_to_tensor_data_ >= 0 && diff --git a/onnxruntime/test/providers/base_tester.h b/onnxruntime/test/providers/base_tester.h index 5607e58315a12..c276ae494df43 100644 --- a/onnxruntime/test/providers/base_tester.h +++ b/onnxruntime/test/providers/base_tester.h @@ -519,9 +519,20 @@ class BaseTester { custom_session_registries_.push_back(registry); } + // For floating types (double/float/half/bfloat16), tolerance is similar to numpy.isclose: + // absolute(expected_value - actual_value) <= abs_error + rel_error * absolute(expected_value) + // For integer types, tolerance parameters are ignored except the following cases: + // For uint8, tolerance is only applied to NNAPI/XNNPACK/DML providers. + // For int8, only abs_error is used, and rel_error is ignored. See checkers.cc for detail. + // If abs_error or rel_error is not set, a default value is used (search DefaultTolerance for detail). void SetOutputAbsErr(const char* name, float v); void SetOutputRelErr(const char* name, float v); + // Set absolute and relative tolerance for all existed outputs. + // Negative value will be ignored. + // Note that it will not set tolerance for new outputs added after this call. + void SetOutputTolerance(float abs_error, float rel_error = -1.0f); + // Number of times to call InferenceSession::Run. The same feeds are used each time. // e.g. 
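SetOutputTolerance, introduced in the base_tester hunks above, applies one absolute/relative pair to every output already registered on the tester; outputs added afterwards are unaffected, and a negative component keeps the provider-aware default. A hedged usage sketch (the operator, shapes, and values are placeholders, not taken from this patch):

```cpp
// Illustrative only: op and values are placeholders.
OpTester test("GlobalAveragePool");
test.AddInput<float>("X", {1, 1, 2, 2}, {1.0f, 2.0f, 3.0f, 4.0f});
test.AddOutput<float>("Y", {1, 1, 1, 1}, {2.5f});
// One call covers all outputs added so far; -1.0f leaves the relative
// tolerance at the default resolved in checkers.cc.
test.SetOutputTolerance(0.0001f /*abs_error*/, -1.0f /*rel_error*/);
test.Run();
```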
used to verify the generator ops behave as expected void SetNumRunCalls(int n) { diff --git a/onnxruntime/test/providers/checkers.cc b/onnxruntime/test/providers/checkers.cc index c97e6d9de4911..47c18c478dd9c 100644 --- a/onnxruntime/test/providers/checkers.cc +++ b/onnxruntime/test/providers/checkers.cc @@ -20,46 +20,87 @@ struct DefaultTolerance; template <> struct DefaultTolerance { - static constexpr float absolute = 1e-6f; + static constexpr float absolute = 1e-5f; static constexpr float relative = 1e-5f; + + // Allow to have different default absolute tolerance for different providers. + static float get_absolute(const std::string& /*provider_type*/) { + return absolute; + } }; template <> struct DefaultTolerance { +#if defined(ENABLE_TRAINING) + static constexpr float absolute = 1e-3f; +#else static constexpr float absolute = 1e-5f; +#endif + static constexpr float relative = 1e-4f; + + static float get_absolute(const std::string& /*provider_type*/) { + return absolute; + } }; template <> struct DefaultTolerance { - // The thresholds are estimated with PyTorch script like the following: +#if defined(ENABLE_TRAINING) + static constexpr float absolute = 0.005f; +#else + // The thresholds for inference are estimated with PyTorch script like the following: // x = torch.rand(1000, 1000) // absolute = ((x + 1e-6).to(torch.float16) - x).abs().max() * 10 // x[abs(x) < absolute] = absolute // relative = ((x - x.to(torch.float16)) / x).abs().max() * 2 static constexpr float absolute = 0.0025f; +#endif + static constexpr float relative = 0.001f; + + static float get_absolute(const std::string& provider_type) { + if (provider_type == kDmlExecutionProvider) { + return 0.005f; + } + return absolute; + } }; template <> struct DefaultTolerance { + // The thresholds for inference are estimated with PyTorch script like the following: + // x = torch.rand(1000, 1000) + // absolute = ((x + 1e-6).to(torch.bfloat16) - x).abs().max() * 10 + // x[abs(x) < absolute] = absolute + // relative = ((x - x.to(torch.bfloat16)) / x).abs().max() * 2 static constexpr float absolute = 0.02f; static constexpr float relative = 0.01f; + + static float get_absolute(const std::string& /*provider_type*/) { + return absolute; + } +}; + +struct ToleranceParams { + float absolute; + float relative; }; template -T get_tolerance(float absolute, float relative, T expected_value) { +ToleranceParams get_tolerance_params(const ValidateOutputParams& params, const std::string& provider_type) { + ToleranceParams new_params; + new_params.absolute = params.absolute_error.has_value() ? *(params.absolute_error) : DefaultTolerance::get_absolute(provider_type); + new_params.relative = params.relative_error.has_value() ? *(params.relative_error) : DefaultTolerance::relative; + return new_params; +} + +template +T get_tolerance(const ToleranceParams& params, T expected_value) { static_assert(std::is_floating_point::value, "T must be a floating point type"); // The formula is similar to numpy.isclose: https://numpy.org/doc/stable/reference/generated/numpy.isclose.html - return static_cast(absolute) + static_cast(relative) * std::abs(expected_value); -} - -template // D is the original data type -T get_tolerance(const ValidateOutputParams& params, T expected_value) { - float absolute = (params.absolute_error.has_value() ? *(params.absolute_error) : DefaultTolerance::absolute); - float relative = (params.relative_error.has_value() ? 
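The net effect of the checkers.cc rework above is a single numpy.isclose-style rule for every floating-point element type. A self-contained sketch of that rule, including the NaN/infinity special-casing the comparison loops perform before the tolerance is applied:

```cpp
// Sketch: pass when |expected - actual| <= abs_tol + rel_tol * |expected|,
// with NaN and infinity handled before the tolerance, as in checkers.cc.
#include <cmath>

bool IsCloseSketch(double expected, double actual, double abs_tol, double rel_tol) {
  if (std::isnan(expected)) return std::isnan(actual);   // NaN expected -> NaN required
  if (std::isinf(expected)) return expected == actual;   // infinities compared exactly
  return std::fabs(expected - actual) <= abs_tol + rel_tol * std::fabs(expected);
}
```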
*(params.relative_error) : DefaultTolerance::relative); - return get_tolerance(absolute, relative, expected_value); + return static_cast(params.absolute) + static_cast(params.relative) * std::abs(expected_value); } template @@ -201,7 +242,10 @@ struct TensorCheck { cur_actual = actual.template Data(); } - const bool has_abs_err = params.absolute_error.has_value(); + // When absolute error is less than 1 for int8, it has same effect as no tolerance. + const bool has_abs_err = params.absolute_error.has_value() && *(params.absolute_error) >= 1.0f; + + // TODO: the relative error is not used for int8 yet. if (has_abs_err) { double threshold = *(params.absolute_error); @@ -221,11 +265,9 @@ struct TensorCheck { void operator()(const Tensor& expected, const Tensor& actual, const ValidateOutputParams& params, - const std::string& /*provider_type*/) const { + const std::string& provider_type) const { auto size = actual.Shape().Size(); - const bool has_tolerance = params.absolute_error.has_value() || params.relative_error.has_value(); - // deal with rare cases in which order of output data from a kernel MAY be // undefined Tensor expected_sorted, actual_sorted; @@ -240,10 +282,7 @@ struct TensorCheck { cur_actual = actual.Data(); } - double threshold = 0.001; -#if defined(USE_CUDA) || defined(USE_ROCM) || defined(USE_DML) - threshold = 0.005; -#endif + auto tolerance_params = get_tolerance_params(params, provider_type); for (int64_t i = 0; i < size; ++i) { // NOTE: Check isnan first to work around MSVC linker bug when /LTCG:incremental is specified. @@ -253,7 +292,7 @@ struct TensorCheck { } else if (std::isinf(cur_expected[i])) { // Test infinity for equality EXPECT_EQ(cur_expected[i], cur_actual[i]) << "Expected infinity. i:" << i; } else { - double tolerance = has_tolerance ? get_tolerance(params, cur_expected[i]) : threshold; + double tolerance = get_tolerance(tolerance_params, cur_expected[i]); EXPECT_NEAR(cur_expected[i], cur_actual[i], tolerance) << "i:" << i; } } @@ -264,9 +303,7 @@ template void InternalNumericalCheck(const Tensor& expected, const Tensor& actual, const ValidateOutputParams& params, - const std::string& /*provider_type*/) { - const bool has_tolerance = params.absolute_error.has_value() || params.relative_error.has_value(); - + const std::string& provider_type) { // deal with rare cases in which order of output data from a kernel MAY be // undefined Tensor expected_sorted, actual_sorted; @@ -282,11 +319,7 @@ void InternalNumericalCheck(const Tensor& expected, cur_actual = actual.Data(); } -#if defined(USE_CUDA) || defined(USE_ROCM) || defined(USE_DML) - constexpr float threshold = 0.005f; -#else - constexpr float threshold = 0.0001f; -#endif + auto tolerance_params = get_tolerance_params(params, provider_type); for (int64_t i = 0; i < size; ++i) { // NOTE: Check isnan first to work around MSVC linker bug when /LTCG:incremental is specified. @@ -296,7 +329,7 @@ void InternalNumericalCheck(const Tensor& expected, } else if (std::isinf(cur_expected[i])) { // Test infinity for equality EXPECT_EQ(cur_expected[i], cur_actual[i]) << "Expected infinity. i:" << i; } else { - T tolerance = has_tolerance ? 
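Defaults are now resolved per element type and, where needed, per provider: an explicit SetOutputAbsErr/SetOutputRelErr/SetOutputTolerance value always wins, otherwise DefaultTolerance supplies the fallback, with DML given a looser fp16 absolute default. A sketch of that resolution order; the provider-type string is assumed to match the usual kDmlExecutionProvider constant.

```cpp
// Sketch mirroring DefaultTolerance<MLFloat16>::get_absolute from this diff.
// "DmlExecutionProvider" is assumed to be the value of kDmlExecutionProvider.
#include <optional>
#include <string>

struct Fp16ToleranceSketch {
  static constexpr float absolute = 0.0025f;  // inference default from the diff
  static constexpr float relative = 0.001f;

  static float Absolute(const std::optional<float>& explicit_abs, const std::string& provider_type) {
    if (explicit_abs.has_value()) return *explicit_abs;           // per-output override wins
    if (provider_type == "DmlExecutionProvider") return 0.005f;   // looser default for DML
    return absolute;
  }
};
```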
get_tolerance(params, cur_expected[i]) : threshold; + T tolerance = get_tolerance(tolerance_params, cur_expected[i]); EXPECT_NEAR(cur_expected[i], cur_actual[i], tolerance) << "i:" << i; } } @@ -317,7 +350,7 @@ struct TensorCheck { void operator()(const Tensor& expected, const Tensor& actual, const ValidateOutputParams& params, - const std::string& /*provider_type*/) const { + const std::string& provider_type) const { auto* cur_expected = expected.Data(); auto* cur_actual = actual.Data(); auto size = actual.Shape().Size(); @@ -333,21 +366,15 @@ struct TensorCheck { sort_expected_and_actual_buffers(f_expected, f_actual); } - const bool has_tolerance = params.absolute_error.has_value() || params.relative_error.has_value(); + auto tolerance_params = get_tolerance_params(params, provider_type); - float threshold = 0.001f; -#if defined(USE_TENSORRT) || defined(ENABLE_TRAINING_CORE) || defined(USE_CUDA) || defined(USE_ROCM) - threshold = 0.005f; -#elif defined(USE_DML) - threshold = 0.02f; -#endif for (int64_t i = 0; i < size; ++i) { if (std::isnan(f_expected[i])) { EXPECT_TRUE(std::isnan(f_expected[i])) << "Expected NaN. i:" << i; } else if (std::isinf(f_expected[i])) { // Test infinity for equality EXPECT_EQ(f_expected[i], f_actual[i]) << "Expected infinity. i:" << i; } else { - float tolerance = has_tolerance ? get_tolerance(params, f_expected[i]) : threshold; + float tolerance = get_tolerance(tolerance_params, f_expected[i]); EXPECT_NEAR(f_expected[i], f_actual[i], tolerance) << "i:" << i; } } @@ -359,7 +386,7 @@ struct TensorCheck { void operator()(const Tensor& expected, const Tensor& actual, const ValidateOutputParams& params, - const std::string& /*provider_type*/) const { + const std::string& provider_type) const { auto* cur_expected = expected.Data(); auto* cur_actual = actual.Data(); auto size = actual.Shape().Size(); @@ -375,13 +402,7 @@ struct TensorCheck { sort_expected_and_actual_buffers(f_expected, f_actual); } - const bool has_tolerance = params.absolute_error.has_value() || params.relative_error.has_value(); - - float abs_threshold = 0.0001f; - float rel_threshold = 0.001f; -#if defined(USE_TENSORRT) || defined(ENABLE_TRAINING_CORE) || defined(USE_CUDA) || defined(USE_ROCM) || defined(USE_DML) || defined(USE_DNNL) - rel_threshold = 0.05f; // expect at least 95% close -#endif + auto tolerance_params = get_tolerance_params(params, provider_type); for (int64_t i = 0; i < size; ++i) { if (std::isnan(f_expected[i])) { @@ -389,9 +410,7 @@ struct TensorCheck { } else if (std::isinf(f_expected[i])) { // Test infinity for equality EXPECT_EQ(f_expected[i], f_actual[i]) << "Expected infinity. i:" << i; } else { - float tolerance = has_tolerance - ? get_tolerance(params, f_expected[i]) - : get_tolerance(abs_threshold, rel_threshold, f_expected[i]); + float tolerance = get_tolerance(tolerance_params, f_expected[i]); EXPECT_NEAR(f_expected[i], f_actual[i], tolerance) << "i:" << i; } } diff --git a/onnxruntime/test/providers/cpu/activation/activation_op_test.cc b/onnxruntime/test/providers/cpu/activation/activation_op_test.cc index acd513172f95d..d2e883331acd4 100644 --- a/onnxruntime/test/providers/cpu/activation/activation_op_test.cc +++ b/onnxruntime/test/providers/cpu/activation/activation_op_test.cc @@ -697,7 +697,9 @@ TEST(LeakyReluGradInferenceTest, Basic) { // Remove DNNL from running this test because DNNL Gelu op seems not check domain for kernel implementation. // It will run the DNNL Gelu op which only be part of standard of Gelu-20 op. 
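Both half-precision checkers above widen the buffers to float before applying the shared tolerance rule; a minimal sketch of that widening step (sorting for order-insensitive outputs omitted):

```cpp
// Sketch: widen an MLFloat16 buffer to float so the float comparison rule applies.
#include <cstdint>
#include <vector>

std::vector<float> WidenToFloatSketch(const MLFloat16* data, int64_t size) {
  std::vector<float> out(static_cast<size_t>(size));
  for (int64_t i = 0; i < size; ++i) {
    out[static_cast<size_t>(i)] = data[i].ToFloat();
  }
  return out;
}
```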
-#if !defined(USE_DNNL) && !defined(USE_QNN) +// [TODO] Temporarily ignore this test for OpenVINO to avoid an exception due to mishandling of the +// approximate parameter. Re-enable it later when the issue is fixed +#if !defined(USE_DNNL) && !defined(USE_QNN) && !defined(USE_OPENVINO) TEST_F(ActivationOpTest, ONNX_Gelu) { TestActivationOp( "Gelu", diff --git a/onnxruntime/test/providers/cpu/activation/activation_op_test.h b/onnxruntime/test/providers/cpu/activation/activation_op_test.h index 984b8f4437a3b..9a74d763a13e3 100644 --- a/onnxruntime/test/providers/cpu/activation/activation_op_test.h +++ b/onnxruntime/test/providers/cpu/activation/activation_op_test.h @@ -69,6 +69,11 @@ inline void TestActivationOp(const char* szOp, const std::vector> test.SetOutputRelErr("Y", .000001f); } #endif + + if (strcmp(szOp, "QuickGelu") == 0) { + test.SetOutputTolerance(0.0001f); + } + test.Run(OpTester::ExpectResult::kExpectSuccess, "", excluded_providers); } } diff --git a/onnxruntime/test/providers/cpu/math/einsum_test.cc b/onnxruntime/test/providers/cpu/math/einsum_test.cc index 4e968d3de6b8a..423ea3f682f4c 100644 --- a/onnxruntime/test/providers/cpu/math/einsum_test.cc +++ b/onnxruntime/test/providers/cpu/math/einsum_test.cc @@ -4,6 +4,7 @@ #include "gtest/gtest.h" #include "test/providers/provider_test_utils.h" #include "test/common/cuda_op_test_utils.h" +#include "test/common/trt_op_test_utils.h" #include "core/framework/data_types.h" #include "core/util/math.h" @@ -50,7 +51,7 @@ TEST(Einsum, ExplicitEinsumAsTransposeOp_2D_input_With_Broadcasting) { test.AddAttribute("equation", "...i->i..."); test.AddInput("x", {2, 2}, {1.f, 2.f, 3.f, 4.f}); test.AddOutput("y", {2, 2}, {1.f, 3.f, 2.f, 4.f}); - test.Run(); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", ExcludeTrtOnA100()); } TEST(Einsum, ExplicitEinsumAsBatchedTransposeOp_3D_input) { @@ -58,7 +59,7 @@ TEST(Einsum, ExplicitEinsumAsBatchedTransposeOp_3D_input) { test.AddAttribute("equation", "...ji->...ij"); test.AddInput("x", {2, 2, 2}, {1.f, 2.f, 3.f, 4.f, 1.f, 2.f, 3.f, 4.f}); test.AddOutput("y", {2, 2, 2}, {1.f, 3.f, 2.f, 4.f, 1.f, 3.f, 2.f, 4.f}); - test.Run(); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", ExcludeTrtOnA100()); } // Implicit @@ -75,7 +76,7 @@ TEST(Einsum, ImplicitEinsumAsBatchedTransposeOp_3D_input) { test.AddAttribute("equation", "...ji"); test.AddInput("x", {2, 2, 2}, {1.f, 2.f, 3.f, 4.f, 1.f, 2.f, 3.f, 4.f}); test.AddOutput("y", {2, 2, 2}, {1.f, 3.f, 2.f, 4.f, 1.f, 3.f, 2.f, 4.f}); - test.Run(); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", ExcludeTrtOnA100()); } // Theme: Axis/Axes reduction @@ -102,7 +103,7 @@ TEST(Einsum, ExplicitEinsumAsBatchedReduceOp_3D_input_0) { test.AddAttribute("equation", "...ji->...j"); test.AddInput("x", {2, 2, 2}, {1.f, 2.f, 3.f, 4.f, 1.f, 2.f, 3.f, 4.f}); test.AddOutput("y", {2, 2}, {3.f, 7.f, 3.f, 7.f}); - test.Run(); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", ExcludeTrtOnA100()); } TEST(Einsum, ExplicitEinsumAsBatchedReduceOp_3D_input_1) { @@ -110,7 +111,7 @@ TEST(Einsum, ExplicitEinsumAsBatchedReduceOp_3D_input_1) { test.AddAttribute("equation", "...ji->..."); test.AddInput("x", {2, 2, 2}, {1.f, 2.f, 3.f, 4.f, 1.f, 2.f, 3.f, 4.f}); test.AddOutput("y", {2}, {10.f, 10.f}); - test.Run(); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", ExcludeTrtOnA100()); } // Implicit @@ -144,7 +145,7 @@ TEST(Einsum, ExplicitEinsumAsOuterProductWithTransposeOp_Multi_Input) { test.AddInput("y", {2}, {3.f, 4.f}); test.AddInput("z", {2}, {5.f, 6.f}); 
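The einsum changes in this and the following hunks all follow one pattern: route test.Run through an exclusion set so TensorRT is skipped on A100-class GPUs, where TF32 makes fp32 results drift. The helper itself is declared in test/common/trt_op_test_utils.h; a plausible sketch, assuming it builds on the HasCudaEnvironment capability check from test/common/cuda_op_test_utils.h:

```cpp
// Sketch only: the real ExcludeTrtOnA100 lives in test/common/trt_op_test_utils.h.
// HasCudaEnvironment(int min_sm) is the existing helper used elsewhere in this diff.
#include <string>
#include <unordered_set>

const std::unordered_set<std::string>& ExcludeTrtOnA100Sketch() {
  static const std::unordered_set<std::string> with_trt = {"TensorrtExecutionProvider"};
  static const std::unordered_set<std::string> none;
  return HasCudaEnvironment(800) ? with_trt : none;  // SM 8.0 == A100 class
}
```

Each call site then passes the result as the excluded-provider set, exactly as the repeated `test.Run(OpTester::ExpectResult::kExpectSuccess, "", ExcludeTrtOnA100());` lines in these hunks do.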
test.AddOutput("o", {2, 2, 2}, {15.f, 18.f, 30.f, 36.f, 20.f, 24.f, 40.f, 48.f}); - test.Run(); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", ExcludeTrtOnA100()); } // Implicit @@ -155,7 +156,7 @@ TEST(Einsum, ImplicitEinsumAsOuterProductOp_2D_input) { test.AddInput("y", {2}, {3.f, 4.f}); test.AddInput("z", {2}, {5.f, 6.f}); test.AddOutput("o", {2, 2, 2}, {15.f, 18.f, 20.f, 24.f, 30.f, 36.f, 40.f, 48.f}); - test.Run(); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", ExcludeTrtOnA100()); } TEST(Einsum, ImplicitEinsumAsOuterProductOp_Multi_Input) { @@ -165,7 +166,7 @@ TEST(Einsum, ImplicitEinsumAsOuterProductOp_Multi_Input) { test.AddInput("y", {2}, {3.f, 4.f}); test.AddInput("z", {2}, {5.f, 6.f}); test.AddOutput("o", {2, 2, 2}, {15.f, 18.f, 20.f, 24.f, 30.f, 36.f, 40.f, 48.f}); - test.Run(); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", ExcludeTrtOnA100()); } // Theme: MatMul @@ -233,7 +234,7 @@ TEST(Einsum, ExplicitEinsumAsMatmul_Multi_Input) { test.AddInput("y", {2, 2}, {1.f, 2.f, 3.f, 4.f}); test.AddInput("z", {2, 2}, {1.f, 2.f, 3.f, 4.f}); test.AddOutput("o", {2, 2}, {37.f, 81.f, 54.f, 118.f}); - test.Run(); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", ExcludeTrtOnA100()); } TEST(Einsum, ExplicitEinsumAsBatchedMatmul) { @@ -251,7 +252,7 @@ TEST(Einsum, ExplicitEinsumAsBatchedMatmulWithBroadcasting_0) { test.AddInput("x", {2, 2, 2}, {1.f, 2.f, 3.f, 4.f, 1.f, 2.f, 3.f, 4.f}); test.AddInput("y", {2, 2, 2}, {1.f, 2.f, 3.f, 4.f, 1.f, 2.f, 3.f, 4.f}); test.AddOutput("o", {2, 2, 2}, {7.f, 10.f, 15.f, 22.f, 7.f, 10.f, 15.f, 22.f}); - test.Run(); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", ExcludeTrtOnA100()); } TEST(Einsum, ExplicitEinsumAsBatchedMatmulWithBroadcasting_1) { @@ -260,7 +261,7 @@ TEST(Einsum, ExplicitEinsumAsBatchedMatmulWithBroadcasting_1) { test.AddInput("x", {2, 2, 2}, {1.f, 2.f, 3.f, 4.f, 1.f, 2.f, 3.f, 4.f}); test.AddInput("y", {2, 2, 2}, {1.f, 2.f, 3.f, 4.f, 1.f, 2.f, 3.f, 4.f}); test.AddOutput("o", {2, 2, 2}, {14.f, 20.f, 30.f, 44.f, 14.f, 20.f, 30.f, 44.f}); - test.Run(); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", ExcludeTrtOnA100()); } TEST(Einsum, ExplicitEinsumAsMatmul_OutputTransposed) { @@ -303,7 +304,7 @@ TEST(Einsum, ImplicitEinsumAsMatmul_Multi_Input) { test.AddInput("y", {2, 2}, {1.f, 2.f, 3.f, 4.f}); test.AddInput("z", {2, 2}, {1.f, 2.f, 3.f, 4.f}); test.AddOutput("o", {2, 2}, {37.f, 54.f, 81.f, 118.f}); - test.Run(); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", ExcludeTrtOnA100()); } TEST(Einsum, ImplicitEinsumAsBatchedMatmul) { OpTester test("Einsum", 12, onnxruntime::kOnnxDomain); @@ -320,7 +321,7 @@ TEST(Einsum, ImplicitEinsumAsBatchedMatmulWithBroadcasting_0) { test.AddInput("x", {2, 2, 2}, {1.f, 2.f, 3.f, 4.f, 1.f, 2.f, 3.f, 4.f}); test.AddInput("y", {2, 2, 2}, {1.f, 2.f, 3.f, 4.f, 1.f, 2.f, 3.f, 4.f}); test.AddOutput("o", {2, 2, 2}, {7.f, 10.f, 15.f, 22.f, 7.f, 10.f, 15.f, 22.f}); - test.Run(); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", ExcludeTrtOnA100()); } TEST(Einsum, ImplicitEinsumAsMatmul_2) { @@ -343,7 +344,7 @@ TEST(Einsum, DiagonalWithMatmul) { test.AddInput("x", {2, 2, 3}, {1.f, 2.f, 3.f, 1.f, 2.f, 3.f, 1.f, 2.f, 3.f, 1.f, 2.f, 3.f}); test.AddInput("y", {3, 3}, {1.f, 2.f, 3.f, 4.f, 5.f, 6.f, 7.f, 8.f, 9.f}); test.AddOutput("o", {3}, {60.f, 72.f, 84.f}); - test.Run(); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", ExcludeTrtOnA100()); } // Theme: Diagonal parsing @@ -354,7 +355,7 @@ TEST(Einsum, ExplicitEinsumAsDiagonalOp) { test.AddAttribute("equation", 
"ii->i"); test.AddInput("x", {2, 2}, {1.f, 2.f, 3.f, 4.f}); test.AddOutput("o", {2}, {1.f, 4.f}); - test.Run(); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", ExcludeTrtOnA100()); } TEST(Einsum, ExplicitEinsumAsDiagonalOp_1) { @@ -362,7 +363,7 @@ TEST(Einsum, ExplicitEinsumAsDiagonalOp_1) { test.AddAttribute("equation", "iii->i"); test.AddInput("x", {2, 2, 2}, {1.f, 2.f, 3.f, 4.f, 1.f, 2.f, 3.f, 4.f}); test.AddOutput("o", {2}, {1.f, 4.f}); - test.Run(); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", ExcludeTrtOnA100()); } TEST(Einsum, ExplicitEinsumAsDiagonalOpWithAxisReduced) { @@ -370,7 +371,7 @@ TEST(Einsum, ExplicitEinsumAsDiagonalOpWithAxisReduced) { test.AddAttribute("equation", "iji->j"); test.AddInput("x", {2, 2, 2}, {1.f, 2.f, 3.f, 4.f, 1.f, 2.f, 3.f, 4.f}); test.AddOutput("o", {2}, {3.f, 7.f}); - test.Run(); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", ExcludeTrtOnA100()); } TEST(Einsum, ExplicitEinsumAsDiagonalOpWithAxisPreserved) { @@ -378,7 +379,7 @@ TEST(Einsum, ExplicitEinsumAsDiagonalOpWithAxisPreserved) { test.AddAttribute("equation", "iji->ij"); test.AddInput("x", {2, 2, 2}, {1.f, 2.f, 3.f, 4.f, 1.f, 2.f, 3.f, 4.f}); test.AddOutput("o", {2, 2}, {1.f, 3.f, 2.f, 4.f}); - test.Run(); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", ExcludeTrtOnA100()); } TEST(Einsum, ExplicitEinsumAsDiagonalOpWithTranspose) { @@ -386,7 +387,7 @@ TEST(Einsum, ExplicitEinsumAsDiagonalOpWithTranspose) { test.AddAttribute("equation", "iji->ji"); test.AddInput("x", {2, 2, 2}, {1.f, 2.f, 3.f, 4.f, 1.f, 2.f, 3.f, 4.f}); test.AddOutput("o", {2, 2}, {1.f, 2.f, 3.f, 4.f}); - test.Run(); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", ExcludeTrtOnA100()); } // ROCm doesn't support double @@ -396,7 +397,7 @@ TEST(Einsum, ExplicitEinsumAsDiagonalOpWithTranspose_double) { test.AddAttribute("equation", "iji->ji"); test.AddInput("x", {2, 2, 2}, {1., 2., 3., 4., 1., 2., 3., 4.}); test.AddOutput("o", {2, 2}, {1., 2., 3., 4.}); - test.Run(); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", ExcludeTrtOnA100()); } #endif @@ -405,7 +406,7 @@ TEST(Einsum, ExplicitEinsumAsDiagonalOpWithTranspose_int32) { test.AddAttribute("equation", "iji->ji"); test.AddInput("x", {2, 2, 2}, {1, 2, 3, 4, 1, 2, 3, 4}); test.AddOutput("o", {2, 2}, {1, 2, 3, 4}); - test.Run(); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", ExcludeTrtOnA100()); } TEST(Einsum, ExplicitEinsumAsDiagonalOpWithTranspose_int64) { @@ -413,14 +414,14 @@ TEST(Einsum, ExplicitEinsumAsDiagonalOpWithTranspose_int64) { test.AddAttribute("equation", "iji->ji"); test.AddInput("x", {2, 2, 2}, {1, 2, 3, 4, 1, 2, 3, 4}); test.AddOutput("o", {2, 2}, {1, 2, 3, 4}); - test.Run(); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", ExcludeTrtOnA100()); } TEST(Einsum, ExplicitEinsumAsBatchedDiagonalOp) { OpTester test("Einsum", 12, onnxruntime::kOnnxDomain); test.AddAttribute("equation", "...ii->...i"); test.AddInput("x", {3, 2, 2}, {1.f, 2.f, 3.f, 4.f, 1.f, 2.f, 3.f, 4.f, 1.f, 2.f, 3.f, 4.f}); test.AddOutput("o", {3, 2}, {1.f, 4.f, 1.f, 4.f, 1.f, 4.f}); - test.Run(); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", ExcludeTrtOnA100()); } TEST(Einsum, ExplicitEinsumAsBatchedDiagonalOp_1) { @@ -428,7 +429,7 @@ TEST(Einsum, ExplicitEinsumAsBatchedDiagonalOp_1) { test.AddAttribute("equation", "...iij->...j"); test.AddInput("x", {2, 2, 2, 2}, {1.f, 2.f, 3.f, 4.f, 1.f, 2.f, 3.f, 4.f, 1.f, 2.f, 3.f, 4.f, 1.f, 2.f, 3.f, 4.f}); test.AddOutput("o", {2, 2}, {4.f, 6.f, 4.f, 6.f}); - test.Run(); + 
test.Run(OpTester::ExpectResult::kExpectSuccess, "", ExcludeTrtOnA100()); } // Implicit (Implicit diagonal ops will sum up diagonal values) @@ -442,7 +443,7 @@ TEST(Einsum, ImplicitEinsumAsDiagonalOp) { test.AddAttribute("equation", "ii"); test.AddInput("x", {2, 2}, {1.f, 2.f, 3.f, 4.f}); test.AddOutput("o", {}, {5.f}); - test.Run(); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", ExcludeTrtOnA100()); } TEST(Einsum, ImplicitEinsumAsDiagonalOp_1) { @@ -455,7 +456,7 @@ TEST(Einsum, ImplicitEinsumAsDiagonalOp_1) { test.AddAttribute("equation", "iii"); test.AddInput("x", {2, 2, 2}, {1.f, 2.f, 3.f, 4.f, 1.f, 2.f, 3.f, 4.f}); test.AddOutput("o", {}, {5.f}); - test.Run(); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", ExcludeTrtOnA100()); } TEST(Einsum, ImplicitEinsumAsDiagonalOpWithAxisReduced) { @@ -463,7 +464,7 @@ TEST(Einsum, ImplicitEinsumAsDiagonalOpWithAxisReduced) { test.AddAttribute("equation", "iji"); test.AddInput("x", {2, 2, 2}, {1.f, 2.f, 3.f, 4.f, 1.f, 2.f, 3.f, 4.f}); test.AddOutput("o", {2}, {3.f, 7.f}); - test.Run(); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", ExcludeTrtOnA100()); } TEST(Einsum, ImplicitEinsumAsBatchedDiagonalOp) { @@ -471,7 +472,7 @@ TEST(Einsum, ImplicitEinsumAsBatchedDiagonalOp) { test.AddAttribute("equation", "...ii"); test.AddInput("x", {2, 1, 2, 2}, {1.f, 2.f, 3.f, 4.f, 1.f, 2.f, 3.f, 4.f}); test.AddOutput("o", {2, 1}, {5.f, 5.f}); - test.Run(); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", ExcludeTrtOnA100()); } TEST(Einsum, ImplicitEinsumAsBatchedDiagonalOp_1) { @@ -479,7 +480,7 @@ TEST(Einsum, ImplicitEinsumAsBatchedDiagonalOp_1) { test.AddAttribute("equation", "...iij"); test.AddInput("x", {2, 2, 2, 2}, {1.f, 2.f, 3.f, 4.f, 1.f, 2.f, 3.f, 4.f, 1.f, 2.f, 3.f, 4.f, 1.f, 2.f, 3.f, 4.f}); test.AddOutput("o", {2, 2}, {4.f, 6.f, 4.f, 6.f}); - test.Run(); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", ExcludeTrtOnA100()); } // Theme: Scalar inputs and outputs @@ -491,7 +492,7 @@ TEST(Einsum, ExplicitEinsumAsElementwiseMulOpWithOneScalar) { test.AddInput("x", {}, {10.f}); test.AddInput("y", {2, 2}, {1.f, 2.f, 3.f, 4.f}); test.AddOutput("o", {2, 2}, {10.f, 20.f, 30.f, 40.f}); - test.Run(); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", ExcludeTrtOnA100()); } TEST(Einsum, ExplicitEinsumAsElementwiseMulOpWithTwoScalars_Multi_Input) { @@ -501,7 +502,7 @@ TEST(Einsum, ExplicitEinsumAsElementwiseMulOpWithTwoScalars_Multi_Input) { test.AddInput("y", {2, 2}, {1.f, 2.f, 3.f, 4.f}); test.AddInput("z", {}, {10.f}); test.AddOutput("o", {2, 2}, {100.f, 200.f, 300.f, 400.f}); - test.Run(); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", ExcludeTrtOnA100()); } TEST(Einsum, ExplicitEinsumAsElementwiseMulOpWithAllScalars) { OpTester test("Einsum", 12, onnxruntime::kOnnxDomain); @@ -527,7 +528,7 @@ TEST(Einsum, ImplicitEinsumAsElementwiseMulOpWithOneScalar) { test.AddInput("x", {}, {10.f}); test.AddInput("y", {2, 2}, {1.f, 2.f, 3.f, 4.f}); test.AddOutput("o", {2, 2}, {10.f, 20.f, 30.f, 40.f}); - test.Run(); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", ExcludeTrtOnA100()); } TEST(Einsum, ImplicitEinsumAsElementwiseMulOpWithThreeScalars_Multi_Input) { @@ -538,7 +539,7 @@ TEST(Einsum, ImplicitEinsumAsElementwiseMulOpWithThreeScalars_Multi_Input) { test.AddInput("c", {}, {10.f}); test.AddInput("d", {}, {10.f}); test.AddOutput("o", {2, 2}, {1000.f, 2000.f, 3000.f, 4000.f}); - test.Run(); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", ExcludeTrtOnA100()); } TEST(Einsum, 
ImplicitEinsumAsElementwiseMulOpWithAllScalars) { OpTester test("Einsum", 12, onnxruntime::kOnnxDomain); @@ -568,7 +569,7 @@ TEST(Einsum, ExplicitEinsumAsTensorContractionReshapeFinal) { test.AddInput("y", {2, 2}, {1.f, 2.f, -6.f, 2.f}); test.AddInput("z", {2, 2}, {3.f, 4.f, 5.f, 6.f}); test.AddOutput("o", {2, 2, 2}, {63.f, -132.f, 63.f, -132.f, 63.f, -132.f, 63.f, -132.f}); - test.Run(); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", ExcludeTrtOnA100()); } TEST(Einsum, ExplicitEinsumAsTensorContractionReshapeLeft) { @@ -720,7 +721,7 @@ TEST(Einsum, ExplicitEinsumAsDiagonalOp_Half) { ConvertFloatToMLFloat16(output_f.data(), output.data(), 2); test.AddInput("x", {2, 2}, input_x); test.AddOutput("o", {2}, output); - test.Run(); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", ExcludeTrtOnA100()); } TEST(Einsum, ExplicitEinsumAsElementwiseMulOpWithOneScalar_Half) { @@ -741,7 +742,7 @@ TEST(Einsum, ExplicitEinsumAsElementwiseMulOpWithOneScalar_Half) { test.AddInput("x", {}, input_x); test.AddInput("y", {2, 2}, input_y); test.AddOutput("o", {2, 2}, output); - test.Run(); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", ExcludeTrtOnA100()); } TEST(Einsum, ExplicitEinsumAsTensorContraction_Half) { @@ -2093,7 +2094,7 @@ TEST_P(EinsumTransposeMatMulThreeInputsTest, EinsumTransposeMatMulThreeInputsTes std::vector v1(tst.shape.begin(), tst.shape.end()); std::vector v2(tst.expected.begin(), tst.expected.end()); test.AddOutput("o", v1, v2); - test.Run(); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", ExcludeTrtOnA100()); } INSTANTIATE_TEST_SUITE_P(EinsumTransposeMatMulThreeInputsTests, EinsumTransposeMatMulThreeInputsTest, testing::ValuesIn(case1)); diff --git a/onnxruntime/test/providers/cpu/math/element_wise_ops_test.cc b/onnxruntime/test/providers/cpu/math/element_wise_ops_test.cc index d35e5c78cfd69..c02486a2ec26f 100644 --- a/onnxruntime/test/providers/cpu/math/element_wise_ops_test.cc +++ b/onnxruntime/test/providers/cpu/math/element_wise_ops_test.cc @@ -6,8 +6,10 @@ #include "test/util/include/default_providers.h" #include "test/common/dnnl_op_test_utils.h" #include "test/common/cuda_op_test_utils.h" +#include "test/common/trt_op_test_utils.h" #include "core/util/math.h" #include +#include #include namespace onnxruntime { @@ -1370,7 +1372,8 @@ static void TestSumMultipleInputsNoBroadcasting(size_t num_inputs, const TensorS test.AddOutput("sum", dims, expected_output_data); - test.Run(); + // TRT EP segmentation fault in A100 + test.Run(OpTester::ExpectResult::kExpectSuccess, "", ExcludeTrtOnA100()); } TEST(MathOpTest, SumMultipleInputsNoBroadcasting) { @@ -1506,6 +1509,34 @@ TEST(MathOpTest, Min_12_Float_2_Input) { test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider, kOpenVINOExecutionProvider}); // TensorRT: Input batch size is inconsistent } +TEST(MathOpTest, Min_12_Float_Nan) { + OpTester test("Min", 12); + test.AddInput("data_2", {3, 3}, + {std::numeric_limits::quiet_NaN(), + std::numeric_limits::quiet_NaN(), + std::numeric_limits::quiet_NaN(), + -0.5f, 0.0f, -2.0f, + 0.5f, 0.0f, 2.0f}); + test.AddInput("data_1", {3, 1}, + {0.0f, -1.0f, 1.0f}); + test.AddOutput("min", {3, 3}, + {std::numeric_limits::quiet_NaN(), + std::numeric_limits::quiet_NaN(), + std::numeric_limits::quiet_NaN(), + -1.0f, -1.0f, -2.0f, + 0.5f, 0.0f, 1.0f}); + if (nullptr != DefaultCpuExecutionProvider().get()) { + std::vector> execution_providers; + execution_providers.push_back(DefaultCpuExecutionProvider()); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", 
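The new Min/Max *_Nan tests in this and the next hunks pin down NaN propagation: whenever an operand element is NaN, the corresponding output element is NaN, which is what the expected tensors above encode. A one-element sketch of that semantic (std::fmin alone would drop the NaN rather than propagate it):

```cpp
// Sketch: NaN-propagating minimum, matching the expected outputs of Min_12_*_Nan.
#include <algorithm>
#include <cmath>
#include <limits>

float PropagatingMin(float a, float b) {
  return (std::isnan(a) || std::isnan(b)) ? std::numeric_limits<float>::quiet_NaN()
                                          : std::min(a, b);
}
```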
{}, nullptr, &execution_providers); + } + if (nullptr != DefaultCudaExecutionProvider().get()) { + std::vector> execution_providers; + execution_providers.push_back(DefaultCudaExecutionProvider()); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", {}, nullptr, &execution_providers); + } +} + TEST(MathOpTest, Min_12_Double) { OpTester test("Min", 12); test.AddInput("data_0", {1, 3}, @@ -1523,6 +1554,34 @@ TEST(MathOpTest, Min_12_Double) { test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider}); // TensorRT: Input batch size is inconsistent } +TEST(MathOpTest, Min_12_Double_Nan) { + OpTester test("Min", 12); + test.AddInput("data_2", {3, 3}, + {std::numeric_limits::quiet_NaN(), + std::numeric_limits::quiet_NaN(), + std::numeric_limits::quiet_NaN(), + -0.5, 0.0, -2.0, + 0.5, 0.0, 2.0}); + test.AddInput("data_1", {3, 1}, + {0.0, -1.0, 1.0}); + test.AddOutput("min", {3, 3}, + {std::numeric_limits::quiet_NaN(), + std::numeric_limits::quiet_NaN(), + std::numeric_limits::quiet_NaN(), + -1.0, -1.0, -2.0, + 0.5, 0.0, 1.0}); + if (nullptr != DefaultCpuExecutionProvider().get()) { + std::vector> execution_providers; + execution_providers.push_back(DefaultCpuExecutionProvider()); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", {}, nullptr, &execution_providers); + } + if (nullptr != DefaultCudaExecutionProvider().get()) { + std::vector> execution_providers; + execution_providers.push_back(DefaultCudaExecutionProvider()); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", {}, nullptr, &execution_providers); + } +} + TEST(MathOpTest, Min_12_Int32) { OpTester test("Min", 12); test.AddInput("data_0", {1, 3}, @@ -1629,6 +1688,7 @@ TEST(MathOpTest, Min_12_MLFLoat16_Scalar1) { MakeMLFloat16({-10.f, -10.f, -10.f})); test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider}); // TensorRT: Input batch size is inconsistent } + TEST(MathOpTest, Max_6) { OpTester test("Max", 6); std::vector dims{3, 3}; @@ -1717,6 +1777,34 @@ TEST(MathOpTest, Max_12_Float) { test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider, kOpenVINOExecutionProvider}); // TensorRT: Input batch size is inconsistent } +TEST(MathOpTest, Max_12_Float_Nan) { + OpTester test("Max", 12); + test.AddInput("data_2", {3, 3}, + {std::numeric_limits::quiet_NaN(), + std::numeric_limits::quiet_NaN(), + std::numeric_limits::quiet_NaN(), + -0.5f, 0.0f, -2.0f, + 0.5f, 0.0f, 2.0f}); + test.AddInput("data_1", {3, 1}, + {0.0f, -1.0f, 1.0f}); + test.AddOutput("max", {3, 3}, + {std::numeric_limits::quiet_NaN(), + std::numeric_limits::quiet_NaN(), + std::numeric_limits::quiet_NaN(), + -0.5f, 0.0f, -1.0f, + 1.0f, 1.0f, 2.0f}); + if (nullptr != DefaultCpuExecutionProvider().get()) { + std::vector> execution_providers; + execution_providers.push_back(DefaultCpuExecutionProvider()); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", {}, nullptr, &execution_providers); + } + if (nullptr != DefaultCudaExecutionProvider().get()) { + std::vector> execution_providers; + execution_providers.push_back(DefaultCudaExecutionProvider()); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", {}, nullptr, &execution_providers); + } +} + TEST(MathOpTest, Max_12_Double) { OpTester test("Max", 12); test.AddInput("data_0", {1, 3}, @@ -1734,6 +1822,34 @@ TEST(MathOpTest, Max_12_Double) { test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider}); // TensorRT: Input batch size is inconsistent } +TEST(MathOpTest, Max_12_Double_Nan) { + OpTester test("Max", 12); + 
test.AddInput("data_2", {3, 3}, + {std::numeric_limits::quiet_NaN(), + std::numeric_limits::quiet_NaN(), + std::numeric_limits::quiet_NaN(), + -0.5, 0.0, -2.0, + 0.5, 0.0, 2.0}); + test.AddInput("data_1", {3, 1}, + {0.0, -1.0, 1.0}); + test.AddOutput("max", {3, 3}, + {std::numeric_limits::quiet_NaN(), + std::numeric_limits::quiet_NaN(), + std::numeric_limits::quiet_NaN(), + -0.5, 0.0, -1.0, + 1.0, 1.0, 2.0}); + if (nullptr != DefaultCpuExecutionProvider().get()) { + std::vector> execution_providers; + execution_providers.push_back(DefaultCpuExecutionProvider()); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", {}, nullptr, &execution_providers); + } + if (nullptr != DefaultCudaExecutionProvider().get()) { + std::vector> execution_providers; + execution_providers.push_back(DefaultCudaExecutionProvider()); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", {}, nullptr, &execution_providers); + } +} + TEST(MathOpTest, Max_12_Int32) { OpTester test("Max", 12); test.AddInput("data_0", {1, 3}, @@ -2630,7 +2746,7 @@ TEST(MathOpTest, Mean_8) { #endif template -void TrigFloatTest(OpTester& test, std::initializer_list input) { +void TrigFloatTest(OpTester& test, std::initializer_list input, float abs_error = -1.0f) { std::vector dims{static_cast(input.size())}; std::vector output; @@ -2639,6 +2755,11 @@ void TrigFloatTest(OpTester& test, std::initializer_list input) { test.AddInput("X", dims, input); test.AddOutput("Y", dims, output); + + if (abs_error >= 0.0f) { + test.SetOutputTolerance(abs_error); + } + test.Run(); } @@ -2708,6 +2829,7 @@ TEST(MathOpTest, CosFloat16) { TrigFloat16Test<::cosf>(test, {1.1f, -1.1f, 2.2f, -2.2f}); } } + TEST(MathOpTest, Tan) { OpTester test("Tan"); TrigFloatTest<::tanf>(test, {-100.0f, -50.0f, 0.0f, 50.0f, 100.0f}); @@ -2715,7 +2837,8 @@ TEST(MathOpTest, Tan) { TEST(MathOpTest, Asin) { OpTester test("Asin"); - TrigFloatTest<::asinf>(test, {-1.0f, -0.5f, 0.0f, 0.5f, 1.0f}); + float abs_error = DefaultDmlExecutionProvider().get() != nullptr ? 
0.0001f : -1.0f; + TrigFloatTest<::asinf>(test, {-1.0f, -0.5f, 0.0f, 0.5f, 1.0f}, abs_error); } TEST(MathOpTest, Acos) { diff --git a/onnxruntime/test/providers/cpu/math/logsoftmax_test.cc b/onnxruntime/test/providers/cpu/math/logsoftmax_test.cc index 273503e7bf6af..f057e4a071bd9 100644 --- a/onnxruntime/test/providers/cpu/math/logsoftmax_test.cc +++ b/onnxruntime/test/providers/cpu/math/logsoftmax_test.cc @@ -15,7 +15,8 @@ static void RunTest(const std::vector& x_vals, int64_t axis = 1, bool is_tensorrt_supported = true, OpTester::ExpectResult expect_result = OpTester::ExpectResult::kExpectSuccess, - const std::string& error_msg = "") { + const std::string& error_msg = "", + float tolerance = 0.0f) { OpTester tester("LogSoftmax", opset); if (opset < 13) { @@ -31,6 +32,10 @@ static void RunTest(const std::vector& x_vals, tester.AddInput("X", dimensions, x_vals); tester.AddOutput("Y", dimensions, expected_vals); + if (tolerance != 0.0f) { + tester.SetOutputAbsErr("Y", tolerance); + } + std::unordered_set excluded_providers; if (!is_tensorrt_supported) { excluded_providers.insert(kTensorrtExecutionProvider); @@ -62,7 +67,7 @@ TEST(LogSoftmaxOperator, LargeNumber) { -3.4401896f, -2.4401896f, -1.44018972f, -0.44018969f}; std::vector dimensions = {2, 4}; - RunTest(x_vals, expected_vals, dimensions); + RunTest(x_vals, expected_vals, dimensions, 7, 1, true, OpTester::ExpectResult::kExpectSuccess, "", 0.0005f); } // np.random.seed(123) # Use a seed so we can replicate the input and expected values here and in python diff --git a/onnxruntime/test/providers/cpu/model_tests.cc b/onnxruntime/test/providers/cpu/model_tests.cc index af71fe5cf79ae..aa752ed7308c6 100644 --- a/onnxruntime/test/providers/cpu/model_tests.cc +++ b/onnxruntime/test/providers/cpu/model_tests.cc @@ -92,27 +92,13 @@ TEST_P(ModelTest, Run) { // when cuda or openvino is enabled, set it to a larger value for resolving random MNIST test failure if (model_path.find(ORT_TSTR("_MNIST")) > 0) { if (provider_name == "cuda" || provider_name == "openvino") { + per_sample_tolerance = 2.5e-2; relative_per_sample_tolerance = 1e-2; } } std::unique_ptr model_info = std::make_unique(model_path.c_str()); -#if defined(__linux__) - // ORT enables TF32 in GEMM for A100. TF32 will cause precsion loss and fail this test. - if (HasCudaEnvironment(800) && provider_name == "cuda") { - per_sample_tolerance = 1e-1; - if (model_path.find(ORT_TSTR("SSD")) > 0 || - model_path.find(ORT_TSTR("ssd")) > 0 || - model_path.find(ORT_TSTR("yolov3")) > 0 || - model_path.find(ORT_TSTR("mask_rcnn")) > 0 || - model_path.find(ORT_TSTR("FNS")) > 0) { - SkipTest("Skipping SSD test for big tolearance failure or other errors"); - return; - } - } -#endif - if (model_info->HasDomain(ONNX_NAMESPACE::AI_ONNX_TRAINING_DOMAIN) || model_info->HasDomain(ONNX_NAMESPACE::AI_ONNX_PREVIEW_TRAINING_DOMAIN)) { SkipTest("it has the training domain. No pipeline should need to run these tests."); @@ -192,12 +178,14 @@ TEST_P(ModelTest, Run) { ASSERT_ORT_STATUS_OK(OrtApis::CreateCUDAProviderOptions(&cuda_options)); std::unique_ptr rel_cuda_options( cuda_options, &OrtApis::ReleaseCUDAProviderOptions); - std::vector keys{"device_id"}; + std::vector keys{"device_id", "use_tf32"}; std::vector values; std::string device_id = Env::Default().GetEnvironmentVar("ONNXRUNTIME_TEST_GPU_DEVICE_ID"); values.push_back(device_id.empty() ? 
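The model_tests.cc hunk here replaces the Linux-only A100 skip list with an explicit "use_tf32"="0" CUDA provider option, so fp32 GEMMs stay in full precision and the default tolerances hold. A condensed sketch of that pattern using the public ONNX Runtime C/C++ API (error handling trimmed; device id hard-coded to 0 for illustration):

```cpp
// Sketch: append the CUDA EP with TF32 disabled via the V2 provider options.
#include <onnxruntime_cxx_api.h>

void AppendCudaWithoutTf32(Ort::SessionOptions& session_options) {
  OrtCUDAProviderOptionsV2* cuda_options = nullptr;
  Ort::ThrowOnError(Ort::GetApi().CreateCUDAProviderOptions(&cuda_options));
  const char* keys[] = {"device_id", "use_tf32"};
  const char* values[] = {"0", "0"};  // device 0, TF32 off
  Ort::ThrowOnError(Ort::GetApi().UpdateCUDAProviderOptions(cuda_options, keys, values, 2));
  session_options.AppendExecutionProvider_CUDA_V2(*cuda_options);
  Ort::GetApi().ReleaseCUDAProviderOptions(cuda_options);
}
```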
"0" : device_id.c_str()); - ASSERT_ORT_STATUS_OK(OrtApis::UpdateCUDAProviderOptions(cuda_options, keys.data(), values.data(), 1)); + values.push_back("0"); + ASSERT_ORT_STATUS_OK(OrtApis::UpdateCUDAProviderOptions(cuda_options, keys.data(), values.data(), 2)); + ortso.AppendExecutionProvider_CUDA_V2(*cuda_options); } else if (provider_name == "rocm") { OrtROCMProviderOptions ep_options; @@ -229,6 +217,14 @@ TEST_P(ModelTest, Run) { ASSERT_ORT_STATUS_OK(OrtApis::CreateCUDAProviderOptions(&cuda_options)); std::unique_ptr rel_cuda_options( cuda_options, &OrtApis::ReleaseCUDAProviderOptions); + + std::vector keys{"device_id", "use_tf32"}; + std::vector values; + std::string device_id = Env::Default().GetEnvironmentVar("ONNXRUNTIME_TEST_GPU_DEVICE_ID"); + values.push_back(device_id.empty() ? "0" : device_id.c_str()); + values.push_back("0"); + ASSERT_ORT_STATUS_OK(OrtApis::UpdateCUDAProviderOptions(cuda_options, keys.data(), values.data(), 2)); + ortso.AppendExecutionProvider_CUDA_V2(*cuda_options); } else if (provider_name == "migraphx") { OrtMIGraphXProviderOptions ep_options; diff --git a/onnxruntime/test/providers/cpu/nn/batch_norm_op_test.cc b/onnxruntime/test/providers/cpu/nn/batch_norm_op_test.cc index 3d30fc62a945d..d91a1de3faa6e 100644 --- a/onnxruntime/test/providers/cpu/nn/batch_norm_op_test.cc +++ b/onnxruntime/test/providers/cpu/nn/batch_norm_op_test.cc @@ -905,14 +905,16 @@ TEST(BatchNormTest, ForwardTrainingTestWithSavedOutputsOpset9) { test.AddInput("var", channel_dims, {1.0f, 2.0f}); test.AddOutput("Y", input_output_dims, {0.0131f, 0.5210f, 1.7244f, 0.1387f, -0.2708f, -0.1191f, 1.2089f, -0.0922f, -0.9548f, -1.5203f, 0.9077f, -0.8298f, 0.5796f, -0.4501f, -2.0921f, 1.2358f}); - test.AddOutput("running_mean", channel_dims, {-0.1754f, 0.303106f}); test.AddOutput("running_var", channel_dims, {0.696052f, 1.41316f}); + // mean and variance of X across channel dimension // With Opset9 we output saved_inv_std instead of saved_var to match CUDA EP test.AddOutput("saved_mean", channel_dims, {-0.306f, 0.114562f}); test.AddOutput("saved_inv_std", channel_dims, {1.2288f, 0.861317f}); + test.SetOutputTolerance(0.0001f); + // exclude CUDA Execution Provider due to flakiness // exclude TRT and OpenVINO for same reasons as seen in TestBatchNorm() test.Run(OpTester::ExpectResult::kExpectSuccess, "", @@ -938,10 +940,11 @@ TEST(BatchNormTest, ForwardTrainingTestOpset14) { test.AddInput("var", channel_dims, {1.0f, 2.0f}); test.AddOutput("Y", input_output_dims, {0.0131f, 0.5210f, 1.7244f, 0.1387f, -0.2708f, -0.1191f, 1.2089f, -0.0922f, -0.9548f, -1.5203f, 0.9077f, -0.8298f, 0.5796f, -0.4501f, -2.0921f, 1.2358f}); - test.AddOutput("running_mean", channel_dims, {-0.1754f, 0.303106f}); test.AddOutput("running_var", channel_dims, {0.696052f, 1.41316f}); + test.SetOutputTolerance(0.0001f); + // exclude CUDA Execution Provider due to flakiness // exclude TRT and OpenVINO for same reasons as seen in TestBatchNorm() test.Run(OpTester::ExpectResult::kExpectSuccess, "", @@ -970,6 +973,8 @@ TEST(BatchNormTest, ForwardTrainingTestOpset15) { test.AddOutput("running_mean", channel_dims, {-0.1754f, 0.303106f}); test.AddOutput("running_var", channel_dims, {0.696052f, 1.41316f}); + test.SetOutputTolerance(0.0001f); + // Same exclusions as the opset 14 test test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kCudaExecutionProvider, kCudaNHWCExecutionProvider, kRocmExecutionProvider, diff --git a/onnxruntime/test/providers/cpu/nn/pool_op_test.cc b/onnxruntime/test/providers/cpu/nn/pool_op_test.cc index 
e24cda17166ed..c8cf183291518 100644 --- a/onnxruntime/test/providers/cpu/nn/pool_op_test.cc +++ b/onnxruntime/test/providers/cpu/nn/pool_op_test.cc @@ -58,7 +58,7 @@ TEST(PoolTest, MaxPool) { test.AddInput("X", x_dims, x_vals); test.AddOutput("Y", expected_dims, expected_vals); // TensorRT: result differs - test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kCudaNHWCExecutionProvider, kTensorrtExecutionProvider}); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider}); } // Only CUDA kernel has float 16 support @@ -117,7 +117,7 @@ TEST(PoolTest, MaxPool_F16) { test.AddInput("X", x_dims, f_X); test.AddOutput("Y", expected_dims, f_Y); // TensorRT: Assertion `!attrs.count("pads")' failed - test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kCudaNHWCExecutionProvider, kTensorrtExecutionProvider}); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider}); } #endif @@ -170,7 +170,7 @@ static void MaxPool_8_WithIndexTest(bool has_index, int64_t storage_order = 0) { : test.AddOutput("Indices", expected_dims, expected_indices_col); } test.Run(OpTester::ExpectResult::kExpectSuccess, "", - {kDnnlExecutionProvider, kCudaNHWCExecutionProvider, kTensorrtExecutionProvider, + {kDnnlExecutionProvider, kTensorrtExecutionProvider, kAclExecutionProvider, kArmNNExecutionProvider, kOpenVINOExecutionProvider}); } @@ -185,7 +185,7 @@ TEST(PoolTest, MaxPool_8_With_Index) { MaxPool_8_WithIndexTest(true, 1 /*storage_order*/); // col major } -TEST(PoolTest, MaxPool1D) { +TEST(PoolTest, MaxPool1D_case1) { OpTester test("MaxPool"); test.AddAttribute("auto_pad", ""); @@ -200,7 +200,45 @@ TEST(PoolTest, MaxPool1D) { test.AddInput("X", x_dims, x_vals); test.AddOutput("Y", expected_dims, expected_vals); - test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kCudaNHWCExecutionProvider, kTensorrtExecutionProvider}); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider}); +} + +TEST(PoolTest, MaxPool1D_case2) { + OpTester test("MaxPool"); + // no padding + test.AddAttribute("auto_pad", "VALID"); + test.AddAttribute("strides", std::vector{1}); + test.AddAttribute("pads", vector{0, 0}); + test.AddAttribute("kernel_shape", vector{2}); + + std::vector x_vals = {1, 2, 3, 4, 5}; + std::vector x_dims = {1, 1, 5}; + // The last dim is (5-2+1)/1 = 4 + std::vector expected_dims = {1, 1, 4}; + std::vector expected_vals = {2, 3, 4, 5}; + + test.AddInput("X", x_dims, x_vals); + test.AddOutput("Y", expected_dims, expected_vals); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider}); +} + +TEST(PoolTest, MaxPool1D_case3) { + OpTester test("MaxPool"); + test.AddAttribute("auto_pad", ""); + test.AddAttribute("strides", std::vector{1}); + // Pad one element + test.AddAttribute("pads", vector{0, 1}); + test.AddAttribute("kernel_shape", vector{2}); + + std::vector x_vals = {1, 2, 3, 4, 5}; + std::vector x_dims = {1, 1, 5}; + // Since we padded it, the last dim is larger compared to the case above + std::vector expected_dims = {1, 1, 5}; + std::vector expected_vals = {2, 3, 4, 5, 5}; + + test.AddInput("X", x_dims, x_vals); + test.AddOutput("Y", expected_dims, expected_vals); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider}); } static void MaxPool1D_8_WithIndexTest(int64_t storage_order) { @@ -222,7 +260,7 @@ static void MaxPool1D_8_WithIndexTest(int64_t storage_order) { test.AddOutput("Y", expected_dims, expected_vals); test.AddOutput("Indices", expected_dims, expected_indices); 
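The two new MaxPool1D cases above differ only in padding, and the expected output shapes in their comments follow the standard pooling length formula. A small sketch of that arithmetic for reference (floor division, dilation of 1):

```cpp
// Sketch: pooled length = floor((in + pad_begin + pad_end - kernel) / stride) + 1.
// VALID pooling of 5 with kernel 2, stride 1 gives 4; padding one element gives 5.
#include <cstdint>

int64_t PooledLength(int64_t in, int64_t kernel, int64_t stride,
                     int64_t pad_begin, int64_t pad_end) {
  return (in + pad_begin + pad_end - kernel) / stride + 1;
}
```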
test.Run(OpTester::ExpectResult::kExpectSuccess, "", - {kCudaNHWCExecutionProvider, kTensorrtExecutionProvider, kAclExecutionProvider}); + {kTensorrtExecutionProvider, kAclExecutionProvider}); } TEST(PoolTest, MaxPool1D_8_With_Index) { @@ -249,7 +287,7 @@ static void MaxPool1D_12_WithIndexTest_int8(int64_t storage_order) { test.AddOutput("Y", expected_dims, expected_vals); test.AddOutput("Indices", expected_dims, expected_indices); test.Run(OpTester::ExpectResult::kExpectSuccess, "", - {kCudaNHWCExecutionProvider, kTensorrtExecutionProvider, kAclExecutionProvider}); + {kTensorrtExecutionProvider, kAclExecutionProvider}); } static void MaxPool1D_12_WithIndexTest_uint8(int64_t storage_order) { @@ -271,7 +309,7 @@ static void MaxPool1D_12_WithIndexTest_uint8(int64_t storage_order) { test.AddOutput("Y", expected_dims, expected_vals); test.AddOutput("Indices", expected_dims, expected_indices); test.Run(OpTester::ExpectResult::kExpectSuccess, "", - {kCudaNHWCExecutionProvider, kTensorrtExecutionProvider, kAclExecutionProvider}); + {kTensorrtExecutionProvider, kAclExecutionProvider}); } TEST(PoolTest, MaxPool1D_12_With_Index_8bits) { @@ -309,9 +347,9 @@ TEST(PoolTest, MaxPool2D_uint8) { test.AddOutput("Output", output_shape, output); #if defined(OPENVINO_CONFIG_GPU_FP32) || defined(OPENVINO_CONFIG_GPU_FP16) - test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kCudaNHWCExecutionProvider, kOpenVINOExecutionProvider}); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kOpenVINOExecutionProvider}); #else - test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kCudaNHWCExecutionProvider}); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", {}); #endif } @@ -337,7 +375,7 @@ TEST(PoolTest, MaxPool_10_Dilation_1d) { test.AddInput("X", x_dims, x_vals); test.AddOutput("Y", expected_dims, expected_vals); - test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kCudaNHWCExecutionProvider, kTensorrtExecutionProvider}); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider}); } TEST(PoolTest, MaxPool_DefaultDilations) { @@ -357,7 +395,7 @@ TEST(PoolTest, MaxPool_DefaultDilations) { test.AddInput("X", x_dims, x_vals); test.AddOutput("Y", expected_dims, expected_vals); - test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kCudaNHWCExecutionProvider, kTensorrtExecutionProvider}); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider}); } TEST(PoolTest, MaxPool_DefaultDilations_int8) { @@ -377,7 +415,7 @@ TEST(PoolTest, MaxPool_DefaultDilations_int8) { test.AddInput("X", x_dims, x_vals); test.AddOutput("Y", expected_dims, expected_vals); - test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kCudaNHWCExecutionProvider, kTensorrtExecutionProvider}); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider}); } TEST(PoolTest, MaxPool_DefaultDilations_uint8) { @@ -397,7 +435,7 @@ TEST(PoolTest, MaxPool_DefaultDilations_uint8) { test.AddInput("X", x_dims, x_vals); test.AddOutput("Y", expected_dims, expected_vals); - test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kCudaNHWCExecutionProvider, kTensorrtExecutionProvider}); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider}); } TEST(PoolTest, MaxPool_10_DilationPadding_1d) { @@ -451,7 +489,7 @@ TEST(PoolTest, MaxPool_10_Dilation_2d) { test.AddInput("X", x_dims, x_vals); test.AddOutput("Y", expected_dims, expected_vals); - test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kCudaNHWCExecutionProvider, kTensorrtExecutionProvider}); + 
test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider}); } TEST(PoolTest, MaxPool_10_Dilation_2d_int8) { @@ -479,7 +517,7 @@ TEST(PoolTest, MaxPool_10_Dilation_2d_int8) { test.AddInput("X", x_dims, x_vals); test.AddOutput("Y", expected_dims, expected_vals); - test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kCudaNHWCExecutionProvider, kTensorrtExecutionProvider}); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider}); } TEST(PoolTest, MaxPool_10_DilationPadding_2d) { @@ -536,7 +574,7 @@ TEST(PoolTest, MaxPool_10_Dilation_Ceil0_2d) { test.AddInput("X", x_dims, x_vals); test.AddOutput("Y", expected_dims, expected_vals); test.Run(OpTester::ExpectResult::kExpectSuccess, "", - {kCudaNHWCExecutionProvider, kTensorrtExecutionProvider, kAclExecutionProvider}); + {kTensorrtExecutionProvider, kAclExecutionProvider}); } TEST(PoolTest, MaxPool_12_Dilation_Ceil0_2d_int8) { @@ -565,7 +603,7 @@ TEST(PoolTest, MaxPool_12_Dilation_Ceil0_2d_int8) { test.AddInput("X", x_dims, x_vals); test.AddOutput("Y", expected_dims, expected_vals); test.Run(OpTester::ExpectResult::kExpectSuccess, "", - {kCudaNHWCExecutionProvider, kTensorrtExecutionProvider, kAclExecutionProvider}); + {kTensorrtExecutionProvider, kAclExecutionProvider}); } TEST(PoolTest, MaxPool_10_Dilation_Ceil1_2d) { @@ -595,7 +633,7 @@ TEST(PoolTest, MaxPool_10_Dilation_Ceil1_2d) { test.AddInput("X", x_dims, x_vals); test.AddOutput("Y", expected_dims, expected_vals); test.Run(OpTester::ExpectResult::kExpectSuccess, "", - {kCudaNHWCExecutionProvider, kTensorrtExecutionProvider, kAclExecutionProvider}); + {kTensorrtExecutionProvider, kAclExecutionProvider}); } TEST(PoolTest, MaxPool_10_DilationPadding_3d) { @@ -707,7 +745,7 @@ TEST(PoolTest, GlobalMaxPool) { test.AddInput("X", x_dims, x_vals); test.AddOutput("Y", expected_dims, expected_vals); - test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kCudaNHWCExecutionProvider}); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", {}); } TEST(PoolTest, GlobalMaxPool3D) { @@ -783,7 +821,7 @@ TEST(PoolTest, GlobalMaxPool3D) { test.AddInput("X", x_dims, x_vals); test.AddOutput("Y", expected_dims, expected_vals); - test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kCudaNHWCExecutionProvider, kTensorrtExecutionProvider}); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider}); } TEST(PoolTest, AveragePool) { @@ -864,7 +902,7 @@ TEST(PoolTest, AveragePool) { test.AddInput("X", x_dims, x_vals); test.AddOutput("Y", expected_dims, expected_vals); - test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kCudaNHWCExecutionProvider, kTensorrtExecutionProvider}); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider}); } TEST(PoolTest, AveragePool_IncludePadPixel) { @@ -888,7 +926,8 @@ TEST(PoolTest, AveragePool_IncludePadPixel) { test.AddInput("X", x_dims, x_vals); test.AddOutput("Y", expected_dims, expected_vals); - test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kCudaNHWCExecutionProvider, kTensorrtExecutionProvider}); + test.SetOutputTolerance(0.0001f); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider}); } // test 'strides' attribute not specified @@ -907,7 +946,7 @@ TEST(PoolTest, AveragePool_DefaultStrides) { test.AddInput("X", x_dims, x_vals); test.AddOutput("Y", expected_dims, expected_vals); - test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kCudaNHWCExecutionProvider, kTensorrtExecutionProvider}); + 
test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider}); } TEST(PoolTest, AveragePool_10_ceil1_2d) { @@ -931,7 +970,7 @@ TEST(PoolTest, AveragePool_10_ceil1_2d) { test.AddInput("X", x_dims, x_vals); test.AddOutput("Y", expected_dims, expected_vals); test.Run(OpTester::ExpectResult::kExpectSuccess, "", - {kCudaNHWCExecutionProvider, kTensorrtExecutionProvider, kAclExecutionProvider}); + {kTensorrtExecutionProvider, kAclExecutionProvider}); } TEST(PoolTest, AveragePool_19_dilation_2d) { @@ -955,7 +994,9 @@ TEST(PoolTest, AveragePool_19_dilation_2d) { test.AddInput("X", x_dims, x_vals); test.AddOutput("Y", expected_dims, expected_vals); - test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kCudaNHWCExecutionProvider, kTensorrtExecutionProvider, kAclExecutionProvider, kOpenVINOExecutionProvider}); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", + {kCudaExecutionProvider, kCudaNHWCExecutionProvider, + kTensorrtExecutionProvider, kAclExecutionProvider, kOpenVINOExecutionProvider}); } TEST(PoolTest, GlobalAveragePool) { @@ -1031,7 +1072,7 @@ TEST(PoolTest, GlobalAveragePool) { test.AddInput("X", x_dims, x_vals); test.AddOutput("Y", expected_dims, expected_vals); - test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kCudaNHWCExecutionProvider}); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", {}); } TEST(PoolTest, GlobalAveragePool_Large_128) { @@ -1044,7 +1085,7 @@ TEST(PoolTest, GlobalAveragePool_Large_128) { test.AddInput("X", x_dims, x_vals); test.AddOutput("Y", expected_dims, expected_vals, /*sort_output=*/false, /*rel_error=*/1e-3f, /*abs_error=*/1e-2f); - test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kCudaNHWCExecutionProvider}); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", {}); } TEST(PoolTest, GlobalAveragePool_Large_256) { @@ -1057,7 +1098,7 @@ TEST(PoolTest, GlobalAveragePool_Large_256) { test.AddInput("X", x_dims, x_vals); test.AddOutput("Y", expected_dims, expected_vals, /*sort_output=*/false, /*rel_error=*/1e-3f, /*abs_error=*/1e-2f); - test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kCudaNHWCExecutionProvider}); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", {}); } TEST(PoolTest, LpPool) { @@ -1364,7 +1405,7 @@ TEST(PoolTest, LpPool) { test.AddInput("X", x_dims, x_vals); test.AddOutput("Y", expected_dims, expected_vals); - test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kCudaNHWCExecutionProvider}); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kCudaExecutionProvider, kCudaNHWCExecutionProvider}); } // test data generated with lp_pool_test_generator.py @@ -1396,7 +1437,8 @@ TEST(PoolTest, LpPool1d) { // https://docs.nvidia.com/deeplearning/tensorrt/api/c_api/classnvinfer1_1_1_i_network_definition.html#a94f434942252e6d98ac17705c06ce060 // TensorRT does not support 1d pooling - test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kCudaNHWCExecutionProvider, kTensorrtExecutionProvider}); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", + {kCudaExecutionProvider, kCudaNHWCExecutionProvider, kTensorrtExecutionProvider}); y_count++; } } @@ -1428,7 +1470,7 @@ TEST(PoolTest, LpPool2d) { test.AddAttribute("kernel_shape", kernel_sizes[kernel_size_count]); test.AddOutput("Y", y_sizes[y_count], ys[y_count]); - test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kCudaNHWCExecutionProvider}); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kCudaExecutionProvider, kCudaNHWCExecutionProvider}); y_count++; } } @@ -1446,7 +1488,8 @@ TEST(PoolTest, LpPoolCeilMode) { // 
https://docs.nvidia.com/deeplearning/tensorrt/api/c_api/classnvinfer1_1_1_i_network_definition.html#a94f434942252e6d98ac17705c06ce060 // TensorRT does not support 1d pooling - test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kCudaNHWCExecutionProvider, kTensorrtExecutionProvider}); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", + {kCudaExecutionProvider, kCudaNHWCExecutionProvider, kTensorrtExecutionProvider}); } TEST(PoolTest, GlobalLpPool) { @@ -1701,7 +1744,7 @@ TEST(PoolTest, GlobalLpPool) { test.AddInput("X", x_dims, x_vals); test.AddOutput("Y", expected_dims, expected_vals); - test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kCudaNHWCExecutionProvider}); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kCudaExecutionProvider, kCudaNHWCExecutionProvider}); } TEST(PoolTest, MaxPoolDimWithZeroForN) { @@ -1719,7 +1762,7 @@ TEST(PoolTest, MaxPoolDimWithZeroForN) { test.AddInput("X", x_dims, x_vals); test.AddOutput("Y", expected_dims, expected_vals); test.Run(OpTester::ExpectResult::kExpectSuccess, "", - {kCudaNHWCExecutionProvider, kTensorrtExecutionProvider, kQnnExecutionProvider}); + {kTensorrtExecutionProvider, kQnnExecutionProvider}); } } // namespace test diff --git a/onnxruntime/test/providers/cpu/object_detection/roialign_test.cc b/onnxruntime/test/providers/cpu/object_detection/roialign_test.cc index 2f97f6e71e92b..58a616717316e 100644 --- a/onnxruntime/test/providers/cpu/object_detection/roialign_test.cc +++ b/onnxruntime/test/providers/cpu/object_detection/roialign_test.cc @@ -4,6 +4,7 @@ #include "gtest/gtest.h" #include "test/providers/provider_test_utils.h" #include "test/util/include/default_providers.h" +#include "test/common/trt_op_test_utils.h" namespace onnxruntime { namespace test { @@ -463,6 +464,7 @@ static void BasicTest() { 0.3661f, 0.2349f, }); + test.SetOutputTolerance(0.0001f); test.Run(); } @@ -689,6 +691,7 @@ TEST(RoiAlignTest, MaxModePositive) { });*/ test.Run(); } + TEST(RoiAlignTest, AvgModeNegativeInvalidMode) { // TODO: Unskip when fixed #41968513 if (DefaultDmlExecutionProvider().get() != nullptr) { @@ -713,7 +716,8 @@ TEST(RoiAlignTest, AvgModeNegativeInvalidMode) { test.AddInput("batch_indices", {5}, {0, 0, 0, 0, 0}); test.AddOutput("Y", {5, 3, 3, 4}, {2.95833f, 3.20833f, 3.45833f, 3.70833f, 4.625f, 4.875f, 5.125f, 5.375f, 6.29167f, 6.54167f, 6.79167f, 7.04167f, 27.9583f, 28.2083f, 28.4583f, 28.7083f, 29.625f, 29.875f, 30.125f, 30.375f, 31.2917f, 31.5417f, 31.7917f, 32.0417f, 52.9583f, 53.2083f, 53.4583f, 53.7083f, 54.625f, 54.875f, 55.125f, 55.375f, 56.2917f, 56.5417f, 56.7917f, 57.0417f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 25.f, 25.f, 25.f, 25.f, 25.f, 25.f, 25.f, 25.f, 25.f, 25.f, 25.f, 25.f, 50.f, 50.f, 50.f, 50.f, 50.f, 50.f, 50.f, 50.f, 50.f, 50.f, 50.f, 50.f, 7.39583f, 7.39583f, 7.42708f, 7.64583f, 9.0625f, 9.0625f, 9.09375f, 9.3125f, 10.7292f, 10.7292f, 10.7604f, 10.9792f, 32.3958f, 32.3958f, 32.4271f, 32.6458f, 34.0625f, 34.0625f, 34.0938f, 34.3125f, 35.7292f, 35.7292f, 35.7604f, 35.9792f, 57.3958f, 57.3958f, 57.4271f, 57.6458f, 59.0625f, 59.0625f, 59.0938f, 59.3125f, 60.7292f, 60.7292f, 60.7604f, 60.9792f, 4.27083f, 4.52083f, 4.77083f, 5.02083f, 5.9375f, 6.1875f, 6.4375f, 6.6875f, 7.60417f, 7.85417f, 8.10417f, 8.35417f, 29.2708f, 29.5208f, 29.7708f, 30.0208f, 30.9375f, 31.1875f, 31.4375f, 31.6875f, 32.6042f, 32.8542f, 33.1042f, 33.3542f, 54.2708f, 54.5208f, 54.7708f, 55.0208f, 55.9375f, 56.1875f, 56.4375f, 56.6875f, 57.6042f, 57.8542f, 58.1042f, 58.3542f, 6.77083f, 6.77083f, 6.77083f, 6.80208f, 
8.4375f, 8.4375f, 8.4375f, 8.46875f, 10.1042f, 10.1042f, 10.1042f, 10.1354f, 31.7708f, 31.7708f, 31.7708f, 31.8021f, 33.4375f, 33.4375f, 33.4375f, 33.4688f, 35.1042f, 35.1042f, 35.1042f, 35.1354f, 56.7708f, 56.7708f, 56.7708f, 56.8021f, 58.4375f, 58.4375f, 58.4375f, 58.4688f, 60.1042f, 60.1042f, 60.1042f, 60.1354f}); - test.Run(OpTester::ExpectResult::kExpectFailure, "Invalid mode"); + // TRT EP segmentation fault in A100 + test.Run(OpTester::ExpectResult::kExpectFailure, "Invalid mode", ExcludeTrtOnA100()); } TEST(RoiAlignTest, AvgModeNegativeSamplingRatio) { @@ -738,7 +742,8 @@ TEST(RoiAlignTest, AvgModeNegativeSamplingRatio) { test.AddInput("batch_indices", {5}, {0, 0, 0, 0, 0}); test.AddOutput("Y", {5, 3, 3, 4}, {2.95833f, 3.20833f, 3.45833f, 3.70833f, 4.625f, 4.875f, 5.125f, 5.375f, 6.29167f, 6.54167f, 6.79167f, 7.04167f, 27.9583f, 28.2083f, 28.4583f, 28.7083f, 29.625f, 29.875f, 30.125f, 30.375f, 31.2917f, 31.5417f, 31.7917f, 32.0417f, 52.9583f, 53.2083f, 53.4583f, 53.7083f, 54.625f, 54.875f, 55.125f, 55.375f, 56.2917f, 56.5417f, 56.7917f, 57.0417f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 25.f, 25.f, 25.f, 25.f, 25.f, 25.f, 25.f, 25.f, 25.f, 25.f, 25.f, 25.f, 50.f, 50.f, 50.f, 50.f, 50.f, 50.f, 50.f, 50.f, 50.f, 50.f, 50.f, 50.f, 7.39583f, 7.39583f, 7.42708f, 7.64583f, 9.0625f, 9.0625f, 9.09375f, 9.3125f, 10.7292f, 10.7292f, 10.7604f, 10.9792f, 32.3958f, 32.3958f, 32.4271f, 32.6458f, 34.0625f, 34.0625f, 34.0938f, 34.3125f, 35.7292f, 35.7292f, 35.7604f, 35.9792f, 57.3958f, 57.3958f, 57.4271f, 57.6458f, 59.0625f, 59.0625f, 59.0938f, 59.3125f, 60.7292f, 60.7292f, 60.7604f, 60.9792f, 4.27083f, 4.52083f, 4.77083f, 5.02083f, 5.9375f, 6.1875f, 6.4375f, 6.6875f, 7.60417f, 7.85417f, 8.10417f, 8.35417f, 29.2708f, 29.5208f, 29.7708f, 30.0208f, 30.9375f, 31.1875f, 31.4375f, 31.6875f, 32.6042f, 32.8542f, 33.1042f, 33.3542f, 54.2708f, 54.5208f, 54.7708f, 55.0208f, 55.9375f, 56.1875f, 56.4375f, 56.6875f, 57.6042f, 57.8542f, 58.1042f, 58.3542f, 6.77083f, 6.77083f, 6.77083f, 6.80208f, 8.4375f, 8.4375f, 8.4375f, 8.46875f, 10.1042f, 10.1042f, 10.1042f, 10.1354f, 31.7708f, 31.7708f, 31.7708f, 31.8021f, 33.4375f, 33.4375f, 33.4375f, 33.4688f, 35.1042f, 35.1042f, 35.1042f, 35.1354f, 56.7708f, 56.7708f, 56.7708f, 56.8021f, 58.4375f, 58.4375f, 58.4375f, 58.4688f, 60.1042f, 60.1042f, 60.1042f, 60.1354f}); - test.Run(OpTester::ExpectResult::kExpectFailure, "Sampling ratio should be >=0"); + // TRT EP segmentation fault in A100 + test.Run(OpTester::ExpectResult::kExpectFailure, "Sampling ratio should be >=0", ExcludeTrtOnA100()); } TEST(RoiAlignTest, AvgModeNegativeInvalidNumRoiDims) { diff --git a/onnxruntime/test/providers/cpu/rnn/deep_cpu_lstm_op_test.cc b/onnxruntime/test/providers/cpu/rnn/deep_cpu_lstm_op_test.cc index 7e81fc80ddf85..e73a1b492cc05 100644 --- a/onnxruntime/test/providers/cpu/rnn/deep_cpu_lstm_op_test.cc +++ b/onnxruntime/test/providers/cpu/rnn/deep_cpu_lstm_op_test.cc @@ -143,6 +143,8 @@ static void RunLstmTest(const std::vector& X_data, test.AddOptionalOutputEdge(); } + test.SetOutputTolerance(0.0001f); + // TensorRT failed on LSTM tests test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider}); } diff --git a/onnxruntime/test/providers/cpu/sequence/sequence_ops_test.cc b/onnxruntime/test/providers/cpu/sequence/sequence_ops_test.cc index 60e75811e4333..c2d64b8e5ee4a 100644 --- a/onnxruntime/test/providers/cpu/sequence/sequence_ops_test.cc +++ b/onnxruntime/test/providers/cpu/sequence/sequence_ops_test.cc @@ -442,6 +442,19 @@ 
TEST(SequenceOpsTest, SplitToSequence_PositiveAxisScalarSplit) { test.Run(); } +TEST(SequenceOpsTest, SplitToSequence_StringSplit) { + OpTester test("SplitToSequence", 11); + test.AddInput("input", {3}, std::vector({"Test string", "Another string", "A third and much longer string"})); + int64_t axis = 0; + test.AddAttribute("axis", axis); + SeqTensors output; + output.AddTensor({1}, {"Test string"}); + output.AddTensor({1}, {"Another string"}); + output.AddTensor({1}, {"A third and much longer string"}); + test.AddSeqOutput("S2", output); + test.Run(); +} + TEST(SequenceOpsTest, SplitToSequence_DefaultAxis0UnevenSplitFloat) { OpTester test("SplitToSequence", 11); test.AddInput("input", {5, 2}, GetConsecutiveVector(1.f, 10)); diff --git a/onnxruntime/test/providers/cpu/tensor/affine_grid_test.cc b/onnxruntime/test/providers/cpu/tensor/affine_grid_test.cc index e37e784f28930..1ffe6c73d4fa4 100644 --- a/onnxruntime/test/providers/cpu/tensor/affine_grid_test.cc +++ b/onnxruntime/test/providers/cpu/tensor/affine_grid_test.cc @@ -13,6 +13,7 @@ TEST(AffineGridTest, 2d) { test.AddInput("size", {4}, {1, 1, 2, 3}); test.AddOutput("grid", {1, 2, 3, 2}, {-0.6667f, -0.5000f, 0.0000f, -0.5000f, 0.6667f, -0.5000f, -0.6667f, 0.5000f, 0.0000f, 0.5000f, 0.6667f, 0.5000f}); + test.SetOutputTolerance(0.0001f); test.Run(); } @@ -24,6 +25,7 @@ TEST(AffineGridTest, test_2d_0) { test.AddInput("theta", {1, 2, 3}, {1.477212f, -0.173648f, 0.300000f, 0.173648f, 0.492404f, -0.500000f}); test.AddInput("size", {4}, {1, 1, 3, 2}); test.AddOutput("grid", {1, 3, 2, 2}, {-0.3228f, -0.9151f, 1.1544f, -0.7414f, -0.4386f, -0.5868f, 1.0386f, -0.4132f, -0.5544f, -0.2586f, 0.9228f, -0.0849f}); + test.SetOutputTolerance(0.0001f); test.Run(); } @@ -33,6 +35,7 @@ TEST(AffineGridTest, test_2d_1) { test.AddInput("theta", {2, 2, 3}, {1.477212f, -0.173648f, 0.300000f, 0.173648f, 0.492404f, -0.500000f, 1.477212f, -0.173648f, 0.300000f, 0.173648f, 0.492404f, -0.500000f}); test.AddInput("size", {4}, {2, 10, 2, 3}); test.AddOutput("grid", {2, 2, 3, 2}, {-0.5980f, -0.8620f, 0.3868f, -0.7462f, 1.3716f, -0.6304f, -0.7716f, -0.3696f, 0.2132f, -0.2538f, 1.1980f, -0.1380f, -0.5980f, -0.8620f, 0.3868f, -0.7462f, 1.3716f, -0.6304f, -0.7716f, -0.3696f, 0.2132f, -0.2538f, 1.1980f, -0.1380f}); + test.SetOutputTolerance(0.0001f); test.Run(); } @@ -42,6 +45,7 @@ TEST(AffineGridTest, test_2d_2) { test.AddInput("theta", {1, 2, 3}, {1.500000f, -0.866025f, -0.500000f, 0.866025f, 2.750000f, -0.500000f}); test.AddInput("size", {4}, {1, 1, 3, 2}); test.AddOutput("grid", {1, 3, 2, 2}, {-0.6726f, -2.7663f, 0.8274f, -1.9003f, -1.2500f, -0.9330f, 0.2500f, -0.0670f, -1.8274f, 0.9003f, -0.3274f, 1.7663f}); + test.SetOutputTolerance(0.0001f); test.Run(); } @@ -51,6 +55,7 @@ TEST(AffineGridTest, test_2d_3) { test.AddInput("theta", {2, 2, 3}, {1.500000f, -0.866025f, -0.500000f, 0.866025f, 2.750000f, -0.500000f, 1.500000f, -0.866025f, -0.500000f, 0.866025f, 2.750000f, -0.500000f}); test.AddInput("size", {4}, {2, 10, 2, 3}); test.AddOutput("grid", {2, 2, 3, 2}, {-1.0670f, -2.4524f, -0.0670f, -1.8750f, 0.9330f, -1.2976f, -1.9330f, 0.2976f, -0.9330f, 0.8750f, 0.0670f, 1.4524f, -1.0670f, -2.4524f, -0.0670f, -1.8750f, 0.9330f, -1.2976f, -1.9330f, 0.2976f, -0.9330f, 0.8750f, 0.0670f, 1.4524f}); + test.SetOutputTolerance(0.0001f); test.Run(); } @@ -60,6 +65,7 @@ TEST(AffineGridTest, test_2d_4) { test.AddInput("theta", {1, 2, 3}, {1.477212f, -0.173648f, 0.300000f, 0.173648f, 0.492404f, -0.500000f}); test.AddInput("size", {4}, {1, 1, 3, 2}); test.AddOutput("grid", {1, 3, 2, 2}, 
{-1.0036f, -1.1661f, 1.9509f, -0.8188f, -1.1772f, -0.6736f, 1.7772f, -0.3264f, -1.3509f, -0.1812f, 1.6036f, 0.1661f}); + test.SetOutputTolerance(0.0001f); test.Run(); } @@ -69,6 +75,7 @@ TEST(AffineGridTest, test_2d_5) { test.AddInput("theta", {2, 2, 3}, {1.477212f, -0.173648f, 0.300000f, 0.173648f, 0.492404f, -0.500000f, 1.477212f, -0.173648f, 0.300000f, 0.173648f, 0.492404f, -0.500000f}); test.AddInput("size", {4}, {2, 10, 2, 3}); test.AddOutput("grid", {2, 2, 3, 2}, {-1.0036f, -1.1661f, 0.4736f, -0.9924f, 1.9509f, -0.8188f, -1.3509f, -0.1812f, 0.1264f, -0.0076f, 1.6036f, 0.1661f, -1.0036f, -1.1661f, 0.4736f, -0.9924f, 1.9509f, -0.8188f, -1.3509f, -0.1812f, 0.1264f, -0.0076f, 1.6036f, 0.1661f}); + test.SetOutputTolerance(0.0001f); test.Run(); } @@ -78,6 +85,7 @@ TEST(AffineGridTest, test_2d_6) { test.AddInput("theta", {1, 2, 3}, {1.500000f, -0.866025f, -0.500000f, 0.866025f, 2.750000f, -0.500000f}); test.AddInput("size", {4}, {1, 1, 3, 2}); test.AddOutput("grid", {1, 3, 2, 2}, {-1.1340f, -4.1160f, 1.8660f, -2.3840f, -2.0000f, -1.3660f, 1.0000f, 0.3660f, -2.8660f, 1.3840f, 0.1340f, 3.1160f}); + test.SetOutputTolerance(0.0001f); test.Run(); } @@ -87,6 +95,7 @@ TEST(AffineGridTest, test_2d_7) { test.AddInput("theta", {2, 2, 3}, {1.500000f, -0.866025f, -0.500000f, 0.866025f, 2.750000f, -0.500000f, 1.500000f, -0.866025f, -0.500000f, 0.866025f, 2.750000f, -0.500000f}); test.AddInput("size", {4}, {2, 10, 2, 3}); test.AddOutput("grid", {2, 2, 3, 2}, {-1.1340f, -4.1160f, 0.3660f, -3.2500f, 1.8660f, -2.3840f, -2.8660f, 1.3840f, -1.3660f, 2.2500f, 0.1340f, 3.1160f, -1.1340f, -4.1160f, 0.3660f, -3.2500f, 1.8660f, -2.3840f, -2.8660f, 1.3840f, -1.3660f, 2.2500f, 0.1340f, 3.1160f}); + test.SetOutputTolerance(0.0001f); test.Run(); } @@ -96,6 +105,7 @@ TEST(AffineGridTest, test_3d_0) { test.AddInput("theta", {1, 3, 4}, {1.409539f, 0.000000f, 0.513030f, 0.300000f, 0.118782f, 1.969615f, -0.326352f, -0.500000f, -0.168412f, 0.086824f, 0.462708f, 1.800000f}); test.AddInput("size", {5}, {1, 1, 3, 2, 2}); test.AddOutput("grid", {1, 3, 2, 2, 3}, {-0.7468f, -1.3266f, 1.5323f, 0.6627f, -1.2078f, 1.3639f, -0.7468f, 0.6430f, 1.6191f, 0.6627f, 0.7618f, 1.4507f, -0.4048f, -1.5442f, 1.8408f, 1.0048f, -1.4254f, 1.6724f, -0.4048f, 0.4254f, 1.9276f, 1.0048f, 0.5442f, 1.7592f, -0.0627f, -1.7618f, 2.1493f, 1.3468f, -1.6430f, 1.9809f, -0.0627f, 0.2078f, 2.2361f, 1.3468f, 0.3266f, 2.0677f}); + test.SetOutputTolerance(0.0001f); test.Run(); } @@ -105,6 +115,7 @@ TEST(AffineGridTest, test_3d_1) { test.AddInput("theta", {2, 3, 4}, {1.409539f, 0.000000f, 0.513030f, 0.300000f, 0.118782f, 1.969615f, -0.326352f, -0.500000f, -0.168412f, 0.086824f, 0.462708f, 1.800000f, 1.409539f, 0.000000f, 0.513030f, 0.300000f, 0.118782f, 1.969615f, -0.326352f, -0.500000f, -0.168412f, 0.086824f, 0.462708f, 1.800000f}); test.AddInput("size", {5}, {2, 10, 2, 2, 3}); test.AddOutput("grid", {2, 2, 2, 3, 3}, {-0.8962f, -1.4008f, 1.6375f, 0.0435f, -1.3216f, 1.5252f, 0.9832f, -1.2424f, 1.4130f, -0.8962f, 0.5688f, 1.7243f, 0.0435f, 0.6480f, 1.6121f, 0.9832f, 0.7272f, 1.4998f, -0.3832f, -1.7272f, 2.1002f, 0.5565f, -1.6480f, 1.9879f, 1.4962f, -1.5688f, 1.8757f, -0.3832f, 0.2424f, 2.1870f, 0.5565f, 0.3216f, 2.0748f, 1.4962f, 0.4008f, 1.9625f, -0.8962f, -1.4008f, 1.6375f, 0.0435f, -1.3216f, 1.5252f, 0.9832f, -1.2424f, 1.4130f, -0.8962f, 0.5688f, 1.7243f, 0.0435f, 0.6480f, 1.6121f, 0.9832f, 0.7272f, 1.4998f, -0.3832f, -1.7272f, 2.1002f, 0.5565f, -1.6480f, 1.9879f, 1.4962f, -1.5688f, 1.8757f, -0.3832f, 0.2424f, 2.1870f, 0.5565f, 0.3216f, 2.0748f, 1.4962f, 0.4008f, 
1.9625f}); + test.SetOutputTolerance(0.0001f); test.Run(); } @@ -114,6 +125,7 @@ TEST(AffineGridTest, test_3d_2) { test.AddInput("theta", {1, 3, 4}, {0.259808f, 0.000000f, -0.150000f, -0.500000f, -1.299038f, 1.500000f, -2.250000f, -0.500000f, 1.375000f, 4.763140f, 2.381570f, 0.300000f}); test.AddInput("size", {5}, {1, 1, 3, 2, 2}); test.AddOutput("grid", {1, 3, 2, 2, 3}, {-0.5299f, 0.8995f, -4.3568f, -0.2701f, -0.3995f, -2.9818f, -0.5299f, 2.3995f, 0.4064f, -0.2701f, 1.1005f, 1.7814f, -0.6299f, -0.6005f, -2.7691f, -0.3701f, -1.8995f, -1.3941f, -0.6299f, 0.8995f, 1.9941f, -0.3701f, -0.3995f, 3.3691f, -0.7299f, -2.1005f, -1.1814f, -0.4701f, -3.3995f, 0.1936f, -0.7299f, -0.6005f, 3.5818f, -0.4701f, -1.8995f, 4.9568f}); + test.SetOutputTolerance(0.0001f); test.Run(); } @@ -123,6 +135,7 @@ TEST(AffineGridTest, test_3d_3) { test.AddInput("theta", {2, 3, 4}, {0.259808f, 0.000000f, -0.150000f, -0.500000f, -1.299038f, 1.500000f, -2.250000f, -0.500000f, 1.375000f, 4.763140f, 2.381570f, 0.300000f, 0.259808f, 0.000000f, -0.150000f, -0.500000f, -1.299038f, 1.500000f, -2.250000f, -0.500000f, 1.375000f, 4.763140f, 2.381570f, 0.300000f}); test.AddInput("size", {5}, {2, 10, 2, 2, 3}); test.AddOutput("grid", {2, 2, 2, 3, 3}, {-0.5982f, 0.7410f, -4.1890f, -0.4250f, -0.1250f, -3.2724f, -0.2518f, -0.9910f, -2.3557f, -0.5982f, 2.2410f, 0.5741f, -0.4250f, 1.3750f, 1.4908f, -0.2518f, 0.5090f, 2.4075f, -0.7482f, -1.5090f, -1.8075f, -0.5750f, -2.3750f, -0.8908f, -0.4018f, -3.2410f, 0.0259f, -0.7482f, -0.0090f, 2.9557f, -0.5750f, -0.8750f, 3.8724f, -0.4018f, -1.7410f, 4.7890f, -0.5982f, 0.7410f, -4.1890f, -0.4250f, -0.1250f, -3.2724f, -0.2518f, -0.9910f, -2.3557f, -0.5982f, 2.2410f, 0.5741f, -0.4250f, 1.3750f, 1.4908f, -0.2518f, 0.5090f, 2.4075f, -0.7482f, -1.5090f, -1.8075f, -0.5750f, -2.3750f, -0.8908f, -0.4018f, -3.2410f, 0.0259f, -0.7482f, -0.0090f, 2.9557f, -0.5750f, -0.8750f, 3.8724f, -0.4018f, -1.7410f, 4.7890f}); + test.SetOutputTolerance(0.0001f); test.Run(); } @@ -132,6 +145,7 @@ TEST(AffineGridTest, test_3d_4) { test.AddInput("theta", {1, 3, 4}, {1.409539f, 0.000000f, 0.513030f, 0.300000f, 0.118782f, 1.969615f, -0.326352f, -0.500000f, -0.168412f, 0.086824f, 0.462708f, 1.800000f}); test.AddInput("size", {5}, {1, 1, 3, 2, 2}); test.AddOutput("grid", {1, 3, 2, 2, 3}, {-1.6226f, -2.2620f, 1.4189f, 1.1965f, -2.0245f, 1.0821f, -1.6226f, 1.6772f, 1.5925f, 1.1965f, 1.9147f, 1.2557f, -1.1095f, -2.5884f, 1.8816f, 1.7095f, -2.3508f, 1.5448f, -1.1095f, 1.3508f, 2.0552f, 1.7095f, 1.5884f, 1.7184f, -0.5965f, -2.9147f, 2.3443f, 2.2226f, -2.6772f, 2.0075f, -0.5965f, 1.0245f, 2.5179f, 2.2226f, 1.2620f, 2.1811f}); + test.SetOutputTolerance(0.0001f); test.Run(); } @@ -141,6 +155,7 @@ TEST(AffineGridTest, test_3d_5) { test.AddInput("theta", {2, 3, 4}, {1.409539f, 0.000000f, 0.513030f, 0.300000f, 0.118782f, 1.969615f, -0.326352f, -0.500000f, -0.168412f, 0.086824f, 0.462708f, 1.800000f, 1.409539f, 0.000000f, 0.513030f, 0.300000f, 0.118782f, 1.969615f, -0.326352f, -0.500000f, -0.168412f, 0.086824f, 0.462708f, 1.800000f}); test.AddInput("size", {5}, {2, 10, 2, 2, 3}); test.AddOutput("grid", {2, 2, 2, 3, 3}, {-1.6226f, -2.2620f, 1.4189f, -0.2130f, -2.1433f, 1.2505f, 1.1965f, -2.0245f, 1.0821f, -1.6226f, 1.6772f, 1.5925f, -0.2130f, 1.7960f, 1.4241f, 1.1965f, 1.9147f, 1.2557f, -0.5965f, -2.9147f, 2.3443f, 0.8130f, -2.7960f, 2.1759f, 2.2226f, -2.6772f, 2.0075f, -0.5965f, 1.0245f, 2.5179f, 0.8130f, 1.1433f, 2.3495f, 2.2226f, 1.2620f, 2.1811f, -1.6226f, -2.2620f, 1.4189f, -0.2130f, -2.1433f, 1.2505f, 1.1965f, -2.0245f, 1.0821f, 
-1.6226f, 1.6772f, 1.5925f, -0.2130f, 1.7960f, 1.4241f, 1.1965f, 1.9147f, 1.2557f, -0.5965f, -2.9147f, 2.3443f, 0.8130f, -2.7960f, 2.1759f, 2.2226f, -2.6772f, 2.0075f, -0.5965f, 1.0245f, 2.5179f, 0.8130f, 1.1433f, 2.3495f, 2.2226f, 1.2620f, 2.1811f}); + test.SetOutputTolerance(0.0001f); test.Run(); } @@ -150,6 +165,7 @@ TEST(AffineGridTest, test_3d_6) { test.AddInput("theta", {1, 3, 4}, {0.259808f, 0.000000f, -0.150000f, -0.500000f, -1.299038f, 1.500000f, -2.250000f, -0.500000f, 1.375000f, 4.763140f, 2.381570f, 0.300000f}); test.AddInput("size", {5}, {1, 1, 3, 2, 2}); test.AddOutput("grid", {1, 3, 2, 2, 3}, {-0.6098f, 1.5490f, -8.2197f, -0.0902f, -1.0490f, -5.4697f, -0.6098f, 4.5490f, 1.3066f, -0.0902f, 1.9510f, 4.0566f, -0.7598f, -0.7010f, -5.8381f, -0.2402f, -3.2990f, -3.0881f, -0.7598f, 2.2990f, 3.6881f, -0.2402f, -0.2990f, 6.4381f, -0.9098f, -2.9510f, -3.4566f, -0.3902f, -5.5490f, -0.7066f, -0.9098f, 0.0490f, 6.0697f, -0.3902f, -2.5490f, 8.8197f}); + test.SetOutputTolerance(0.0001f); test.Run(); } @@ -159,6 +175,7 @@ TEST(AffineGridTest, test_3d_7) { test.AddInput("theta", {2, 3, 4}, {0.259808f, 0.000000f, -0.150000f, -0.500000f, -1.299038f, 1.500000f, -2.250000f, -0.500000f, 1.375000f, 4.763140f, 2.381570f, 0.300000f, 0.259808f, 0.000000f, -0.150000f, -0.500000f, -1.299038f, 1.500000f, -2.250000f, -0.500000f, 1.375000f, 4.763140f, 2.381570f, 0.300000f}); test.AddInput("size", {5}, {2, 10, 2, 2, 3}); test.AddOutput("grid", {2, 2, 2, 3, 3}, {-0.6098f, 1.5490f, -8.2197f, -0.3500f, 0.2500f, -6.8447f, -0.0902f, -1.0490f, -5.4697f, -0.6098f, 4.5490f, 1.3066f, -0.3500f, 3.2500f, 2.6816f, -0.0902f, 1.9510f, 4.0566f, -0.9098f, -2.9510f, -3.4566f, -0.6500f, -4.2500f, -2.0816f, -0.3902f, -5.5490f, -0.7066f, -0.9098f, 0.0490f, 6.0697f, -0.6500f, -1.2500f, 7.4447f, -0.3902f, -2.5490f, 8.8197f, -0.6098f, 1.5490f, -8.2197f, -0.3500f, 0.2500f, -6.8447f, -0.0902f, -1.0490f, -5.4697f, -0.6098f, 4.5490f, 1.3066f, -0.3500f, 3.2500f, 2.6816f, -0.0902f, 1.9510f, 4.0566f, -0.9098f, -2.9510f, -3.4566f, -0.6500f, -4.2500f, -2.0816f, -0.3902f, -5.5490f, -0.7066f, -0.9098f, 0.0490f, 6.0697f, -0.6500f, -1.2500f, 7.4447f, -0.3902f, -2.5490f, 8.8197f}); + test.SetOutputTolerance(0.0001f); test.Run(); } } // namespace test diff --git a/onnxruntime/test/providers/cpu/tensor/mean_variance_normalization_test.cc b/onnxruntime/test/providers/cpu/tensor/mean_variance_normalization_test.cc index b6720ae2a9a7d..8dcb15cbc6926 100644 --- a/onnxruntime/test/providers/cpu/tensor/mean_variance_normalization_test.cc +++ b/onnxruntime/test/providers/cpu/tensor/mean_variance_normalization_test.cc @@ -5,6 +5,7 @@ #include "test/common/tensor_op_test_utils.h" #include "test/providers/provider_test_utils.h" +#include "test/util/include/default_providers.h" namespace onnxruntime::test { @@ -155,6 +156,10 @@ TEST(MeanVarianceNormalizationTest, AxesSubsets5D) { test.AddInput("input", shape, X.data(), X.size()); test.AddOutput("output", shape, Y.data(), Y.size()); + if (DefaultDmlExecutionProvider().get() != nullptr) { + test.SetOutputTolerance(0.001f); + } + test.Run(); }; diff --git a/onnxruntime/test/providers/cpu/tensor/onehot_op_test.cc b/onnxruntime/test/providers/cpu/tensor/onehot_op_test.cc index a2ffbdcc0bdf1..55c247e4c2fea 100644 --- a/onnxruntime/test/providers/cpu/tensor/onehot_op_test.cc +++ b/onnxruntime/test/providers/cpu/tensor/onehot_op_test.cc @@ -3,6 +3,7 @@ #include "gtest/gtest.h" #include "test/providers/provider_test_utils.h" +#include "test/common/trt_op_test_utils.h" using namespace std; @@ -36,7 +37,8 @@ 
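For context on the AffineGrid hunks above: the expected `grid` values that these tests now compare with a 1e-4 tolerance can be reproduced by sampling normalized pixel-center coordinates and applying `theta`. Below is a minimal NumPy sketch, assuming `align_corners=0` (pixel-center) coordinate semantics — the coordinate convention and the helper name are assumptions for illustration, not taken from this diff:

```python
import numpy as np

def affine_grid_2d(theta, size):
    """Reference 2D AffineGrid sketch: theta is (N, 2, 3), size is (N, C, H, W).

    Returns a grid of shape (N, H, W, 2) holding (x, y) sample locations,
    assuming align_corners=0, i.e. pixel-center normalized coordinates.
    """
    n, _, h, w = size
    # Normalized coordinates of pixel centers in [-1, 1].
    xs = (2.0 * np.arange(w) + 1.0) / w - 1.0           # shape (W,)
    ys = (2.0 * np.arange(h) + 1.0) / h - 1.0           # shape (H,)
    gx, gy = np.meshgrid(xs, ys)                        # each (H, W)
    # Homogeneous coordinates (x, y, 1) for every output pixel.
    coords = np.stack([gx, gy, np.ones_like(gx)], axis=-1)          # (H, W, 3)
    # grid[n, h, w] = theta[n] @ (x, y, 1)
    return np.einsum("nij,hwj->nhwi", theta.astype(np.float64), coords).astype(np.float32)

# An identity theta reproduces the first 2D test's expected values
# (-0.6667, -0.5), (0, -0.5), (0.6667, -0.5), ... for size (1, 1, 2, 3).
theta = np.array([[[1.0, 0.0, 0.0], [0.0, 1.0, 0.0]]], dtype=np.float32)
print(affine_grid_2d(theta, (1, 1, 2, 3)).round(4))
```

The expected literals in these tests are rounded to four decimals, which is presumably why `SetOutputTolerance(0.0001f)` is added instead of relying on the default comparison tolerance.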
TEST(OneHotOpTest, DefaultAxis_float_float_float /*indices, output, depth*/) { 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0.}); - test.Run(); + // TRT EP segmentation fault in A100 + test.Run(OpTester::ExpectResult::kExpectSuccess, "", ExcludeTrtOnA100()); } TEST(OneHotOpTest, DefaultAxis_int64_int32_float /*indices, output, depth*/) { @@ -51,7 +53,7 @@ TEST(OneHotOpTest, DefaultAxis_int64_int32_float /*indices, output, depth*/) { 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0}); - test.Run(); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", ExcludeTrtOnA100()); } TEST(OneHotOpTest, DefaultAxis_int64_float_int64 /*indices, output, depth*/) { @@ -81,7 +83,7 @@ TEST(OneHotOpTest, DefaultAxis_int32_float_float /*indices, output, depth*/) { 0.0f, 0.0f, 1.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 1.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 1.0f, 0.0f, 0.0f, 0.0f}); - test.Run(); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", ExcludeTrtOnA100()); } TEST(OneHotOpTest, DefaultAxis_int32_float_int32 /*indices, output, depth*/) { @@ -231,7 +233,7 @@ TEST(OneHotOpTest, DefaultAxis_float_float_float_NonZeroOffValue /*indices, outp 2., 2., 3., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 3., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 3., 2., 2., 2.}); - test.Run(); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", ExcludeTrtOnA100()); } TEST(OneHotOpTest, DefaultAxis_int64_int32_float_NonZeroOffValue /*indices, output, depth*/) { @@ -246,7 +248,7 @@ TEST(OneHotOpTest, DefaultAxis_int64_int32_float_NonZeroOffValue /*indices, outp 2, 2, 3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 2, 2, 2}); - test.Run(); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", ExcludeTrtOnA100()); } TEST(OneHotOpTest, DefaultAxis_int64_float_int64_NonZeroOffValue /*indices, output, depth*/) { @@ -276,7 +278,7 @@ TEST(OneHotOpTest, DefaultAxis_int32_float_float_NonZeroOffValue /*indices, outp 2.0f, 2.0f, 3.0f, 2.0f, 2.0f, 2.0f, 2.0f, 2.0f, 2.0f, 2.0f, 2.0f, 2.0f, 2.0f, 2.0f, 3.0f, 2.0f, 2.0f, 2.0f, 2.0f, 2.0f, 2.0f, 2.0f, 2.0f, 2.0f, 2.0f, 2.0f, 3.0f, 2.0f, 2.0f, 2.0f}); - test.Run(); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", ExcludeTrtOnA100()); } TEST(OneHotOpTest, DefaultAxis_int32_float_int32_NonZeroOffValue /*indices, output, depth*/) { diff --git a/onnxruntime/test/providers/cpu/tensor/resize_op_test.cc b/onnxruntime/test/providers/cpu/tensor/resize_op_test.cc index 062f25b989a70..496f2213e9d32 100644 --- a/onnxruntime/test/providers/cpu/tensor/resize_op_test.cc +++ b/onnxruntime/test/providers/cpu/tensor/resize_op_test.cc @@ -5,9 +5,11 @@ #include "gtest/gtest.h" #include "test/providers/provider_test_utils.h" #include "test/util/include/default_providers.h" +#include "test/common/trt_op_test_utils.h" namespace onnxruntime { namespace test { + TEST(ResizeOpTest, ResizeOpLinearDownSampleTest_tf_crop_and_resize) { // TODO: Unskip when fixed #41968513 if (DefaultDmlExecutionProvider().get() != nullptr) { @@ -243,7 +245,10 @@ TEST(ResizeOpTest, ResizeOpLinearDownSampleTest_4DBilinear) { std::vector Y = {2.66666651f, 4.3333331f}; test.AddOutput("Y", {N, C, static_cast(H * scales[2]), static_cast(W * scales[3])}, Y); - test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kQnnExecutionProvider}); // QNN: result diff + // QNN: result diff + // TRT: Segmentation fault in A100 + 
std::unordered_set excluded_providers({kQnnExecutionProvider}); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", ExcludeTrtOnA100(excluded_providers)); } TEST(ResizeOpTest, NhwcResizeOpLinearDownSampleTest_4DBilinear) { @@ -267,8 +272,9 @@ TEST(ResizeOpTest, NhwcResizeOpLinearDownSampleTest_4DBilinear) { test.AddOutput("Y", {N, static_cast(H * scales[1]), static_cast(W * scales[2]), C}, Y); // CUDA: result mismatch due to not implementing NHWC support // ROCm: results mismatch - test.Run(OpTester::ExpectResult::kExpectSuccess, "", - {kCudaExecutionProvider, kCudaNHWCExecutionProvider, kRocmExecutionProvider}); + // TRT: Segmentation fault in A100 + std::unordered_set excluded_providers({kCudaExecutionProvider, kCudaNHWCExecutionProvider, kRocmExecutionProvider}); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", ExcludeTrtOnA100(excluded_providers)); } TEST(ResizeOpTest, NhwcResizeOpLinearDownSampleTest_4DBilinear_uint8) { @@ -315,7 +321,7 @@ TEST(ResizeOpTest, NhwcResizeOpLinearDownSampleTest_4DBilinear_int8) { std::vector Y = {0, 0}; test.AddOutput("Y", {N, static_cast(H * scales[1]), static_cast(W * scales[2]), C}, Y); - test.Run(); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", ExcludeTrtOnA100()); } // Since NNAPI(TFLite) only using the scale calculate using the input/output size @@ -347,7 +353,7 @@ TEST(ResizeOpTest, ResizeOpLinearDownSampleTest_4DBilinear1) { std::vector Y = {3.5f, 5.5f}; test.AddOutput("Y", {N, C, static_cast(H * scales[2]), static_cast(W * scales[3])}, Y); - test.Run(); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", ExcludeTrtOnA100()); }; run_test(false); @@ -405,7 +411,7 @@ TEST(ResizeOpTest, ResizeOpLinearDownSampleTest_4DBilinear_align_corners) { std::vector Y = {1.0f, 4.0f}; test.AddOutput("Y", {N, C, static_cast(H * scales[2]), static_cast(W * scales[3])}, Y); - test.Run(); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", ExcludeTrtOnA100()); }; run_test(false); @@ -608,7 +614,7 @@ TEST(ResizeOpTest, ResizeOpLinearUpSampleTest_4DBilinear_asymmetric_scales) { 7.0f, 8.0f, 9.0f, 10.0f, 11.0f, 11.0f, 11.0f, 11.0f}; test.AddOutput("Y", {N, C, static_cast(H * scales[2]), static_cast(W * scales[3])}, Y); - test.Run(); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", ExcludeTrtOnA100()); }; run_test(false); @@ -725,7 +731,7 @@ TEST(ResizeOpTest, ResizeOpLinearUpSampleTest_2DBilinear_align_corners) { 4.0f, 4.5714290f, 5.142857f, 5.714286f, 6.285714f, 6.8571430f, 7.428571f, 8.0f}; test.AddOutput("Y", {static_cast(H * scales[0]), static_cast(W * scales[1])}, Y); - test.Run(); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", ExcludeTrtOnA100()); } TEST(ResizeOpTest, ResizeOpLinearDownSampleTest_3DTrilinear_pytorch_half_pixel) { @@ -819,7 +825,7 @@ TEST(ResizeOpTest, ResizeOpLinearScalesNoOpTest) { 7.0f, 11.0f}; test.AddOutput("Y", {N, C, H, W}, Y); - test.Run(); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", ExcludeTrtOnA100()); }; run_test(false); @@ -845,7 +851,7 @@ TEST(ResizeOpTest, ResizeOpNearestDownSampleTest) { std::vector Y = {1.0f, 3.0f}; test.AddOutput("Y", {N, C, static_cast(H * scales[2]), static_cast(W * scales[3])}, Y); - test.Run(); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", ExcludeTrtOnA100()); } TEST(ResizeOpTest, ResizeOpNearestDownSampleTest_Opset12) { @@ -867,7 +873,7 @@ TEST(ResizeOpTest, ResizeOpNearestDownSampleTest_Opset12) { std::vector Y = {1.0f, 3.0f}; test.AddOutput("Y", {N, C, static_cast(H * scales[2]), static_cast(W * scales[3])}, Y); - test.Run(); + 
test.Run(OpTester::ExpectResult::kExpectSuccess, "", ExcludeTrtOnA100()); } TEST(ResizeOpTest, ResizeOpNearestDownSampleTest_WithSizes) { @@ -920,7 +926,7 @@ TEST(ResizeOpTest, ResizeOpNearestDownSampleTest_tf_half_pixel) { 14.0f, 16.0f}; test.AddOutput("Y", {N, C, sizes[2], sizes[3]}, Y); - test.Run(); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", ExcludeTrtOnA100()); } TEST(ResizeOpTest, ResizeOpNearestDownSampleTest_tf_crop_and_resize_with_extrapolation) { @@ -1000,7 +1006,7 @@ TEST(ResizeOpTest, ResizeOpNearestUpSampleTest) { 3.0f, 3.0f, 3.0f, 4.0f, 4.0f, 4.0f}; test.AddOutput("Y", {N, C, static_cast(H * scales[2]), static_cast(W * scales[3])}, Y); - test.Run(); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", ExcludeTrtOnA100()); } TEST(ResizeOpTest, ResizeOpNearestUpSampleTest_WithSizes_CeilMode) { @@ -1093,7 +1099,7 @@ TEST(ResizeOpTest, ResizeOpNearestUpSample_Floor_Align_Corners) { 13.0f, 13.0f, 13.0f, 14.0f, 14.0f, 15.0f, 15.0f, 16.0f}; test.AddOutput("Y", {N, C, static_cast(H * scales[2]), static_cast(W * scales[3])}, Y); - test.Run(); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", ExcludeTrtOnA100()); } TEST(ResizeOpTest, ResizeOpNearest_OneToOneMappingBetweenInputAndOutputDataDims) { @@ -1197,7 +1203,7 @@ TEST(ResizeOpTest, ResizeOpNearestUpSample_Nearest2xOptimization_Scales) { 3.0f, 3.0f, 4.0f, 4.0f}; test.AddOutput("Y", {N, C, static_cast(H * scales[2]), static_cast(W * scales[3])}, Y); - test.Run(); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", ExcludeTrtOnA100()); }; run_test(false); @@ -1262,7 +1268,7 @@ TEST(ResizeOpTest, ResizeOpCubicDownSampleTest) { 11.9165f, 13.2266f, 14.5278f}; test.AddOutput("Y", {N, C, static_cast(H * scales[2]), static_cast(W * scales[3])}, Y); - test.Run(); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", ExcludeTrtOnA100()); } TEST(ResizeOpTest, ResizeOpCubicDownSampleTest_exclude_outside) { @@ -1292,7 +1298,7 @@ TEST(ResizeOpTest, ResizeOpCubicDownSampleTest_exclude_outside) { 11.949f, 13.2503f, 14.5942f}; test.AddOutput("Y", {static_cast(H * scales[0]), static_cast(W * scales[1])}, Y); - test.Run(); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", ExcludeTrtOnA100()); } TEST(ResizeOpTest, ResizeOpCubicDownSampleTest_coeff) { @@ -1319,7 +1325,7 @@ TEST(ResizeOpTest, ResizeOpCubicDownSampleTest_coeff) { 11.8701f, 13.168f, 14.4912f}; test.AddOutput("Y", {N, C, static_cast(H * scales[2]), static_cast(W * scales[3])}, Y); - test.Run(); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", ExcludeTrtOnA100()); } TEST(ResizeOpTest, ResizeOpCubicDownSampleTest_with_roi) { @@ -1373,7 +1379,7 @@ TEST(ResizeOpTest, ResizeOpCubicDownSampleTest_asymmetric) { 11.375f, 12.6719f, 13.9688f}; test.AddOutput("Y", {N, C, static_cast(H * scales[2]), static_cast(W * scales[3])}, Y); - test.Run(); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", ExcludeTrtOnA100()); } TEST(ResizeOpTest, ResizeOpCubicUpSampleTest) { @@ -1405,7 +1411,7 @@ TEST(ResizeOpTest, ResizeOpCubicUpSampleTest) { 13.375f, 13.7813f, 14.375f, 14.875f, 15.375f, 15.9688f, 16.375f, 16.4688f}; test.AddOutput("Y", {N, C, static_cast(H * scales[2]), static_cast(W * scales[3])}, Y); - test.Run(); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", ExcludeTrtOnA100()); } TEST(ResizeOpTest, ResizeOpCubicUpSampleTest_MultiChannel) { @@ -1486,7 +1492,7 @@ TEST(ResizeOpTest, ResizeOpCubicUpSampleTest_tf_half_pixel_for_nn) { 13.332f, 13.8086f, 14.4375f, 14.8438f, 15.4727f, 15.9492f, 16.2461f, 16.1758f}; test.AddOutput("Y", {N, C, static_cast(H * 
scales[2]), static_cast(W * scales[3])}, Y); - test.Run(); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", ExcludeTrtOnA100()); } TEST(ResizeOpTest, ResizeOpLinearDownSampleTest_4DBilinear_Ver10) { @@ -1512,7 +1518,10 @@ TEST(ResizeOpTest, ResizeOpLinearDownSampleTest_4DBilinear_Ver10) { std::vector Y = {1.0f, 2.66666651f}; test.AddOutput("Y", {N, C, static_cast(H * scales[2]), static_cast(W * scales[3])}, Y); - test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kQnnExecutionProvider}); // QNN: result diff + // QNN: result diff + // TRT: segmentation fault in A100 + std::unordered_set excluded_providers({kQnnExecutionProvider}); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", ExcludeTrtOnA100(excluded_providers)); } TEST(ResizeOpTest, ResizeOpLinearDownSampleTest_2DBilinear_Ver10) { @@ -1538,7 +1547,7 @@ TEST(ResizeOpTest, ResizeOpLinearDownSampleTest_2DBilinear_Ver10) { std::vector Y = {1.0f, 2.66666651f}; test.AddOutput("Y", {static_cast(H * scales[0]), static_cast(W * scales[1])}, Y); - test.Run(); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", ExcludeTrtOnA100()); } TEST(ResizeOpTest, ResizeOpLinearUpSampleTest_4DBilinear_Ver10) { @@ -1574,7 +1583,10 @@ TEST(ResizeOpTest, ResizeOpLinearUpSampleTest_4DBilinear_Ver10) { 7.0f, 8.0f, 9.0f, 10.0f, 11.0f, 11.0f, 11.0f, 11.0f}; test.AddOutput("Y", {N, C, static_cast(H * scales[2]), static_cast(W * scales[3])}, Y); - test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kQnnExecutionProvider}); // QNN: result diff + // QNN: result diff + // TRT: segmentation fault in A100 + std::unordered_set excluded_providers({kQnnExecutionProvider}); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", ExcludeTrtOnA100(excluded_providers)); } TEST(ResizeOpTest, ResizeOpLinearUpSampleTest_2DBilinear_Ver10) { @@ -1602,7 +1614,7 @@ TEST(ResizeOpTest, ResizeOpLinearUpSampleTest_2DBilinear_Ver10) { 4.0f, 5.0f, 6.0f, 7.0f, 8.0f, 8.0f, 8.0f, 8.0f}; test.AddOutput("Y", {static_cast(H * scales[0]), static_cast(W * scales[1])}, Y); - test.Run(); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", ExcludeTrtOnA100()); } TEST(ResizeOpTest, ResizeOpLinearScalesNoOpTest_Ver10) { @@ -1627,7 +1639,7 @@ TEST(ResizeOpTest, ResizeOpLinearScalesNoOpTest_Ver10) { 7.0f, 11.0f}; test.AddOutput("Y", {N, C, H, W}, Y); - test.Run(); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", ExcludeTrtOnA100()); } TEST(ResizeOpTest, ResizeOpNearestDownSampleTest_Ver10) { @@ -1647,7 +1659,7 @@ TEST(ResizeOpTest, ResizeOpNearestDownSampleTest_Ver10) { std::vector Y = {1.0f, 3.0f}; test.AddOutput("Y", {N, C, static_cast(H * scales[2]), static_cast(W * scales[3])}, Y); - test.Run(); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", ExcludeTrtOnA100()); } TEST(ResizeOpTest, ResizeOpNearestUpSampleTest_Ver10) { @@ -1668,10 +1680,10 @@ TEST(ResizeOpTest, ResizeOpNearestUpSampleTest_Ver10) { 3.0f, 3.0f, 3.0f, 4.0f, 4.0f, 4.0f}; test.AddOutput("Y", {N, C, static_cast(H * scales[2]), static_cast(W * scales[3])}, Y); - test.Run(); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", ExcludeTrtOnA100()); } -TEST(UpsampleOpTest, ResizeOpNearestNoScaleTest_Ver10) { +TEST(ResizeOpTest, ResizeOpNearestNoScaleTest_Ver10) { OpTester test("Resize", 10); std::vector scales{1.0f, 1.0f, 1.0f, 1.0f}; @@ -1686,7 +1698,7 @@ TEST(UpsampleOpTest, ResizeOpNearestNoScaleTest_Ver10) { std::vector Y = {1.0f, 2.0f, 3.0f, 4.0f}; test.AddOutput("Y", {N, C, H, W}, Y); - test.Run(); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", ExcludeTrtOnA100()); } TEST(ResizeOpTest, 
ResizeOp_MissingRoiAndMissingScalesOptionalInputs) { @@ -1737,7 +1749,7 @@ void ResizeOpTypeCheck_Ver_10() { 3, 3, 3, 4, 4, 4}; test.AddOutput("Y", {N, C, static_cast(H * scales[2]), static_cast(W * scales[3])}, Y); - test.Run(); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", ExcludeTrtOnA100()); } TEST(ResizeOpTest, ResizeOpTypeCheck_Ver_10) { @@ -1768,7 +1780,7 @@ void ResizeOpTypeCheck_Ver_11_13_18(int opset_version) { 3, 3, 3, 4, 4, 4}; test.AddOutput("Y", {N, C, static_cast(H * scales[2]), static_cast(W * scales[3])}, Y); - test.Run(); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", ExcludeTrtOnA100()); } TEST(ResizeOpTest, ResizeOpTypeCheck_Ver11) { diff --git a/onnxruntime/test/providers/cpu/tensor/upsample_op_test.cc b/onnxruntime/test/providers/cpu/tensor/upsample_op_test.cc index 188532cfa350a..3ac8053aef95e 100644 --- a/onnxruntime/test/providers/cpu/tensor/upsample_op_test.cc +++ b/onnxruntime/test/providers/cpu/tensor/upsample_op_test.cc @@ -4,6 +4,7 @@ #include "gtest/gtest.h" #include "test/providers/provider_test_utils.h" #include "test/util/include/default_providers.h" +#include "test/common/trt_op_test_utils.h" namespace onnxruntime { namespace test { @@ -939,7 +940,9 @@ TEST(UpsampleOpTest, UpsampleOpNearest2XTest_opset9) { 7, 7, 9, 9}; test.AddOutput("Y", {N, C, (int64_t)(H * scales[2]), (int64_t)(W * scales[3])}, Y); - test.Run(); + + // TRT: segmentation fault in A100 + test.Run(OpTester::ExpectResult::kExpectSuccess, "", ExcludeTrtOnA100()); } TEST(UpsampleOpTest, NhwcUpsampleOpNearest2XTest_opset9) { diff --git a/onnxruntime/test/providers/qnn/qnn_ep_context_test.cc b/onnxruntime/test/providers/qnn/qnn_ep_context_test.cc index eaef6f6315157..9eb75d297ef78 100644 --- a/onnxruntime/test/providers/qnn/qnn_ep_context_test.cc +++ b/onnxruntime/test/providers/qnn/qnn_ep_context_test.cc @@ -123,6 +123,8 @@ void QnnContextBinaryMultiPartitionTestBody(bool single_ep_node = true) { for (auto& node : ctx_graph.Nodes()) { if (node.OpType() == "EPContext") { ++ep_context_node_count; + // validate the fix for the partition issue relate to QDQ model + ASSERT_EQ(node.InputDefs().size(), 1); } else { ++non_ep_context_node_count; } diff --git a/onnxruntime/test/python/onnx_backend_test_series.py b/onnxruntime/test/python/onnx_backend_test_series.py index c48b07422d452..e441230537410 100644 --- a/onnxruntime/test/python/onnx_backend_test_series.py +++ b/onnxruntime/test/python/onnx_backend_test_series.py @@ -140,8 +140,8 @@ def create_backend_test(test_name=None): if backend.supports_device("OPENVINO_CPU_FP16"): current_failing_tests += apply_filters(filters, "current_failing_tests_OPENVINO_CPU_FP16") - if backend.supports_device("OPENVINO_NPU_FP16"): - current_failing_tests += apply_filters(filters, "current_failing_tests_OPENVINO_NPU_FP16") + if backend.supports_device("OPENVINO_NPU"): + current_failing_tests += apply_filters(filters, "current_failing_tests_OPENVINO_NPU") if backend.supports_device("OPENVINO"): current_failing_tests += apply_filters(filters, "current_failing_tests_OPENVINO_opset18") diff --git a/onnxruntime/test/python/transformers/sharded_moe/test_sharded_moe.py b/onnxruntime/test/python/transformers/sharded_moe/test_sharded_moe.py index fd1d58cd2a3b8..ec64f2359f4be 100644 --- a/onnxruntime/test/python/transformers/sharded_moe/test_sharded_moe.py +++ b/onnxruntime/test/python/transformers/sharded_moe/test_sharded_moe.py @@ -24,25 +24,17 @@ def get_size(): return comm.Get_size() -def barrier(): - comm.Barrier() - - def print_out(*args): if get_rank() 
== 0: print(*args) -def broadcast(data): - comm = MPI.COMM_WORLD - comm.broadcast(data, root=0) - - local_rank = get_rank() ORT_DTYPE = TensorProto.FLOAT16 NP_TYPE = np.float16 if ORT_DTYPE == TensorProto.FLOAT16 else np.float32 -THRESHOLD = 1e-3 +THRESHOLD_TP = 3e-2 +THRESHOLD_EP = 1e-6 def create_moe_onnx_graph( @@ -52,12 +44,17 @@ def create_moe_onnx_graph( hidden_size, inter_size, fc1_experts_weights, - fc2_experts_weights, fc1_experts_bias, + fc2_experts_weights, fc2_experts_bias, - local_experts_start_index=-1, + fc3_experts_weights, + local_experts_start_index=0, + topk=2, + normalize_routing_weights=1, + activation_type="gelu", + tensor_shards=1, ): - use_sharded_moe = local_experts_start_index >= 0 + use_sharded_moe = num_experts > local_num_experts or tensor_shards > 1 nodes = [ ( helper.make_node( @@ -66,14 +63,16 @@ def create_moe_onnx_graph( "input", "router_probs", "fc1_experts_weights", - "fc2_experts_weights", "fc1_experts_bias", + "fc2_experts_weights", "fc2_experts_bias", + "fc3_experts_weights", ], ["output"], "MoE_0", - k=1, - activation_type="gelu", + k=topk, + normalize_routing_weights=normalize_routing_weights, + activation_type=activation_type, domain="com.microsoft", ) if not use_sharded_moe @@ -83,15 +82,18 @@ def create_moe_onnx_graph( "input", "router_probs", "fc1_experts_weights", - "fc2_experts_weights", "fc1_experts_bias", + "fc2_experts_weights", "fc2_experts_bias", + "fc3_experts_weights", ], ["output"], "MoE_0", - k=1, - activation_type="gelu", + k=topk, + normalize_routing_weights=normalize_routing_weights, + activation_type=activation_type, local_experts_start_index=local_experts_start_index, + tensor_shards=tensor_shards, domain="com.microsoft", ) ), @@ -99,6 +101,7 @@ def create_moe_onnx_graph( fc1_shape = [local_num_experts, hidden_size, inter_size] fc2_shape = [local_num_experts, inter_size, hidden_size] + fc3_shape = fc1_shape initializers = [ helper.make_tensor( @@ -115,6 +118,13 @@ def create_moe_onnx_graph( fc2_experts_weights.flatten(), raw=False, ), + helper.make_tensor( + "fc3_experts_weights", + ORT_DTYPE, + fc3_shape, + fc3_experts_weights.flatten(), + raw=False, + ), ] fc1_bias_shape = [local_num_experts, inter_size] @@ -166,18 +176,18 @@ def create_moe_onnx_graph( return model.SerializeToString() -def test_moe_with_expert_slicing( +def generate_weights_and_initial_model( + num_rows, + num_experts, hidden_size, inter_size, - num_experts, - num_rows, ): - local_experts_start_index = local_rank * num_experts // get_size() - - fc1_experts_weights_all = np.random.rand(num_experts, hidden_size, inter_size).astype(NP_TYPE) - fc2_experts_weights_all = np.random.rand(num_experts, inter_size, hidden_size).astype(NP_TYPE) - fc1_experts_bias_all = np.random.rand(num_experts, inter_size).astype(NP_TYPE) - fc2_experts_bias_all = np.random.rand(num_experts, hidden_size).astype(NP_TYPE) + s = 0.1 + fc1_experts_weights_all = np.random.normal(scale=s, size=(num_experts, hidden_size, inter_size)).astype(NP_TYPE) + fc2_experts_weights_all = np.random.normal(scale=s, size=(num_experts, inter_size, hidden_size)).astype(NP_TYPE) + fc3_experts_weights_all = np.random.normal(scale=s, size=(num_experts, hidden_size, inter_size)).astype(NP_TYPE) + fc1_experts_bias_all = np.random.normal(scale=s, size=(num_experts, inter_size)).astype(NP_TYPE) + fc2_experts_bias_all = np.random.normal(scale=s, size=(num_experts, hidden_size)).astype(NP_TYPE) onnx_model_full = create_moe_onnx_graph( num_rows, @@ -186,34 +196,31 @@ def test_moe_with_expert_slicing( hidden_size, 
inter_size, fc1_experts_weights_all, - fc2_experts_weights_all, fc1_experts_bias_all, + fc2_experts_weights_all, fc2_experts_bias_all, + fc3_experts_weights_all, ) - fc1_experts_weights = fc1_experts_weights_all[ - local_experts_start_index : local_experts_start_index + num_experts // get_size(), :, : - ] - fc2_experts_weights = fc2_experts_weights_all[ - local_experts_start_index : local_experts_start_index + num_experts // get_size(), :, : - ] - fc1_experts_bias = fc1_experts_bias_all[ - local_experts_start_index : local_experts_start_index + num_experts // get_size(), : - ] - - onnx_model_local = create_moe_onnx_graph( - num_rows, - num_experts, - num_experts // get_size(), - hidden_size, - inter_size, - fc1_experts_weights, - fc2_experts_weights, - fc1_experts_bias, + return ( + onnx_model_full, + fc1_experts_weights_all, + fc1_experts_bias_all, + fc2_experts_weights_all, fc2_experts_bias_all, - local_experts_start_index, + fc3_experts_weights_all, ) + +def run_ort_with_parity_check( + onnx_model_full, + onnx_model_local, + num_rows, + hidden_size, + num_experts, + inter_size, + threshold, +): sess_options = onnxruntime.SessionOptions() cuda_provider_options = {"device_id": local_rank} execution_providers = [("CUDAExecutionProvider", cuda_provider_options)] @@ -229,30 +236,161 @@ def test_moe_with_expert_slicing( output = ort_session.run(None, ort_inputs) sharded_output = ort_session_local.run(None, ort_inputs) - assert np.allclose(output[0], sharded_output[0], atol=THRESHOLD, rtol=THRESHOLD) + print_out("max diff:", np.max(np.abs(output[0] - sharded_output[0]))) + assert np.allclose(output[0], sharded_output[0], atol=threshold, rtol=threshold) print_out( - "hidden_size: ", + "hidden_size:", hidden_size, - " inter_size: ", + " inter_size:", inter_size, - " num_experts: ", + " num_experts:", num_experts, - " num_rows: ", + " num_rows:", num_rows, - " world_size: ", + " world_size:", get_size(), " Parity: OK", ) +def test_moe_with_tensor_parallelism( + hidden_size, + inter_size, + num_experts, + num_rows, + threshold=THRESHOLD_TP, +): + assert inter_size % get_size() == 0 + + ( + onnx_model_full, + fc1_experts_weights_all, + fc1_experts_bias_all, + fc2_experts_weights_all, + fc2_experts_bias_all, + fc3_experts_weights_all, + ) = generate_weights_and_initial_model( + num_rows, + num_experts, + hidden_size, + inter_size, + ) + + fc1_experts_weights = fc1_experts_weights_all[ + :, :, local_rank * inter_size // get_size() : (local_rank + 1) * inter_size // get_size() + ] + fc2_experts_weights = fc2_experts_weights_all[ + :, local_rank * inter_size // get_size() : (local_rank + 1) * inter_size // get_size(), : + ] + fc3_experts_weights = fc3_experts_weights_all[ + :, :, local_rank * inter_size // get_size() : (local_rank + 1) * inter_size // get_size() + ] + fc1_experts_bias = fc1_experts_bias_all[ + :, local_rank * inter_size // get_size() : (local_rank + 1) * inter_size // get_size() + ] + + onnx_model_local = create_moe_onnx_graph( + num_rows, + num_experts, + num_experts, + hidden_size, + inter_size // get_size(), + fc1_experts_weights, + fc1_experts_bias, + fc2_experts_weights, + fc2_experts_bias_all, + fc3_experts_weights, + tensor_shards=get_size(), + ) + + run_ort_with_parity_check( + onnx_model_full, + onnx_model_local, + num_rows, + hidden_size, + num_experts, + inter_size, + threshold, + ) + + +def test_moe_with_expert_parallelism( + hidden_size, + inter_size, + num_experts, + num_rows, + threshold=THRESHOLD_EP, +): + local_experts_start_index = local_rank * num_experts // 
get_size() + + ( + onnx_model_full, + fc1_experts_weights_all, + fc1_experts_bias_all, + fc2_experts_weights_all, + fc2_experts_bias_all, + fc3_experts_weights_all, + ) = generate_weights_and_initial_model( + num_rows, + num_experts, + hidden_size, + inter_size, + ) + + fc1_experts_weights = fc1_experts_weights_all[ + local_experts_start_index : local_experts_start_index + num_experts // get_size(), :, : + ] + fc2_experts_weights = fc2_experts_weights_all[ + local_experts_start_index : local_experts_start_index + num_experts // get_size(), :, : + ] + fc3_experts_weights = fc3_experts_weights_all[ + local_experts_start_index : local_experts_start_index + num_experts // get_size(), :, : + ] + fc1_experts_bias = fc1_experts_bias_all[ + local_experts_start_index : local_experts_start_index + num_experts // get_size(), : + ] + + onnx_model_local = create_moe_onnx_graph( + num_rows, + num_experts, + num_experts // get_size(), + hidden_size, + inter_size, + fc1_experts_weights, + fc1_experts_bias, + fc2_experts_weights, + fc2_experts_bias_all, + fc3_experts_weights, + local_experts_start_index, + ) + + run_ort_with_parity_check( + onnx_model_full, + onnx_model_local, + num_rows, + hidden_size, + num_experts, + inter_size, + threshold, + ) + + class TestMoE(unittest.TestCase): - def test_moe_expert_slicing(self): - for hidden_size in [16, 128]: - for inter_size in [512, 1024]: - for num_experts in [8, 16, 32]: - for num_rows in [16, 128, 512]: - test_moe_with_expert_slicing( + def test_moe_parallelism(self): + for hidden_size in [128, 1024]: + for inter_size in [512, 2048]: + for num_experts in [64]: + for num_rows in [1024]: + print_out("EP") + test_moe_with_expert_parallelism( + hidden_size, + inter_size, + num_experts, + num_rows, + ) + print_out("TP") + test_moe_with_tensor_parallelism( hidden_size, inter_size, num_experts, diff --git a/onnxruntime/test/python/transformers/test_parity_mixtral_moe.py b/onnxruntime/test/python/transformers/test_parity_mixtral_moe.py new file mode 100644 index 0000000000000..90b7da255081a --- /dev/null +++ b/onnxruntime/test/python/transformers/test_parity_mixtral_moe.py @@ -0,0 +1,365 @@ +# -------------------------------------------------------------------------- +# Copyright 2020 The HuggingFace Inc. team +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 +# -------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. See License.txt in the project root for +# license information. 
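The two sharded-MoE paths added above slice the full expert weights differently: expert parallelism keeps a contiguous block of whole experts per rank, while tensor parallelism keeps every expert but splits the intermediate (`inter_size`) dimension across ranks. A minimal NumPy sketch of just the slicing, mirroring the indexing in the hunks above (the rank, world size, and sizes are illustrative values, not from the test):

```python
import numpy as np

num_experts, hidden_size, inter_size = 8, 16, 32
world_size, rank = 4, 1  # illustrative values

fc1 = np.random.rand(num_experts, hidden_size, inter_size)  # full fc1 weights
fc2 = np.random.rand(num_experts, inter_size, hidden_size)  # full fc2 weights

# Expert parallelism: each rank owns a contiguous block of whole experts.
experts_per_rank = num_experts // world_size
start = rank * experts_per_rank
fc1_ep = fc1[start : start + experts_per_rank, :, :]        # (2, 16, 32)
fc2_ep = fc2[start : start + experts_per_rank, :, :]        # (2, 32, 16)

# Tensor parallelism: every rank owns all experts, but only a slice of inter_size.
shard = inter_size // world_size
fc1_tp = fc1[:, :, rank * shard : (rank + 1) * shard]       # (8, 16, 8)
fc2_tp = fc2[:, rank * shard : (rank + 1) * shard, :]       # (8, 8, 16)

print(fc1_ep.shape, fc2_ep.shape, fc1_tp.shape, fc2_tp.shape)
```

This split plausibly also explains the two parity thresholds introduced above: the tensor-parallel path accumulates partial fc2 products across ranks in fp16 (THRESHOLD_TP = 3e-2), while the expert-parallel path routes whole experts and stays much closer to the unsharded graph (THRESHOLD_EP = 1e-6).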
+# -------------------------------------------------------------------------- +import unittest +from collections import OrderedDict + +import numpy +import torch +import torch.nn.functional as F +from onnx import TensorProto, helper +from torch import nn + +import onnxruntime + +torch.manual_seed(42) +numpy.random.seed(42) + +ORT_DTYPE = TensorProto.FLOAT +NP_TYPE = numpy.float16 if ORT_DTYPE == TensorProto.FLOAT16 else numpy.float32 +THRESHOLD = 3e-2 + + +def value_string_of(numpy_array): + arr = numpy_array.flatten() + lines = ["f, ".join([str(v) for v in arr[i : min(i + 8, arr.size)]]) for i in range(0, arr.size, 8)] + return "{\n " + "f,\n ".join(lines) + "f}" + + +def print_tensor(name, numpy_array): + print(f"const std::vector {name} = {value_string_of(numpy_array)};") + + +def create_moe_onnx_graph( + num_rows, + num_experts, + hidden_size, + inter_size, + fc1_experts_weights, + fc2_experts_weights, + fc3_experts_weights, + topk, +): + nodes = [ + helper.make_node( + "MoE", + [ + "input", + "router_probs", + "fc1_experts_weights", + "", + "fc2_experts_weights", + "", + "fc3_experts_weights", + ], + ["output"], + "MoE_0", + k=topk, + normalize_routing_weights=1, + activation_type="silu", + domain="com.microsoft", + ), + ] + + fc1_shape = [num_experts, hidden_size, inter_size] + fc2_shape = [num_experts, inter_size, hidden_size] + fc3_shape = [num_experts, hidden_size, inter_size] + + torch_type = torch.float16 if ORT_DTYPE == TensorProto.FLOAT16 else torch.float32 + + initializers = [ + helper.make_tensor( + "fc1_experts_weights", + ORT_DTYPE, + fc1_shape, + fc1_experts_weights.to(torch_type).flatten().tolist(), + raw=False, + ), + helper.make_tensor( + "fc2_experts_weights", + ORT_DTYPE, + fc2_shape, + fc2_experts_weights.to(torch_type).flatten().tolist(), + raw=False, + ), + helper.make_tensor( + "fc3_experts_weights", + ORT_DTYPE, + fc3_shape, + fc3_experts_weights.to(torch_type).flatten().tolist(), + raw=False, + ), + ] + + graph_inputs = [ + helper.make_tensor_value_info("input", ORT_DTYPE, [num_rows, hidden_size]), + ] + + graph_inputs.append( + helper.make_tensor_value_info( + "router_probs", + ORT_DTYPE, + [num_rows, num_experts], + ) + ) + + graph_outputs = [ + helper.make_tensor_value_info("output", ORT_DTYPE, [num_rows, hidden_size]), + ] + + graph = helper.make_graph( + nodes, + "MoE_Graph", + graph_inputs, + graph_outputs, + initializers, + ) + + model = helper.make_model(graph) + return model.SerializeToString() + + +class ClassInstantier(OrderedDict): + def __getitem__(self, key): + content = super().__getitem__(key) + cls, kwargs = content if isinstance(content, tuple) else (content, {}) + return cls(**kwargs) + + +ACT2CLS = { + "silu": nn.SiLU, +} +ACT2FN = ClassInstantier(ACT2CLS) + + +class MixtralConfig: + def __init__( + self, + hidden_size=4096, + intermediate_size=14336, + num_hidden_layers=32, + num_attention_heads=32, + num_key_value_heads=8, + hidden_act="silu", + initializer_range=0.02, + rms_norm_eps=1e-5, + use_cache=True, + rope_theta=1e6, + attention_dropout=0.0, + num_experts_per_tok=2, + num_local_experts=8, + output_router_logits=False, + router_aux_loss_coef=0.001, + ): + self.hidden_size = hidden_size + self.intermediate_size = intermediate_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + if num_key_value_heads is None: + num_key_value_heads = num_attention_heads + self.num_key_value_heads = num_key_value_heads + self.hidden_act = hidden_act + self.initializer_range = initializer_range + 
self.rms_norm_eps = rms_norm_eps + self.use_cache = use_cache + self.rope_theta = rope_theta + self.attention_dropout = attention_dropout + self.num_experts_per_tok = num_experts_per_tok + self.num_local_experts = num_local_experts + self.output_router_logits = output_router_logits + self.router_aux_loss_coef = router_aux_loss_coef + + +class MixtralBlockSparseTop2MLP(nn.Module): + def __init__(self, config: MixtralConfig): + super().__init__() + self.ffn_dim = config.intermediate_size + self.hidden_dim = config.hidden_size + + self.w1 = nn.Linear(self.hidden_dim, self.ffn_dim, bias=False) + self.w2 = nn.Linear(self.ffn_dim, self.hidden_dim, bias=False) + self.w3 = nn.Linear(self.hidden_dim, self.ffn_dim, bias=False) + + self.act_fn = ACT2FN[config.hidden_act] + + def forward(self, hidden_states): + current_hidden_states_1 = self.act_fn(self.w1(hidden_states)) + current_hidden_states_3 = self.w3(hidden_states) + current_hidden_states = current_hidden_states_1 * current_hidden_states_3 + current_hidden_states = self.w2(current_hidden_states) + return current_hidden_states + + +class MixtralSparseMoeBlock(nn.Module): + """ + This implementation is + strictly equivalent to standard MoE with full capacity (no + dropped tokens). It's faster since it formulates MoE operations + in terms of block-sparse operations to accommodate imbalanced + assignments of tokens to experts, whereas standard MoE either + (1) drop tokens at the cost of reduced performance or (2) set + capacity factor to number of experts and thus waste computation + and memory on padding. + """ + + def __init__(self, config, batch_size, sequence_length): + super().__init__() + self.hidden_dim = config.hidden_size + self.ffn_dim = config.intermediate_size + self.num_experts = config.num_local_experts + self.top_k = config.num_experts_per_tok + + # gating + self.gate = nn.Linear(self.hidden_dim, self.num_experts, bias=False) + + self.experts = nn.ModuleList([MixtralBlockSparseTop2MLP(config) for _ in range(self.num_experts)]) + + w1_list = [] + w2_list = [] + w3_list = [] + for i in range(self.num_experts): + w1_list.append(self.experts[i].w1.weight.transpose(0, 1)) + w2_list.append(self.experts[i].w2.weight.transpose(0, 1)) + w3_list.append(self.experts[i].w3.weight.transpose(0, 1)) + + self.moe_experts_weight1 = torch.stack(w1_list, dim=0) + self.moe_experts_weight2 = torch.stack(w2_list, dim=0) + self.moe_experts_weight3 = torch.stack(w3_list, dim=0) + + self.batch_size = batch_size + self.sequence_length = sequence_length + self.moe_onnx_graph = create_moe_onnx_graph( + self.batch_size * self.sequence_length, + self.num_experts, + self.hidden_dim, + self.ffn_dim, + self.moe_experts_weight1, + self.moe_experts_weight2, + self.moe_experts_weight3, + self.top_k, + ) + + self.ort_sess = self.create_ort_session() + + def create_ort_session(self): + from onnxruntime import InferenceSession, SessionOptions + + sess_options = SessionOptions() + + cuda_providers = ["CUDAExecutionProvider"] + if cuda_providers[0] not in onnxruntime.get_available_providers(): + return None + + sess_options.log_severity_level = 2 + ort_session = InferenceSession(self.moe_onnx_graph, sess_options, providers=["CUDAExecutionProvider"]) + + return ort_session + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + """ """ + batch_size, sequence_length, hidden_dim = hidden_states.shape + hidden_states = hidden_states.view(-1, hidden_dim) + # router_logits: (batch * sequence_length, n_experts) + router_logits = self.gate(hidden_states) + + 
routing_weights = F.softmax(router_logits, dim=1, dtype=torch.float) + routing_weights, selected_experts = torch.topk(routing_weights, self.top_k, dim=-1) + + routing_weights /= routing_weights.sum(dim=-1, keepdim=True) + # we cast back to the input dtype + routing_weights = routing_weights.to(hidden_states.dtype) + + final_hidden_states = torch.zeros( + (batch_size * sequence_length, hidden_dim), dtype=hidden_states.dtype, device=hidden_states.device + ) + + # One hot encode the selected experts to create an expert mask + # this will be used to easily index which expert is going to be sollicitated + expert_mask = torch.nn.functional.one_hot(selected_experts, num_classes=self.num_experts).permute(2, 1, 0) + + # Loop over all available experts in the model and perform the computation on each expert + for expert_idx in range(self.num_experts): + expert_layer = self.experts[expert_idx] + idx, top_x = torch.where(expert_mask[expert_idx]) + + if top_x.shape[0] == 0: + continue + + # in torch it is faster to index using lists than torch tensors + top_x_list = top_x.tolist() + idx_list = idx.tolist() + + # Index the correct hidden states and compute the expert hidden state for + # the current expert. We need to make sure to multiply the output hidden + # states by `routing_weights` on the corresponding tokens (top-1 and top-2) + current_state = hidden_states[None, top_x_list].reshape(-1, hidden_dim) + current_hidden_states = expert_layer(current_state) * routing_weights[top_x_list, idx_list, None] + + # However `index_add_` only support torch tensors for indexing so we'll use + # the `top_x` tensor here. + final_hidden_states.index_add_(0, top_x, current_hidden_states.to(hidden_states.dtype)) + final_hidden_states = final_hidden_states.reshape(batch_size, sequence_length, hidden_dim) + return final_hidden_states # , router_logits + + def ort_forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + batch_size, sequence_length, hidden_dim = hidden_states.shape + hidden_states = hidden_states.view(-1, hidden_dim) + # router_logits: (batch * sequence_length, n_experts) + router_logits = self.gate(hidden_states) + + ort_inputs = { + "input": numpy.ascontiguousarray(hidden_states.detach().numpy().astype(NP_TYPE)), + "router_probs": numpy.ascontiguousarray(router_logits.detach().numpy().astype(NP_TYPE)), + } + + ort_output = None + if self.ort_sess is not None: + ort_output = self.ort_sess.run(None, ort_inputs) + return torch.tensor(ort_output).reshape(batch_size, sequence_length, -1) # , router_logits + + # print_tensor("input", ort_inputs["input"]) + # print_tensor("router_probs", ort_inputs["router_probs"]) + # print_tensor("fc1_experts_weights", self.moe_experts_weight1.detach().numpy()) + # print_tensor("fc2_experts_weights", self.moe_experts_weight2.detach().numpy()) + # print_tensor("fc3_experts_weights", self.moe_experts_weight3.detach().numpy()) + # print_tensor("output", ort_output[0]) + + return None + + def parity_check(self): + hidden_state = torch.randn(self.batch_size, self.sequence_length, self.hidden_dim) + torch_output = self.forward(hidden_state) + ort_output = self.ort_forward(hidden_state) + if ort_output is not None: + assert torch.allclose(torch_output, ort_output, rtol=1e-04, atol=1e-04) + print( + "batch_size:", + self.batch_size, + " sequence_length:", + self.sequence_length, + " max_diff:", + (torch_output - ort_output).abs().max(), + " parity: OK", + ) + + +class TestMixtralMoE(unittest.TestCase): + def test_mixtral_moe_parity(self): + for batch_size in [1, 16]: + for 
sequence_length in [128, 1024]: + # use a small sizes to speed up the test + config = MixtralConfig(hidden_size=256, intermediate_size=1024) + mixtral_moe = MixtralSparseMoeBlock(config, batch_size, sequence_length) + mixtral_moe.parity_check() + + +if __name__ == "__main__": + unittest.main() diff --git a/onnxruntime/test/python/transformers/test_parity_moe.py b/onnxruntime/test/python/transformers/test_parity_moe.py index 72ca5d9975c05..dbf6ee7dabb0e 100644 --- a/onnxruntime/test/python/transformers/test_parity_moe.py +++ b/onnxruntime/test/python/transformers/test_parity_moe.py @@ -47,8 +47,8 @@ def create_moe_onnx_graph( hidden_size, inter_size, fc1_experts_weights, - fc2_experts_weights, fc1_experts_bias, + fc2_experts_weights, fc2_experts_bias, ): nodes = [ @@ -58,8 +58,8 @@ def create_moe_onnx_graph( "input", "router_probs", "fc1_experts_weights", - "fc2_experts_weights", "fc1_experts_bias", + "fc2_experts_weights", "fc2_experts_bias", ], ["output"], @@ -250,8 +250,8 @@ def __init__( in_features, hidden_features, self.moe_experts.weight1, - self.moe_experts.weight2, self.moe_experts.bias1, + self.moe_experts.weight2, self.moe_experts.bias2, ) @@ -296,8 +296,6 @@ def ort_run_with_iobinding(self, ort_inputs, repeat=1000): ).data_ptr(), ) - iobinding.synchronize_inputs() - iobinding.bind_output( name="output", device_type="cuda", @@ -308,11 +306,12 @@ def ort_run_with_iobinding(self, ort_inputs, repeat=1000): numpy.zeros(ort_inputs["input"].shape), "cuda", device_id ).data_ptr(), ) - iobinding.synchronize_outputs() s = time.time() for _ in range(repeat): + iobinding.synchronize_inputs() self.ort_sess.run_with_iobinding(iobinding) + iobinding.synchronize_outputs() e = time.time() print(f"MoE cuda kernel time: {(e - s) / repeat * 1000} ms") @@ -356,8 +355,8 @@ def onnx_forward(self, iobinding=False): # print_tensor("input", ort_inputs["input"]) # print_tensor("router_probs", ort_inputs["router_probs"]) # print_tensor("fc1_experts_weights", self.moe_experts.weight1.detach().numpy()) - # print_tensor("fc2_experts_weights", self.moe_experts.weight2.detach().numpy()) # print_tensor("fc1_experts_bias", self.moe_experts.bias1.detach().numpy()) + # print_tensor("fc2_experts_weights", self.moe_experts.weight2.detach().numpy()) # print_tensor("fc2_experts_bias", self.moe_experts.bias2.detach().numpy()) # print_tensor("output", ort_output[0]) diff --git a/onnxruntime/test/shared_lib/test_inference.cc b/onnxruntime/test/shared_lib/test_inference.cc index 91453102d406f..52dd2a84e383b 100644 --- a/onnxruntime/test/shared_lib/test_inference.cc +++ b/onnxruntime/test/shared_lib/test_inference.cc @@ -208,7 +208,7 @@ static constexpr PATH_TYPE MODEL_WITH_CUSTOM_MODEL_METADATA = TSTR("testdata/mod static constexpr PATH_TYPE VARIED_INPUT_CUSTOM_OP_MODEL_URI = TSTR("testdata/VariedInputCustomOp.onnx"); static constexpr PATH_TYPE VARIED_INPUT_CUSTOM_OP_MODEL_URI_2 = TSTR("testdata/foo_3.onnx"); static constexpr PATH_TYPE OPTIONAL_INPUT_OUTPUT_CUSTOM_OP_MODEL_URI = TSTR("testdata/foo_bar_1.onnx"); -static constexpr PATH_TYPE OPTIONAL_INPUT_OUTPUT_CUSTOM_OP_MODEL_URI_2 = TSTR("testdata/foo_bar_2.onnx"); +static constexpr PATH_TYPE OPTIONAL_INPUT_CUSTOM_OP_MODEL_URI_2 = TSTR("testdata/foo_bar_2.onnx"); static constexpr PATH_TYPE VARIADIC_INPUT_OUTPUT_CUSTOM_OP_MODEL_URI = TSTR("testdata/custom_op_variadic_io.onnx"); static constexpr PATH_TYPE VARIADIC_UNDEF_INPUT_OUTPUT_CUSTOM_OP_MODEL_URI = TSTR( "testdata/custom_op_variadic_undef_io.onnx"); @@ -1082,7 +1082,7 @@ TEST(CApiTest, 
invalid_variadic_input_homogeneity_custom_op) { } } -TEST(CApiTest, optional_input_output_custom_op_handler) { +TEST(CApiTest, optional_input_custom_op_handler) { MyCustomOpWithOptionalInput custom_op{onnxruntime::kCpuExecutionProvider}; // `MyCustomOpFooBar` defines a custom op with atmost 3 inputs and the second input is optional. @@ -1147,7 +1147,7 @@ TEST(CApiTest, optional_input_output_custom_op_handler) { { std::vector input_names = {"X1", "X2"}; ort_inputs.erase(ort_inputs.begin() + 2); // remove the last input in the container - Ort::Session session(*ort_env, OPTIONAL_INPUT_OUTPUT_CUSTOM_OP_MODEL_URI_2, session_options); + Ort::Session session(*ort_env, OPTIONAL_INPUT_CUSTOM_OP_MODEL_URI_2, session_options); auto ort_outputs = session.Run(Ort::RunOptions{}, input_names.data(), ort_inputs.data(), ort_inputs.size(), &output_name, 1); ASSERT_EQ(ort_outputs.size(), 1u); @@ -1166,6 +1166,7 @@ TEST(CApiTest, optional_input_output_custom_op_handler) { } } } + TEST(CApiTest, custom_op_with_attributes_handler) { MyCustomOpWithAttributes custom_op{onnxruntime::kCpuExecutionProvider}; diff --git a/onnxruntime/test/testdata/onnx_backend_test_series_filters.jsonc b/onnxruntime/test/testdata/onnx_backend_test_series_filters.jsonc index ca089c42032b1..f120bf9968558 100644 --- a/onnxruntime/test/testdata/onnx_backend_test_series_filters.jsonc +++ b/onnxruntime/test/testdata/onnx_backend_test_series_filters.jsonc @@ -493,9 +493,12 @@ "test_range_float_type_positive_delta_expanded_cpu", // Error but not a failure. "test_scan_sum_cpu", // Disabled due to output mismatch with tolerance. "test_scan9_sum_cpu", // Disabled due to output mismatch with tolerance. - "test_reduce_max_bool_inputs_cpu" + "test_reduce_max_bool_inputs_cpu", + "test_gelu_default_1_cpu", // Disabled due to accuracy mismatch + "test_gelu_default_2_cpu" + ], - "current_failing_tests_OPENVINO_NPU_FP16": [ + "current_failing_tests_OPENVINO_NPU": [ "^test_prelu_broadcast", "test_loop11_cpu" ], diff --git a/onnxruntime/test/util/default_providers.cc b/onnxruntime/test/util/default_providers.cc index c12a52c4356aa..6ad2d41edb562 100644 --- a/onnxruntime/test/util/default_providers.cc +++ b/onnxruntime/test/util/default_providers.cc @@ -8,7 +8,7 @@ #ifdef USE_COREML #include "core/providers/coreml/coreml_provider_factory.h" #endif -#if defined(ENABLE_CUDA_NHWC_OPS) +#ifdef USE_CUDA #include #endif #include "core/session/onnxruntime_cxx_api.h" @@ -113,8 +113,9 @@ std::unique_ptr DefaultOpenVINOExecutionProvider() { std::unique_ptr DefaultCudaExecutionProvider() { #ifdef USE_CUDA - OrtCUDAProviderOptions provider_options{}; + OrtCUDAProviderOptionsV2 provider_options{}; provider_options.do_copy_in_default_stream = true; + provider_options.use_tf32 = false; if (auto factory = CudaProviderFactoryCreator::Create(&provider_options)) return factory->CreateProvider(); #endif @@ -126,6 +127,7 @@ std::unique_ptr DefaultCudaNHWCExecutionProvider() { #if defined(USE_CUDA) OrtCUDAProviderOptionsV2 provider_options{}; provider_options.do_copy_in_default_stream = true; + provider_options.use_tf32 = false; provider_options.prefer_nhwc = true; if (auto factory = CudaProviderFactoryCreator::Create(&provider_options)) return factory->CreateProvider(); diff --git a/onnxruntime/test/wasm/package-lock.json b/onnxruntime/test/wasm/package-lock.json index bfa000fda440a..1beaf3b83ca28 100644 --- a/onnxruntime/test/wasm/package-lock.json +++ b/onnxruntime/test/wasm/package-lock.json @@ -520,9 +520,9 @@ "dev": true }, "node_modules/follow-redirects": { - "version": 
"1.15.4", - "resolved": "https://registry.npmjs.org/follow-redirects/-/follow-redirects-1.15.4.tgz", - "integrity": "sha512-Cr4D/5wlrb0z9dgERpUL3LrmPKVDsETIJhaCMeDfuFYcqa5bldGV6wBsAN6X/vxlXQtFBMrXdXxdL8CbDTGniw==", + "version": "1.15.6", + "resolved": "https://registry.npmjs.org/follow-redirects/-/follow-redirects-1.15.6.tgz", + "integrity": "sha512-wWN62YITEaOpSK584EZXJafH1AGpO8RVgElfkuXbTOrPX4fIfOyEpW/CsiNd8JdYrAoOvafRTOEnvsO++qCqFA==", "dev": true, "funding": [ { @@ -1972,9 +1972,9 @@ "dev": true }, "follow-redirects": { - "version": "1.15.4", - "resolved": "https://registry.npmjs.org/follow-redirects/-/follow-redirects-1.15.4.tgz", - "integrity": "sha512-Cr4D/5wlrb0z9dgERpUL3LrmPKVDsETIJhaCMeDfuFYcqa5bldGV6wBsAN6X/vxlXQtFBMrXdXxdL8CbDTGniw==", + "version": "1.15.6", + "resolved": "https://registry.npmjs.org/follow-redirects/-/follow-redirects-1.15.6.tgz", + "integrity": "sha512-wWN62YITEaOpSK584EZXJafH1AGpO8RVgElfkuXbTOrPX4fIfOyEpW/CsiNd8JdYrAoOvafRTOEnvsO++qCqFA==", "dev": true }, "fs-extra": { diff --git a/onnxruntime/wasm/js_internal_api.js b/onnxruntime/wasm/js_internal_api.js index cbc60c70b57aa..90d8b737252e5 100644 --- a/onnxruntime/wasm/js_internal_api.js +++ b/onnxruntime/wasm/js_internal_api.js @@ -4,39 +4,27 @@ 'use strict'; /** - * Mount external data files of a model to the virtual file system (MEMFS). + * Mount external data files of a model to an internal map, which will be used during session initialization. * * @param {string} externalDataFilesPath * @param {Uint8Array} externalDataFilesData */ Module['mountExternalData'] = (externalDataFilePath, externalDataFileData) => { const files = Module.MountedFiles || (Module.MountedFiles = new Map()); - files.set(externalDataFilePath, externalDataFileData); + files.set(externalDataFilePath, externalDataFileData); }; /** - * Unmount external data files of a model from the virtual file system (MEMFS). + * Unmount external data files of a model. */ Module['unmountExternalData'] = () => { delete Module.MountedFiles; }; /** - * init JSEP + * initialize JSEP for asyncify support. */ -Module['jsepInit'] = (backend, alloc, free, copy, copyAsync, createKernel, releaseKernel, runKernel, captureBegin, captureEnd, replay) => { - Module.jsepBackend = backend; - Module.jsepAlloc = alloc; - Module.jsepFree = free; - Module.jsepCopy = copy; - Module.jsepCopyAsync = copyAsync; - Module.jsepCreateKernel = createKernel; - Module.jsepReleaseKernel = releaseKernel; - Module.jsepRunKernel = runKernel; - Module.jsepCaptureBegin = captureBegin; - Module.jsepCaptureEnd = captureEnd; - Module.jsepReplay = replay; - +let jsepInitAsync = () => { // This is a simplified version of cwrap() with options.async === true (-sASYNCIFY=1) // It removes some overhead in cwarp() and ccall() that we don't need. // @@ -143,7 +131,7 @@ Module['jsepInit'] = (backend, alloc, free, copy, copyAsync, createKernel, relea } // Flush the backend. This will submit all pending commands to the GPU. - backend['flush'](); + Module.jsepBackend?.['flush'](); // Await all pending promises. This includes GPU validation promises for diagnostic purposes. 
const errorPromises = state.errors; @@ -180,20 +168,46 @@ Module['jsepInit'] = (backend, alloc, free, copy, copyAsync, createKernel, relea () => Module['_OrtBindInput'], v => Module['_OrtBindInput'] = v); - // expose webgpu backend functions - Module['jsepRegisterBuffer'] = (sessionId, index, buffer, size) => { - return backend['registerBuffer'](sessionId, index, buffer, size); - }; - Module['jsepGetBuffer'] = (dataId) => { - return backend['getBuffer'](dataId); - }; - Module['jsepCreateDownloader'] = (gpuBuffer, size, type) => { - return backend['createDownloader'](gpuBuffer, size, type); - }; - Module['jsepOnReleaseSession'] = sessionId => { - backend['onReleaseSession'](sessionId); - }; - Module['jsepOnRunStart'] = sessionId => { - return backend['onRunStart'](sessionId); - }; + // remove this function to make sure it is called only once. + jsepInitAsync = undefined; +}; + + +/** + * initialize JSEP for WebGPU. + */ +Module['jsepInit'] = (name, params) => { + jsepInitAsync?.(); + + if (name === 'webgpu') { + [Module.jsepBackend, + Module.jsepAlloc, + Module.jsepFree, + Module.jsepCopy, + Module.jsepCopyAsync, + Module.jsepCreateKernel, + Module.jsepReleaseKernel, + Module.jsepRunKernel, + Module.jsepCaptureBegin, + Module.jsepCaptureEnd, + Module.jsepReplay] = params; + + // expose webgpu backend functions + const backend = Module.jsepBackend; + Module['jsepRegisterBuffer'] = (sessionId, index, buffer, size) => { + return backend['registerBuffer'](sessionId, index, buffer, size); + }; + Module['jsepGetBuffer'] = (dataId) => { + return backend['getBuffer'](dataId); + }; + Module['jsepCreateDownloader'] = (gpuBuffer, size, type) => { + return backend['createDownloader'](gpuBuffer, size, type); + }; + Module['jsepOnReleaseSession'] = sessionId => { + backend['onReleaseSession'](sessionId); + }; + Module['jsepOnRunStart'] = sessionId => { + return backend['onRunStart'](sessionId); + }; + } }; diff --git a/orttraining/orttraining/core/graph/gradient_builder.cc b/orttraining/orttraining/core/graph/gradient_builder.cc index e675b55c8af8f..22dcf4eb92411 100755 --- a/orttraining/orttraining/core/graph/gradient_builder.cc +++ b/orttraining/orttraining/core/graph/gradient_builder.cc @@ -1112,6 +1112,7 @@ IMPLEMENT_GRADIENT_BUILDER(GetReduceMeanGradient) { ArgDef grad = GO(0); if (!keepdims) { + size_t numInputs = GetSrcNodeInputSize(); if (attributes.find("axes") != attributes.end()) { std::vector axes_values = RetrieveValues(attributes.at("axes")); grad = IA("Unqueezed_Grad"); @@ -1122,6 +1123,9 @@ IMPLEMENT_GRADIENT_BUILDER(GetReduceMeanGradient) { result.push_back(axes_values_node); result.push_back(NodeDef(OpDef{"Unsqueeze", kOnnxDomain, 13}, {GO(0), axes_values_node.output_args[0]}, {grad})); } + } else if (numInputs == 2) { // optional input 'axes' is available as input I(1) + grad = IA("Unqueezed_Grad"); + result.push_back(NodeDef("Unsqueeze", {GO(0), I(1)}, {grad})); } } @@ -1152,12 +1156,21 @@ IMPLEMENT_GRADIENT_BUILDER(GetReduceLogSumExpGradient) { } ArgDef grad = GO(0); - if (!keepdims && attributes.find("axes") != attributes.end()) { - std::vector axes_values = RetrieveValues(attributes.at("axes")); - grad = IA("Unsqueezed_Grad"); - result.push_back(NodeDef("Unsqueeze", {GO(0)}, {grad}, {MakeAttribute("axes", axes_values)})); + if (!keepdims) { + size_t numInputs = GetSrcNodeInputSize(); + if (attributes.find("axes") != attributes.end()) { + std::vector axes_values = RetrieveValues(attributes.at("axes")); + grad = IA("Unsqueezed_Grad"); - result.push_back(NodeDef("Unsqueeze", {O(0)}, 
{IA("Unsqueezed_Output")}, {MakeAttribute("axes", axes_values)})); + result.push_back(NodeDef("Unsqueeze", {GO(0)}, {grad}, {MakeAttribute("axes", axes_values)})); + + result.push_back(NodeDef("Unsqueeze", {O(0)}, {IA("Unsqueezed_Output")}, {MakeAttribute("axes", axes_values)})); + } else if (numInputs == 2) { // optional input 'axes' is available as input I(1) + grad = IA("Unsqueezed_Grad"); + result.push_back(NodeDef("Unsqueeze", {GO(0), I(1)}, {grad})); + + result.push_back(NodeDef("Unsqueeze", {O(0), I(1)}, {IA("Unsqueezed_Output")})); + } result.push_back(NodeDef("Sub", {I(0), IA("Unsqueezed_Output")}, {IA("Self_Sub_Result")})); } else { result.push_back(NodeDef("Sub", {I(0), O(0)}, {IA("Self_Sub_Result")})); @@ -1188,11 +1201,17 @@ IMPLEMENT_GRADIENT_BUILDER(GetReduceL2Gradient) { ArgDef scaled_dy_arg_def = IA("Masked_Scaled_dY"); result.emplace_back(NodeDef("Where", {IA("Masked_Y"), ZERO, IA("Scaled_dY")}, {scaled_dy_arg_def})); - if (!keepdims && attributes.find("axes") != attributes.end()) { - std::vector axes_values = RetrieveValues(attributes.at("axes")); + if (!keepdims) { + size_t numInputs = GetSrcNodeInputSize(); scaled_dy_arg_def = IA("Unsqueezed_Masked_Scaled_dY"); - result.emplace_back( - NodeDef("Unsqueeze", {IA("Masked_Scaled_dY")}, {scaled_dy_arg_def}, {MakeAttribute("axes", axes_values)})); + if (attributes.find("axes") != attributes.end()) { + std::vector axes_values = RetrieveValues(attributes.at("axes")); + result.emplace_back( + NodeDef("Unsqueeze", {IA("Masked_Scaled_dY")}, {scaled_dy_arg_def}, {MakeAttribute("axes", axes_values)})); + } else if (numInputs == 2) { // optional input 'axes' is available as input I(1) + result.emplace_back( + NodeDef("Unsqueeze", {IA("Masked_Scaled_dY"), I(1)}, {scaled_dy_arg_def})); + } } result.emplace_back(NodeDef("Mul", {I(0), scaled_dy_arg_def}, {GI(0)})); diff --git a/orttraining/orttraining/test/gradient/gradient_ops_test.cc b/orttraining/orttraining/test/gradient/gradient_ops_test.cc index feca94ae27c13..94ca96c68f2ce 100644 --- a/orttraining/orttraining/test/gradient/gradient_ops_test.cc +++ b/orttraining/orttraining/test/gradient/gradient_ops_test.cc @@ -607,6 +607,10 @@ TEST(GradientCheckerTest, ReduceMeanGrad) { OpDef op_def_opset13{"ReduceMean", kOnnxDomain, 13}; RunReductionTests(op_def_opset13); + + // axes is input from opset 18. 
+ OpDef op_def_opset18{"ReduceMean", kOnnxDomain, 18}; + RunReductionTests(op_def_opset18, true, true); } TEST(GradientCheckerTest, ReduceSumGrad) { @@ -619,6 +623,10 @@ TEST(GradientCheckerTest, ReduceSumGrad) { OpDef op_def_13{"ReduceSum", kOnnxDomain, 13}; RunReductionTests(op_def_13, true, true); + + OpDef op_def_18{"ReduceSum", kOnnxDomain, 18}; + + RunReductionTests(op_def_18, true, true); } TEST(GradientCheckerTest, ReduceL2Grad) { @@ -641,6 +649,11 @@ TEST(GradientCheckerTest, ReduceL2Grad) { {MakeAttribute("axes", axes)})); EXPECT_IS_TINY(max_error); } + + // axes is input from opset 18 + OpDef op_def_18{"ReduceL2", kOnnxDomain, 18}; + + RunReductionTests(op_def_18, true, true); } TEST(GradientCheckerTest, ReduceLogSumExpGrad) { @@ -648,6 +661,10 @@ TEST(GradientCheckerTest, ReduceLogSumExpGrad) { OpDef op_def{"ReduceLogSumExp", kOnnxDomain, 11}; RunReductionTests(op_def); + + OpDef op_def_opset18{"ReduceLogSumExp", kOnnxDomain, 18}; + + RunReductionTests(op_def_opset18, true, true); } TEST(GradientCheckerTest, ReluGrad) { @@ -698,6 +715,13 @@ TEST(GradientCheckerTest, SplitGrad) { ASSERT_STATUS_OK(gradient_checker.ComputeGradientError(op_def_13, {shape}, {{3, 5}, {3, 5}, {3, 5}}, &max_error, {MakeAttribute("axis", int64_t(0))})); EXPECT_IS_TINY(max_error); + + // opset18 test + OpDef op_def_18{"Split", kOnnxDomain, 18}; + ASSERT_STATUS_OK(gradient_checker.ComputeGradientError(op_def_18, {shape}, {{3, 5}, {3, 5}, {3, 5}}, &max_error, + {MakeAttribute("axis", int64_t(0)), + MakeAttribute("num_outputs", int64_t(3))})); + EXPECT_IS_TINY(max_error); } template @@ -2733,7 +2757,7 @@ TEST(GradientCheckerTest, TileGrad) { TEST(GradientCheckerTest, PadGrad) { float max_error; GradientChecker gradient_checker; - OpDef op_def{"Pad", kOnnxDomain, 11}; + OpDef op_def{"Pad", kOnnxDomain, 18}; { TensorInfo x_info({2, 4}, true); @@ -2803,7 +2827,7 @@ TEST(GradientCheckerTest, PadGrad) { TEST(GradientCheckerTest, ScatterNDGrad) { float max_error; GradientChecker gradient_checker; - OpDef op_def{"ScatterND", kOnnxDomain, 11}; + OpDef op_def{"ScatterND", kOnnxDomain, 18}; { TensorInfo data_info({8}, true); @@ -2887,7 +2911,7 @@ TEST(GradientCheckerTest, ScatterNDGrad) { TEST(GradientCheckerTest, ScatterElementsGrad) { float max_error; GradientChecker gradient_checker; - OpDef op_def{"ScatterElements", kOnnxDomain, 13}; + OpDef op_def{"ScatterElements", kOnnxDomain, 18}; { // without axis TensorInfo data_info({3, 3}, true); diff --git a/orttraining/orttraining/test/gradient/optimizer_ops_test.cc b/orttraining/orttraining/test/gradient/optimizer_ops_test.cc index bfb59f1525e47..18c1364f5d1f6 100644 --- a/orttraining/orttraining/test/gradient/optimizer_ops_test.cc +++ b/orttraining/orttraining/test/gradient/optimizer_ops_test.cc @@ -144,6 +144,8 @@ TEST(OptimizerTest, AdamBiasCorrection) { test.AddOutput("Moment_2_Out", {3}, {1.7400e-04f, 8.9966e-04f, 1.5102e-03f}); test.AddOutput("W_Out", {3}, {-1.4634f, -0.6416f, -1.2121f}); + test.SetOutputTolerance(0.0001f); + test.AddAttribute("do_bias_correction", static_cast(1)); test.AddAttribute("weight_decay_mode", static_cast(0)); @@ -167,6 +169,8 @@ TEST(OptimizerTest, AdamWeightDecayMode0NoBiasCorrection) { test.AddOutput("W_Out", {3}, {-3.6210f, -2.8075f, -3.3723f}); test.AddOutput("G_Out", {3}, {-3.1576f, -3.1658f, -3.1601f}); + test.SetOutputTolerance(0.0001f); + test.AddAttribute("do_bias_correction", static_cast(0)); test.AddAttribute("lambda", 0.01f); test.AddAttribute("weight_decay_mode", static_cast(0)); @@ -191,6 +195,8 @@ TEST(OptimizerTest, 
AdamWeightDecayMode0WithBiasCorrection) { test.AddOutput("W_Out", {3}, {-1.4587f, -0.6452f, -1.2099f}); test.AddOutput("G_Out", {3}, {-0.9954f, -1.0036f, -0.9979f}); + test.SetOutputTolerance(0.0001f); + test.AddAttribute("do_bias_correction", static_cast(1)); test.AddAttribute("lambda", 0.01f); test.AddAttribute("weight_decay_mode", static_cast(0)); @@ -214,6 +220,8 @@ TEST(OptimizerTest, AdamWeightDecayMode1NoBiasCorrection) { test.AddOutput("Moment_2_Out", {3}, {1.7400e-04f, 8.9966e-04f, 1.5102e-03f}); test.AddOutput("W_Out", {3}, {-3.5894f, -2.7758f, -3.3406f}); + test.SetOutputTolerance(0.0001f); + test.AddAttribute("do_bias_correction", static_cast(0)); test.AddAttribute("lambda", 0.01f); test.AddAttribute("weight_decay_mode", static_cast(1)); @@ -237,6 +245,8 @@ TEST(OptimizerTest, AdamWeightDecayMode1WithBiasCorrection) { test.AddOutput("Moment_2_Out", {3}, {1.7400e-04f, 8.9966e-04f, 1.5102e-03f}); test.AddOutput("W_Out", {3}, {-1.4488f, -0.6352f, -1.1999f}); + test.SetOutputTolerance(0.0001f); + test.AddAttribute("do_bias_correction", static_cast(1)); test.AddAttribute("lambda", 0.01f); test.AddAttribute("weight_decay_mode", static_cast(1)); @@ -368,6 +378,11 @@ TEST(OptimizerTest, AdamOptimizerMixPrecision_FP16Weight_ClipNorm_Test) { test.AddOptionalOutputEdge(); test.AddOutput("FP16_W_Out", {3}, w_new_half); + test.SetOutputAbsErr("Moment_1_Out", 0.005f); + test.SetOutputAbsErr("Moment_2_Out", 0.005f); + test.SetOutputAbsErr("W_Out", 0.001f); + test.SetOutputAbsErr("FP16_W_Out", 0.005f); + test.AddAttribute("do_bias_correction", static_cast(0)); test.AddAttribute("weight_decay_mode", static_cast(0)); test.AddAttribute("max_norm_clip", 0.001f); @@ -617,6 +632,8 @@ void run_lamb_test_with_baseline( test.AddOptionalOutputEdge(); } + test.SetOutputTolerance(0.005f); + test.Run(); } @@ -737,6 +754,8 @@ void run_multi_tensor_lamb_test_with_baseline( test.AddAttribute("ratio_min", ratio_min); test.AddAttribute("ratio_max", ratio_max); + test.SetOutputTolerance(0.005f); + test.Run(); } diff --git a/orttraining/orttraining/test/training_ops/cpu/nn/batchnorm_internal_test.cc b/orttraining/orttraining/test/training_ops/cpu/nn/batchnorm_internal_test.cc index e9795a24681cb..e89883bfd4d94 100644 --- a/orttraining/orttraining/test/training_ops/cpu/nn/batchnorm_internal_test.cc +++ b/orttraining/orttraining/test/training_ops/cpu/nn/batchnorm_internal_test.cc @@ -37,6 +37,8 @@ TEST(BatchNormInternalTest, ForwardTrainingTest) { test.AddOutput("saved_mean", channel_dims, {-0.306f, 0.114562f}); test.AddOutput("saved_inv_std", channel_dims, {1.2288f, 0.861317f}); + test.SetOutputTolerance(0.0001f); + std::vector> execution_providers; execution_providers.emplace_back(DefaultCpuExecutionProvider()); diff --git a/orttraining/orttraining/test/training_ops/cuda/batch_norm_internal_test.cc b/orttraining/orttraining/test/training_ops/cuda/batch_norm_internal_test.cc index 6335a666e0381..d842d4f1ea736 100644 --- a/orttraining/orttraining/test/training_ops/cuda/batch_norm_internal_test.cc +++ b/orttraining/orttraining/test/training_ops/cuda/batch_norm_internal_test.cc @@ -68,6 +68,7 @@ static void TestBatchNormInternal(bool test_double = false, bool T_is_half = fal test.AddOutput("running_var", channel_dims, running_var_double); test.AddOutput("saved_mean", channel_dims, saved_mean_double); test.AddOutput("saved_inv_std", channel_dims, saved_inv_std_double); + test.SetOutputTolerance(0.0001f); } else { if (T_is_half) { std::vector X_half(X.size()); diff --git 
a/orttraining/orttraining/test/training_ops/cuda/layer_norm_test.cc b/orttraining/orttraining/test/training_ops/cuda/layer_norm_test.cc index e86aa871b6c5f..13ad2f6150acf 100644 --- a/orttraining/orttraining/test/training_ops/cuda/layer_norm_test.cc +++ b/orttraining/orttraining/test/training_ops/cuda/layer_norm_test.cc @@ -49,7 +49,7 @@ static void TestLayerNormGrad( test.AddAttribute("axis", axis); - RandomValueGenerator random{}; + RandomValueGenerator random{optional{2345}}; const auto Y_grad_data = random.Uniform(n_x_m_dims, k_random_data_min, k_random_data_max); const auto X_data = random.Uniform(n_x_m_dims, k_random_data_min, k_random_data_max); const auto scale_data = random.Uniform(m_dims, k_random_data_min, k_random_data_max); @@ -152,7 +152,7 @@ static void TestInvertibleLayerNormGrad( test.AddAttribute("axis", axis); - RandomValueGenerator random{}; + RandomValueGenerator random{optional{2345}}; const auto Y_grad_data = random.Uniform(n_x_m_dims, k_random_data_min, k_random_data_max); const auto X_data = random.Uniform(n_x_m_dims, k_random_data_min, k_random_data_max); const auto scale_data = random.Uniform(m_dims, k_random_data_min, k_random_data_max); diff --git a/orttraining/orttraining/training_ops/cuda/communication/nccl_service.cc b/orttraining/orttraining/training_ops/cuda/communication/nccl_service.cc index f604e4c4aaf3e..c642a87e22de6 100644 --- a/orttraining/orttraining/training_ops/cuda/communication/nccl_service.cc +++ b/orttraining/orttraining/training_ops/cuda/communication/nccl_service.cc @@ -233,6 +233,7 @@ void NcclService::Initialize() { // CPUs // Other devices +#ifdef USE_MPI const int mpi_rank = onnxruntime::training::MPIContext::GetInstance().GetWorldRank(); const int mpi_local_rank = onnxruntime::training::MPIContext::GetInstance().GetLocalRank(); const int mpi_size = onnxruntime::training::MPIContext::GetInstance().GetWorldSize(); @@ -248,6 +249,7 @@ void NcclService::Initialize() { if (mpi_rank == 0) NCCL_CALL_THROW(ncclGetUniqueId(&id)); MPI_CHECK(MPI_Bcast((void*)&id, sizeof(id), MPI_BYTE, 0, MPI_COMM_WORLD)); NCCL_CALL_THROW(ncclCommInitRank(&comm_, mpi_size, id, mpi_rank)); +#endif // USE_MPI } void NcclService::Launch() { diff --git a/setup.py b/setup.py index ac7a70b991fbf..ffe2958b357b8 100644 --- a/setup.py +++ b/setup.py @@ -232,6 +232,8 @@ def run(self): tensorrt_dependencies = ["libnvinfer.so.8", "libnvinfer_plugin.so.8", "libnvonnxparser.so.8"] + cann_dependencies = ["libascendcl.so", "libacl_op_compiler.so", "libfmk_onnx_parser.so"] + dest = "onnxruntime/capi/libonnxruntime_providers_openvino.so" if path.isfile(dest): subprocess.run( @@ -255,7 +257,7 @@ def run(self): file = glob(path.join(self.dist_dir, "*linux*.whl"))[0] logger.info("repairing %s for manylinux1", file) auditwheel_cmd = ["auditwheel", "-v", "repair", "-w", self.dist_dir, file] - for i in cuda_dependencies + rocm_dependencies + tensorrt_dependencies: + for i in cuda_dependencies + rocm_dependencies + tensorrt_dependencies + cann_dependencies: auditwheel_cmd += ["--exclude", i] logger.info("Running %s", " ".join([shlex.quote(arg) for arg in auditwheel_cmd])) try: diff --git a/tools/ci_build/build.py b/tools/ci_build/build.py index 067f151844b1b..3c1bdfc54c12e 100644 --- a/tools/ci_build/build.py +++ b/tools/ci_build/build.py @@ -73,13 +73,14 @@ def _str_to_bool(s): def _openvino_verify_device_type(device_read): - choices = ["CPU_FP32", "CPU_FP16", "GPU_FP32", "GPU_FP16"] + choices = ["CPU_FP32", "CPU_FP16", "GPU_FP32", "GPU_FP16", "NPU"] choices1 = [ "CPU_FP32_NO_PARTITION", 
"CPU_FP16_NO_PARTITION", "GPU_FP32_NO_PARTITION", "GPU_FP16_NO_PARTITION", + "NPU_NO_PARTITION", ] status_hetero = True res = False @@ -94,7 +95,7 @@ def _openvino_verify_device_type(device_read): if len(comma_separated_devices) < 2: print("At least two devices required in Hetero/Multi/Auto Mode") status_hetero = False - dev_options = ["CPU", "GPU"] + dev_options = ["CPU", "GPU", "NPU"] for dev in comma_separated_devices: if dev not in dev_options: status_hetero = False @@ -105,7 +106,7 @@ def invalid_hetero_build(): print("specify the keyword HETERO or MULTI or AUTO followed by the devices ") print("in the order of priority you want to build\n") print("The different hardware devices that can be added in HETERO or MULTI or AUTO") - print("are ['CPU','GPU'] \n") + print("are ['CPU','GPU','NPU'] \n") print("An example of how to specify the hetero build type. Ex: HETERO:GPU,CPU \n") print("An example of how to specify the MULTI build type. Ex: MULTI:GPU,CPU \n") print("An example of how to specify the AUTO build type. Ex: AUTO:GPU,CPU \n") @@ -400,6 +401,12 @@ def convert_arg_line_to_args(self, arg_line): parser.add_argument("--ios", action="store_true", help="build for ios") + parser.add_argument( + "--macos", + choices=["MacOSX", "Catalyst"], + help="Specify the target platform for macOS build. Only specify this argument when --build_apple_framework is present.", + ) + parser.add_argument( "--apple_sysroot", default="", help="Specify the location name of the macOS platform SDK to be used" ) @@ -419,7 +426,7 @@ def convert_arg_line_to_args(self, arg_line): action="store_const", const="Xcode", dest="cmake_generator", - help="Use Xcode as cmake generator, this is only supported on MacOS. Equivalent to '--cmake_generator Xcode'.", + help="Use Xcode as cmake generator, this is only supported on MacOS. (non Catalyst build). Equivalent to '--cmake_generator Xcode'.", ) parser.add_argument( "--osx_arch", @@ -1220,6 +1227,7 @@ def generate_build_tree( "-Donnxruntime_USE_OPENVINO_GPU_FP16=" + ("ON" if args.use_openvino == "GPU_FP16" else "OFF"), "-Donnxruntime_USE_OPENVINO_CPU_FP32=" + ("ON" if args.use_openvino == "CPU_FP32" else "OFF"), "-Donnxruntime_USE_OPENVINO_CPU_FP16=" + ("ON" if args.use_openvino == "CPU_FP16" else "OFF"), + "-Donnxruntime_USE_OPENVINO_NPU=" + ("ON" if args.use_openvino == "NPU" else "OFF"), "-Donnxruntime_USE_OPENVINO_GPU_FP32_NP=" + ("ON" if args.use_openvino == "GPU_FP32_NO_PARTITION" else "OFF"), "-Donnxruntime_USE_OPENVINO_GPU_FP16_NP=" @@ -1228,6 +1236,7 @@ def generate_build_tree( + ("ON" if args.use_openvino == "CPU_FP32_NO_PARTITION" else "OFF"), "-Donnxruntime_USE_OPENVINO_CPU_FP16_NP=" + ("ON" if args.use_openvino == "CPU_FP16_NO_PARTITION" else "OFF"), + "-Donnxruntime_USE_OPENVINO_NPU_NP=" + ("ON" if args.use_openvino == "NPU_NO_PARTITION" else "OFF"), "-Donnxruntime_USE_OPENVINO_HETERO=" + ("ON" if args.use_openvino.startswith("HETERO") else "OFF"), "-Donnxruntime_USE_OPENVINO_DEVICE=" + (args.use_openvino), "-Donnxruntime_USE_OPENVINO_MULTI=" + ("ON" if args.use_openvino.startswith("MULTI") else "OFF"), @@ -1323,8 +1332,12 @@ def generate_build_tree( if args.use_snpe: cmake_args += ["-Donnxruntime_USE_SNPE=ON"] - if args.build_apple_framework or args.ios: - if not args.cmake_generator == "Xcode": + if args.macos or args.ios: + # Note: Xcode CMake generator doesn't have a good support for Mac Catalyst yet. 
+ if args.macos == "Catalyst" and args.cmake_generator == "Xcode": + raise BuildError("Xcode CMake generator ('--cmake_generator Xcode') doesn't support Mac Catalyst build.") + + if (args.ios or args.macos == "MacOSX") and not args.cmake_generator == "Xcode": raise BuildError( "iOS/MacOS framework build requires use of the Xcode CMake generator ('--cmake_generator Xcode')." ) @@ -1342,12 +1355,15 @@ def generate_build_tree( "iOS/MacOS framework build on MacOS canceled due to missing arguments: " + ", ".join(val for val, cond in zip(arg_names, needed_args) if not cond) ) + # note: this value is mainly used in framework_info.json file to specify the build osx type + platform_name = "macabi" if args.macos == "Catalyst" else args.apple_sysroot cmake_args += [ "-Donnxruntime_BUILD_SHARED_LIB=ON", "-DCMAKE_OSX_SYSROOT=" + args.apple_sysroot, "-DCMAKE_OSX_DEPLOYMENT_TARGET=" + args.apple_deploy_target, # we do not need protoc binary for ios cross build "-Dprotobuf_BUILD_PROTOC_BINARIES=OFF", + "-DPLATFORM_NAME=" + platform_name, ] if args.ios: cmake_args += [ @@ -1355,6 +1371,21 @@ def generate_build_tree( "-DCMAKE_TOOLCHAIN_FILE=" + (args.ios_toolchain_file if args.ios_toolchain_file else "../cmake/onnxruntime_ios.toolchain.cmake"), ] + # for catalyst build, we need to manually specify cflags for target e.g. x86_64-apple-ios14.0-macabi, etc. + # https://forums.developer.apple.com/forums/thread/122571 + if args.macos == "Catalyst": + macabi_target = f"{args.osx_arch}-apple-ios{args.apple_deploy_target}-macabi" + cmake_args += [ + "-DCMAKE_CXX_COMPILER_TARGET=" + macabi_target, + "-DCMAKE_C_COMPILER_TARGET=" + macabi_target, + "-DCMAKE_CC_COMPILER_TARGET=" + macabi_target, + f"-DCMAKE_CXX_FLAGS=--target={macabi_target}", + f"-DCMAKE_CXX_FLAGS_RELEASE=-O3 -DNDEBUG --target={macabi_target}", + f"-DCMAKE_C_FLAGS=--target={macabi_target}", + f"-DCMAKE_C_FLAGS_RELEASE=-O3 -DNDEBUG --target={macabi_target}", + f"-DCMAKE_CC_FLAGS=--target={macabi_target}", + f"-DCMAKE_CC_FLAGS_RELEASE=-O3 -DNDEBUG --target={macabi_target}", + ] if args.build_wasm: emsdk_dir = os.path.join(cmake_dir, "external", "emsdk") @@ -2740,7 +2771,13 @@ def main(): cmake_extra_args += ["-G", args.cmake_generator] if is_macOS(): - if not args.ios and not args.android and args.osx_arch == "arm64" and platform.machine() == "x86_64": + if ( + not args.ios + and args.macos != "Catalyst" + and not args.android + and args.osx_arch == "arm64" + and platform.machine() == "x86_64" + ): if args.test: log.warning("Cannot test ARM64 build on X86_64. 
Will skip test running after build.") args.test = False diff --git a/tools/ci_build/github/apple/build_apple_framework.py b/tools/ci_build/github/apple/build_apple_framework.py index 7b8a87632f5c7..e17bcd65d8814 100644 --- a/tools/ci_build/github/apple/build_apple_framework.py +++ b/tools/ci_build/github/apple/build_apple_framework.py @@ -50,9 +50,11 @@ def _build_for_apple_sysroot( # Build binary for each arch, one by one for current_arch in archs: build_dir_current_arch = os.path.join(intermediates_dir, sysroot + "_" + current_arch) + # Use MacOS SDK for Catalyst builds + apple_sysroot = "macosx" if sysroot == "macabi" else sysroot build_command = [ *base_build_command, - "--apple_sysroot=" + sysroot, + "--apple_sysroot=" + apple_sysroot, "--osx_arch=" + current_arch, "--build_dir=" + build_dir_current_arch, ] diff --git a/tools/ci_build/github/apple/default_full_apple_framework_build_settings.json b/tools/ci_build/github/apple/default_full_apple_framework_build_settings.json index 86b4efdc63750..04a73ae450e5f 100644 --- a/tools/ci_build/github/apple/default_full_apple_framework_build_settings.json +++ b/tools/ci_build/github/apple/default_full_apple_framework_build_settings.json @@ -23,6 +23,7 @@ "--cmake_extra_defines=onnxruntime_BUILD_UNIT_TESTS=OFF" ], "macosx": [ + "--macos=MacOSX", "--apple_deploy_target=11.0" ], "iphoneos": [ diff --git a/tools/ci_build/github/apple/default_full_ios_framework_build_settings.json b/tools/ci_build/github/apple/default_full_ios_framework_build_settings.json index 445bfca9889ff..4bc978956d7fc 100644 --- a/tools/ci_build/github/apple/default_full_ios_framework_build_settings.json +++ b/tools/ci_build/github/apple/default_full_ios_framework_build_settings.json @@ -6,25 +6,35 @@ "iphonesimulator": [ "arm64", "x86_64" + ], + "macabi": [ + "arm64", + "x86_64" ] }, "build_params": { "base": [ "--parallel", - "--use_xcode", "--build_apple_framework", "--use_coreml", - "--use_xnnpack", "--skip_tests", "--cmake_extra_defines=onnxruntime_BUILD_UNIT_TESTS=OFF" ], "iphoneos": [ "--ios", + "--use_xcode", + "--use_xnnpack", "--apple_deploy_target=12.0" ], "iphonesimulator": [ "--ios", + "--use_xcode", + "--use_xnnpack", "--apple_deploy_target=12.0" + ], + "macabi":[ + "--macos=Catalyst", + "--apple_deploy_target=14.0" ] } } diff --git a/tools/ci_build/github/apple/default_training_ios_framework_build_settings.json b/tools/ci_build/github/apple/default_training_ios_framework_build_settings.json index f88934cd44a66..2066af7843e0a 100644 --- a/tools/ci_build/github/apple/default_training_ios_framework_build_settings.json +++ b/tools/ci_build/github/apple/default_training_ios_framework_build_settings.json @@ -32,6 +32,7 @@ "--apple_deploy_target=12.0" ], "macosx": [ + "--macos=MacOSX", "--apple_deploy_target=11.0" ] } diff --git a/tools/ci_build/github/apple/framework_info.json.template b/tools/ci_build/github/apple/framework_info.json.template index b4c4fb8d16ebf..1f7eeb5948799 100644 --- a/tools/ci_build/github/apple/framework_info.json.template +++ b/tools/ci_build/github/apple/framework_info.json.template @@ -1,5 +1,5 @@ { - "@CMAKE_OSX_SYSROOT@": { + "@PLATFORM_NAME@": { "APPLE_DEPLOYMENT_TARGET": "@CMAKE_OSX_DEPLOYMENT_TARGET@", "WEAK_FRAMEWORK": "@APPLE_WEAK_FRAMEWORK@" } diff --git a/tools/ci_build/github/apple/test_apple_packages.py b/tools/ci_build/github/apple/test_apple_packages.py index 3c0df994ffd3d..3987a37fcc76c 100644 --- a/tools/ci_build/github/apple/test_apple_packages.py +++ b/tools/ci_build/github/apple/test_apple_packages.py @@ -176,6 +176,25 @@ 
def _test_apple_packages(args): break + if args.mac_catalyst_enabled: + subprocess.run( + [ + "xcrun", + "xcodebuild", + "test", + "-workspace", + "./apple_package_test.xcworkspace", + "-scheme", + "ios_package_test", + "-destination", + "platform=macOS,variant=Mac Catalyst", + "CODE_SIGNING_ALLOWED=NO", + ], + shell=False, + check=True, + cwd=target_proj_path, + ) + if PackageVariant[args.variant] != PackageVariant.Mobile and not args.skip_macos_test: subprocess.run( [ @@ -244,6 +263,12 @@ def parse_args(): help="Skip macos platform tests. Specify this argument when build targets only contain ios archs. ", ) + parser.add_argument( + "--mac_catalyst_enabled", + action="store_true", + help="Run tests for mac catalyst variants. Specify this argument when build targets contains catalyst archs. ", + ) + return parser.parse_args() diff --git a/tools/ci_build/github/azure-pipelines/linux-gpu-tensorrt-daily-perf-pipeline.yml b/tools/ci_build/github/azure-pipelines/linux-gpu-tensorrt-daily-perf-pipeline.yml index 15f558e6f9ef0..af2d722a6b90c 100644 --- a/tools/ci_build/github/azure-pipelines/linux-gpu-tensorrt-daily-perf-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/linux-gpu-tensorrt-daily-perf-pipeline.yml @@ -28,10 +28,15 @@ parameters: - "partner-models" - name: MemTest - displayName: Run Memory Test + displayName: Run Memory Test and Concurrency Test type: boolean default: true +- name: ConcurrencyTest + displayName: Specifies the number of concurrency model test to invoke simultaneously + type: string + default: 2 + - name: TrtEPOptions displayName: TensorRT EP options type: object @@ -107,8 +112,8 @@ jobs: workingDirectory: '$(Build.SourcesDirectory)/onnxruntime/python/tools/tensorrt/perf/build' - ${{ if eq(parameters.MemTest, true) }}: - - script: '$(Build.SourcesDirectory)/onnxruntime/python/tools/tensorrt/perf/mem_test/run_mem_test_docker.sh -d $(image) -p $(Build.SourcesDirectory)/onnxruntime/python/tools/tensorrt/perf/mem_test/ -w /code/ -l false' - displayName: 'Run Memory Test' + - script: '$(Build.SourcesDirectory)/onnxruntime/python/tools/tensorrt/perf/mem_test/run_mem_test_docker.sh -d $(image) -p $(Build.SourcesDirectory)/onnxruntime/python/tools/tensorrt/perf/mem_test/ -w /code/ -l false -c ${{ parameters.ConcurrencyTest }}' + displayName: 'Run Memory Test and Concurrency Test' workingDirectory: '$(Build.SourcesDirectory)/onnxruntime/python/tools/tensorrt/perf/mem_test/' - ${{ each option in parameters.ModelGroups }}: @@ -152,7 +157,7 @@ jobs: displayName: 'Check and Install Azure CLI' - task: AzureCLI@2 - displayName: 'Azure CLI Post to Dashboard' + displayName: 'Post EP Perf Results to Dashboard' inputs: azureSubscription: AIInfraBuildOnnxRuntimeOSS scriptLocation: inlineScript @@ -160,8 +165,8 @@ jobs: inlineScript: | short_hash=$(git rev-parse --short HEAD) && commit_date=$(git log -1 --date=iso-strict --pretty=format:%cd) && - python3 $(Build.SourcesDirectory)/onnxruntime/python/tools/tensorrt/perf/post.py -r $(Build.SourcesDirectory)/Artifact/result -c $short_hash -d $commit_date -u "$(reportUrl)?buildId=$(Build.BuildId)" -t $(trtVersion) -b $(branchName) --kusto_conn $(kustoConn) --database $(database) $(parser) - + python3 $(Build.SourcesDirectory)/onnxruntime/python/tools/tensorrt/perf/post.py -r $(Build.SourcesDirectory)/Artifact -c $short_hash -d $commit_date -u "$(reportUrl)?buildId=$(Build.BuildId)" -t $(trtVersion) -b $(branchName) --kusto_conn $(kustoConn) --database $(database) $(parser) + - template: 
templates/component-governance-component-detection-steps.yml parameters : condition : 'succeeded' diff --git a/tools/ci_build/github/azure-pipelines/linux-migraphx-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/linux-migraphx-ci-pipeline.yml index 9cf7a3fb42397..8b58d958ba899 100644 --- a/tools/ci_build/github/azure-pipelines/linux-migraphx-ci-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/linux-migraphx-ci-pipeline.yml @@ -109,6 +109,7 @@ jobs: --rocm_version=$(RocmVersion) \ --rocm_home /opt/rocm \ --nccl_home /opt/rocm \ + --enable_nccl \ --update \ --build_dir /build \ --build \ diff --git a/tools/ci_build/github/azure-pipelines/linux-openvino-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/linux-openvino-ci-pipeline.yml index c92fc93abba37..03e0274fc198a 100644 --- a/tools/ci_build/github/azure-pipelines/linux-openvino-ci-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/linux-openvino-ci-pipeline.yml @@ -32,5 +32,5 @@ jobs: parameters: AgentPool : 'Linux-CPU-2019' JobName: 'Linux_CI_Dev' - RunDockerBuildArgs: '-o ubuntu20.04 -d openvino -v 2023.0.0 -x "--use_openvino CPU_FP32 --build_wheel"' + RunDockerBuildArgs: '-o ubuntu20.04 -d openvino -v 2024.0.0 -x "--use_openvino CPU_FP32 --build_wheel"' TimeoutInMinutes: 120 diff --git a/tools/ci_build/github/azure-pipelines/mac-coreml-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/mac-coreml-ci-pipeline.yml index a3f56f5c448a9..f0a35d809c700 100644 --- a/tools/ci_build/github/azure-pipelines/mac-coreml-ci-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/mac-coreml-ci-pipeline.yml @@ -32,7 +32,7 @@ jobs: workspace: clean: all pool: - vmImage: 'macOS-13' + vmImage: 'macOS-latest' variables: MACOSX_DEPLOYMENT_TARGET: '11.0' TODAY: $[format('{0:dd}{0:MM}{0:yyyy}', pipeline.startTime)] @@ -43,6 +43,8 @@ jobs: displayName: Install coreutils and ninja - template: templates/use-xcode-version.yml + parameters: + xcodeVersion: 14.2 - template: templates/mac-build-step-with-cache.yml parameters: diff --git a/tools/ci_build/github/azure-pipelines/mac-ios-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/mac-ios-ci-pipeline.yml index a1ca68c8279e7..255531681b039 100644 --- a/tools/ci_build/github/azure-pipelines/mac-ios-ci-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/mac-ios-ci-pipeline.yml @@ -30,7 +30,7 @@ pr: jobs: - job: iOS_CI_on_Mac pool: - vmImage: 'macOS-13' + vmImage: 'macOS-latest' variables: PROTO_CACHE_DIR: $(Pipeline.Workspace)/proto_ccache ORT_CACHE_DIR: $(Pipeline.Workspace)/ort_ccache @@ -39,7 +39,7 @@ jobs: steps: - template: templates/use-xcode-version.yml parameters: - xcodeVersion: 14.3 + xcodeVersion: 14.2 - template: templates/mac-build-step-with-cache.yml parameters: WithCache: true diff --git a/tools/ci_build/github/azure-pipelines/mac-ios-packaging-pipeline.yml b/tools/ci_build/github/azure-pipelines/mac-ios-packaging-pipeline.yml index 5fd15b64e03b6..881023e1c1186 100644 --- a/tools/ci_build/github/azure-pipelines/mac-ios-packaging-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/mac-ios-packaging-pipeline.yml @@ -53,7 +53,7 @@ stages: displayName: "Set common variables" pool: - vmImage: "macOS-13" + vmImage: "macOS-latest" timeoutInMinutes: 5 diff --git a/tools/ci_build/github/azure-pipelines/nuget/templates/dml-vs-2022.yml b/tools/ci_build/github/azure-pipelines/nuget/templates/dml-vs-2022.yml index d6bb415a68ee6..3a3375a313ca5 100644 --- a/tools/ci_build/github/azure-pipelines/nuget/templates/dml-vs-2022.yml +++ 
b/tools/ci_build/github/azure-pipelines/nuget/templates/dml-vs-2022.yml @@ -188,7 +188,7 @@ stages: displayName: 'Publish unit test results' inputs: testResultsFiles: '**\*.results.xml' - searchFolder: '$(Build.BinariesDirectory)' + searchFolder: '$(Build.BinariesDirectory)\$(BuildConfig)\$(BuildConfig)' testRunTitle: 'Unit Test Run' condition: succeededOrFailed() diff --git a/tools/ci_build/github/azure-pipelines/orttraining-py-packaging-pipeline-cpu.yml b/tools/ci_build/github/azure-pipelines/orttraining-py-packaging-pipeline-cpu.yml index bf1ba71b7b818..4ca122f639551 100644 --- a/tools/ci_build/github/azure-pipelines/orttraining-py-packaging-pipeline-cpu.yml +++ b/tools/ci_build/github/azure-pipelines/orttraining-py-packaging-pipeline-cpu.yml @@ -46,7 +46,7 @@ stages: --build-arg PYTHON_VERSION=$(PythonVersion) --build-arg INSTALL_DEPS_EXTRA_ARGS=-tu --build-arg BUILD_UID=$(id -u) - Repository: onnxruntimetrainingcpubuild + Repository: onnxruntimetrainingcpubuild_$(PythonVersion) - task: CmdLine@2 displayName: 'build onnxruntime' @@ -63,7 +63,7 @@ stages: -e BUILD_BUILDNUMBER \ -e ORT_DISABLE_PYTHON_PACKAGE_LOCAL_VERSION \ -e DEFAULT_TRAINING_PACKAGE_DEVICE \ - onnxruntimetrainingcpubuild \ + onnxruntimetrainingcpubuild_$(PythonVersion) \ $(PythonManylinuxDir)/bin/python3 /onnxruntime_src/tools/ci_build/build.py \ --build_dir /build --cmake_generator Ninja \ --config Debug Release \ diff --git a/tools/ci_build/github/azure-pipelines/py-cuda-packaging-pipeline.yml b/tools/ci_build/github/azure-pipelines/py-cuda-packaging-pipeline.yml index aee42d3675087..20646d3ba4a26 100644 --- a/tools/ci_build/github/azure-pipelines/py-cuda-packaging-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/py-cuda-packaging-pipeline.yml @@ -21,6 +21,15 @@ parameters: values: - 11.8 - 12.2 + - name: SpecificArtifact + displayName: Use Specific Artifact + type: boolean + default: false + + - name: BuildId + displayName: Specific Artifact's BuildId + type: string + default: '0' resources: repositories: @@ -36,4 +45,6 @@ stages: enable_linux_gpu: ${{ parameters.enable_linux_gpu }} enable_windows_gpu: ${{ parameters.enable_windows_gpu }} cmake_build_type: ${{ parameters.cmake_build_type }} - cuda_version: ${{ parameters.cuda_version }} \ No newline at end of file + cuda_version: ${{ parameters.cuda_version }} + SpecificArtifact: ${{ parameters.SpecificArtifact }} + BuildId: ${{ parameters.BuildId }} diff --git a/tools/ci_build/github/azure-pipelines/stages/py-cuda-packaging-stage.yml b/tools/ci_build/github/azure-pipelines/stages/py-cuda-packaging-stage.yml index f82c80d4d7e93..a2c1eeef632c1 100644 --- a/tools/ci_build/github/azure-pipelines/stages/py-cuda-packaging-stage.yml +++ b/tools/ci_build/github/azure-pipelines/stages/py-cuda-packaging-stage.yml @@ -34,72 +34,40 @@ parameters: - 11.8 - 12.2 -stages: -- stage: Python_Packaging - dependsOn: [] - variables: - - name: docker_base_image - ${{ if eq(parameters.cuda_version, '11.8') }}: - value: nvidia/cuda:11.8.0-cudnn8-devel-ubi8 - ${{ if eq(parameters.cuda_version, '12.2') }}: - value: nvidia/cuda:12.2.2-cudnn8-devel-ubi8 - - name: linux_trt_version - ${{ if eq(parameters.cuda_version, '11.8') }}: - value: 8.6.1.6-1.cuda11.8 - ${{ if eq(parameters.cuda_version, '12.2') }}: - value: 8.6.1.6-1.cuda12.0 - - name: win_trt_home - ${{ if eq(parameters.cuda_version, '11.8') }}: - value: $(Agent.TempDirectory)\TensorRT-8.6.1.6.Windows10.x86_64.cuda-11.8 - ${{ if eq(parameters.cuda_version, '12.2') }}: - value: 
$(Agent.TempDirectory)\TensorRT-8.6.1.6.Windows10.x86_64.cuda-12.0 - - name: win_cuda_home - ${{ if eq(parameters.cuda_version, '11.8') }}: - value: $(Agent.TempDirectory)\v11.8 - ${{ if eq(parameters.cuda_version, '12.2') }}: - value: $(Agent.TempDirectory)\v12.2 - jobs: - - ${{ if eq(parameters.enable_windows_gpu, true) }}: - - template: ../templates/py-win-gpu.yml - parameters: - MACHINE_POOL: 'onnxruntime-Win2022-GPU-T4' - PYTHON_VERSION: '3.8' - EP_BUILD_FLAGS: --use_tensorrt --tensorrt_home=${{ variables.win_trt_home }} --cuda_home=${{ variables.win_cuda_home }} --cmake_extra_defines "CMAKE_CUDA_ARCHITECTURES=52;60;61;70;75;80" - EP_NAME: gpu - CudaVersion: ${{ parameters.cuda_version }} - - - template: ../templates/py-win-gpu.yml - parameters: - MACHINE_POOL: 'onnxruntime-Win2022-GPU-T4' - PYTHON_VERSION: '3.9' - EP_BUILD_FLAGS: --use_tensorrt --tensorrt_home=${{ variables.win_trt_home }} --cuda_home=${{ variables.win_cuda_home }} --cmake_extra_defines "CMAKE_CUDA_ARCHITECTURES=52;60;61;70;75;80" - EP_NAME: gpu - CudaVersion: ${{ parameters.cuda_version }} +- name: SpecificArtifact + displayName: Use Specific Artifact + type: boolean + default: false - - template: ../templates/py-win-gpu.yml - parameters: - MACHINE_POOL: 'onnxruntime-Win2022-GPU-T4' - PYTHON_VERSION: '3.10' - EP_BUILD_FLAGS: --use_tensorrt --tensorrt_home=${{ variables.win_trt_home }} --cuda_home=${{ variables.win_cuda_home }} --cmake_extra_defines "CMAKE_CUDA_ARCHITECTURES=52;60;61;70;75;80" - EP_NAME: gpu - CudaVersion: ${{ parameters.cuda_version }} +- name: BuildId + displayName: Specific Artifact's BuildId + type: string + default: '0' - - template: ../templates/py-win-gpu.yml - parameters: - MACHINE_POOL: 'onnxruntime-Win2022-GPU-T4' - PYTHON_VERSION: '3.11' - EP_BUILD_FLAGS: --use_tensorrt --tensorrt_home=${{ variables.win_trt_home }} --cuda_home=${{ variables.win_cuda_home }} --cmake_extra_defines "CMAKE_CUDA_ARCHITECTURES=52;60;61;70;75;80" - EP_NAME: gpu - CudaVersion: ${{ parameters.cuda_version }} +- name: PythonVersions + type: object + displayName: 'Python versions to build' + default: + - '3.8' + - '3.9' + - '3.10' + - '3.11' + - '3.12' +stages: + - ${{ if eq(parameters.enable_windows_gpu, true) }}: + - ${{ each python_version in parameters.PythonVersions }}: - template: ../templates/py-win-gpu.yml parameters: - MACHINE_POOL: 'onnxruntime-Win2022-GPU-T4' - PYTHON_VERSION: '3.12' - EP_BUILD_FLAGS: --use_tensorrt --tensorrt_home=${{ variables.win_trt_home }} --cuda_home=${{ variables.win_cuda_home }} --cmake_extra_defines "CMAKE_CUDA_ARCHITECTURES=52;60;61;70;75;80" + PYTHON_VERSION: ${{ python_version }} EP_NAME: gpu CudaVersion: ${{ parameters.cuda_version }} - + SpecificArtifact: ${{ parameters.SpecificArtifact }} + BuildId: ${{ parameters.BuildId }} + ${{ if eq(parameters.cuda_version, '11.8') }}: + EP_BUILD_FLAGS: --use_tensorrt --tensorrt_home=$(Agent.TempDirectory)\TensorRT-8.6.1.6.Windows10.x86_64.cuda-11.8 --cuda_home=$(Agent.TempDirectory)\v11.8 --cmake_extra_defines "CMAKE_CUDA_ARCHITECTURES=52;60;61;70;75;80" + ${{ if eq(parameters.cuda_version, '12.2') }}: + EP_BUILD_FLAGS: --use_tensorrt --tensorrt_home=$(Agent.TempDirectory)\TensorRT-8.6.1.6.Windows10.x86_64.cuda-12.0 --cuda_home=$(Agent.TempDirectory)\v12.2 --cmake_extra_defines "CMAKE_CUDA_ARCHITECTURES=52;60;61;70;75;80" - ${{ if eq(parameters.enable_linux_gpu, true) }}: - template: ../templates/py-linux-gpu.yml @@ -108,6 +76,10 @@ stages: machine_pool: 'onnxruntime-Ubuntu2204-AMD-CPU' extra_build_arg: ${{ 
parameters.build_py_parameters }} cmake_build_type: ${{ parameters.cmake_build_type }} - docker_base_image: ${{ variables.docker_base_image }} - trt_version: ${{ variables.linux_trt_version }} cuda_version: ${{ parameters.cuda_version }} + ${{ if eq(parameters.cuda_version, '11.8') }}: + docker_base_image: nvidia/cuda:11.8.0-cudnn8-devel-ubi8 + trt_version: 8.6.1.6-1.cuda11.8 + ${{ if eq(parameters.cuda_version, '12.2') }}: + docker_base_image: nvidia/cuda:12.2.2-cudnn8-devel-ubi8 + trt_version: 8.6.1.6-1.cuda12.0 diff --git a/tools/ci_build/github/azure-pipelines/templates/c-api-cpu.yml b/tools/ci_build/github/azure-pipelines/templates/c-api-cpu.yml index 1ba0b02560aca..0bb9fad6716b7 100644 --- a/tools/ci_build/github/azure-pipelines/templates/c-api-cpu.yml +++ b/tools/ci_build/github/azure-pipelines/templates/c-api-cpu.yml @@ -138,7 +138,8 @@ stages: --framework_info_file "$(Build.BinariesDirectory)/ios_framework/xcframework_info.json" \ --c_framework_dir "$(Build.BinariesDirectory)/ios_framework/framework_out" \ --variant Full \ - --skip_macos_test + --skip_macos_test \ + --mac_catalyst_enabled displayName: "Test Apple framework" - task: PublishBuildArtifacts@1 diff --git a/tools/ci_build/github/azure-pipelines/templates/download-deps.yml b/tools/ci_build/github/azure-pipelines/templates/download-deps.yml index ac82fe7403811..00534b160728c 100644 --- a/tools/ci_build/github/azure-pipelines/templates/download-deps.yml +++ b/tools/ci_build/github/azure-pipelines/templates/download-deps.yml @@ -11,7 +11,7 @@ steps: packageType: upack feed: '/7424c8e4-5c62-490e-95c4-79446f31017c' definition: '517c4f6f-5437-4392-a70d-4f15ec5be2f0' - version: 1.0.146 + version: 1.0.147 downloadPath: $(Build.BinariesDirectory)/deps # The private ADO project @@ -22,7 +22,7 @@ steps: packageType: upack feed: '/4c7631f5-24c0-4307-8822-1aa8f180c325' definition: 'fd9dd5ad-b73e-4678-890e-edcf680dbc1a' - version: 1.0.146 + version: 1.0.147 downloadPath: $(Build.BinariesDirectory)/deps # You can add more ADO accounts at here. 
diff --git a/tools/ci_build/github/azure-pipelines/templates/linux-web-init-and-check.yml b/tools/ci_build/github/azure-pipelines/templates/linux-web-init-and-check.yml index e788e4b3dddaa..a4d5a73118ea2 100644 --- a/tools/ci_build/github/azure-pipelines/templates/linux-web-init-and-check.yml +++ b/tools/ci_build/github/azure-pipelines/templates/linux-web-init-and-check.yml @@ -31,6 +31,10 @@ steps: node -e "a=require('child_process').execSync('git diff --name-only').toString();if(a)throw new Error('Following source files are not formatted: (did you run \"npm run format\"?)\n'+a)" workingDirectory: '$(Build.SourcesDirectory)/js' displayName: 'Check unformatted files' +- script: | + npx typedoc --emit none --treatWarningsAsErrors + workingDirectory: '$(Build.SourcesDirectory)/js/common' + displayName: 'TypeDoc Validation' - script: | npm run build:doc workingDirectory: '$(Build.SourcesDirectory)/js/web' diff --git a/tools/ci_build/github/azure-pipelines/templates/mac-cpu-packaging-pipeline.yml b/tools/ci_build/github/azure-pipelines/templates/mac-cpu-packaging-pipeline.yml index 080079388a76c..945fbb7c4a094 100644 --- a/tools/ci_build/github/azure-pipelines/templates/mac-cpu-packaging-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/templates/mac-cpu-packaging-pipeline.yml @@ -71,7 +71,7 @@ stages: ${{ if eq(parameters.DoESRP, true)}}: vmImage: 'macOS-12' ${{ else }}: - vmImage: 'macOS-13' + vmImage: 'macOS-latest' steps: - checkout: none - template: flex-downloadPipelineArtifact.yml diff --git a/tools/ci_build/github/azure-pipelines/templates/mac-cpu-packing-jobs.yml b/tools/ci_build/github/azure-pipelines/templates/mac-cpu-packing-jobs.yml index fd2113502478a..9e192716c3ffd 100644 --- a/tools/ci_build/github/azure-pipelines/templates/mac-cpu-packing-jobs.yml +++ b/tools/ci_build/github/azure-pipelines/templates/mac-cpu-packing-jobs.yml @@ -37,7 +37,7 @@ jobs: PROTO_CACHE_DIR: $(Pipeline.Workspace)/ccache_proto ORT_CACHE_DIR: $(Pipeline.Workspace)/ccache_ort pool: - vmImage: 'macOS-13' + vmImage: 'macOS-latest' timeoutInMinutes: 300 steps: - checkout: self @@ -55,6 +55,8 @@ jobs: - template: set-version-number-variables-step.yml - template: use-xcode-version.yml + parameters: + xcodeVersion: 14.2 - template: mac-build-step-with-cache.yml parameters: diff --git a/tools/ci_build/github/azure-pipelines/templates/py-linux-gpu.yml b/tools/ci_build/github/azure-pipelines/templates/py-linux-gpu.yml index 8cc48aac7a3b9..318ffd21febf5 100644 --- a/tools/ci_build/github/azure-pipelines/templates/py-linux-gpu.yml +++ b/tools/ci_build/github/azure-pipelines/templates/py-linux-gpu.yml @@ -35,62 +35,66 @@ parameters: values: - 11.8 - 12.2 -jobs: -- job: Linux_py_GPU_Wheels_${{ parameters.arch }} - timeoutInMinutes: 240 - workspace: - clean: all - pool: ${{ parameters.machine_pool }} - variables: - # The build machine pool doesn't have dotnet, so it can't run CG. 
- - name: skipComponentGovernanceDetection - value: true - - name: extra_build_args - ${{ if ne(parameters.extra_build_arg, '') }}: - value: -x ${{ parameters.extra_build_arg }} - ${{ if eq(parameters.extra_build_arg, '') }}: - value: '' - steps: - - checkout: self - clean: true - submodules: recursive - - template: set-nightly-build-option-variable-step.yml +stages: +- stage: Linux_py_GPU_Wheels_${{ parameters.arch }} + dependsOn: [] + jobs: + - job: Linux_py_GPU_Wheels_${{ parameters.arch }} + timeoutInMinutes: 240 + workspace: + clean: all + pool: ${{ parameters.machine_pool }} + variables: + # The build machine pool doesn't have dotnet, so it can't run CG. + - name: skipComponentGovernanceDetection + value: true + - name: extra_build_args + ${{ if ne(parameters.extra_build_arg, '') }}: + value: -x ${{ parameters.extra_build_arg }} + ${{ if eq(parameters.extra_build_arg, '') }}: + value: '' + steps: + - checkout: self + clean: true + submodules: recursive - - template: get-docker-image-steps.yml - parameters: - Dockerfile: tools/ci_build/github/linux/docker/Dockerfile.manylinux2_28_cuda - Context: tools/ci_build/github/linux/docker - DockerBuildArgs: " - --network=host - --build-arg BASEIMAGE=${{ parameters.docker_base_image }} - --build-arg TRT_VERSION=${{ parameters.trt_version }} - --build-arg BUILD_UID=$( id -u ) - --build-arg PLATFORM=${{ parameters.arch }} - " - Repository: onnxruntimecuda${{ replace(parameters.cuda_version, '.', '') }}xtrt86build${{ parameters.arch }} + - template: set-nightly-build-option-variable-step.yml + - template: get-docker-image-steps.yml + parameters: + Dockerfile: tools/ci_build/github/linux/docker/Dockerfile.manylinux2_28_cuda + Context: tools/ci_build/github/linux/docker + DockerBuildArgs: " + --network=host + --build-arg BASEIMAGE=${{ parameters.docker_base_image }} + --build-arg TRT_VERSION=${{ parameters.trt_version }} + --build-arg BUILD_UID=$( id -u ) + --build-arg PLATFORM=${{ parameters.arch }} + " + Repository: onnxruntimecuda${{ replace(parameters.cuda_version, '.', '') }}xtrt86build${{ parameters.arch }} - - task: Bash@3 - displayName: 'Build Python Wheel' - inputs: - targetType: filePath - filePath: tools/ci_build/github/linux/run_python_dockerbuild.sh - arguments: -i onnxruntimecuda${{ replace(parameters.cuda_version, '.', '') }}xtrt86build${{ parameters.arch }} -d "GPU" -c ${{ parameters.cmake_build_type }} $(extra_build_args) - - task: PublishBuildArtifacts@1 - displayName: 'Publish Artifact: ONNXRuntime python wheel' - inputs: - PathtoPublish: '$(Build.BinariesDirectory)/dist' - ArtifactName: onnxruntime_gpu + - task: Bash@3 + displayName: 'Build Python Wheel' + inputs: + targetType: filePath + filePath: tools/ci_build/github/linux/run_python_dockerbuild.sh + arguments: -i onnxruntimecuda${{ replace(parameters.cuda_version, '.', '') }}xtrt86build${{ parameters.arch }} -d "GPU" -c ${{ parameters.cmake_build_type }} $(extra_build_args) - - task: PublishPipelineArtifact@0 - displayName: 'Publish Test Binaries' - inputs: - artifactName: 'drop-linux-gpu-${{ parameters.arch }}' - targetPath: '$(Build.BinariesDirectory)/Release' + - task: PublishBuildArtifacts@1 + displayName: 'Publish Artifact: ONNXRuntime python wheel' + inputs: + PathtoPublish: '$(Build.BinariesDirectory)/dist' + ArtifactName: onnxruntime_gpu + - task: PublishPipelineArtifact@0 + displayName: 'Publish Test Binaries' + inputs: + artifactName: 'drop-linux-gpu-${{ parameters.arch }}' + targetPath: '$(Build.BinariesDirectory)/Release' - - task: 
mspremier.PostBuildCleanup.PostBuildCleanup-task.PostBuildCleanup@3 - displayName: 'Clean Agent Directories' - condition: always() + + - task: mspremier.PostBuildCleanup.PostBuildCleanup-task.PostBuildCleanup@3 + displayName: 'Clean Agent Directories' + condition: always() diff --git a/tools/ci_build/github/azure-pipelines/templates/py-packaging-training-cuda-stage-steps.yml b/tools/ci_build/github/azure-pipelines/templates/py-packaging-training-cuda-stage-steps.yml index 91d7b9f219f76..024b9b45591ba 100644 --- a/tools/ci_build/github/azure-pipelines/templates/py-packaging-training-cuda-stage-steps.yml +++ b/tools/ci_build/github/azure-pipelines/templates/py-packaging-training-cuda-stage-steps.yml @@ -172,6 +172,7 @@ stages: parameters: Dockerfile: tools/ci_build/github/linux/docker/${{ parameters.docker_file }} Context: tools/ci_build/github/linux/docker + UpdateDepsTxt: false DockerBuildArgs: >- --build-arg TORCH_VERSION=${{ parameters.torch_version }} --build-arg OPSET_VERSION=${{ parameters.opset_version }} diff --git a/tools/ci_build/github/azure-pipelines/templates/py-win-gpu.yml b/tools/ci_build/github/azure-pipelines/templates/py-win-gpu.yml index 4315eae503ebd..17915d107dbe6 100644 --- a/tools/ci_build/github/azure-pipelines/templates/py-win-gpu.yml +++ b/tools/ci_build/github/azure-pipelines/templates/py-win-gpu.yml @@ -1,8 +1,4 @@ parameters: - -- name: MACHINE_POOL - type: string - - name: EP_NAME type: string @@ -27,169 +23,257 @@ parameters: values: - 11.8 - 12.2 -jobs: -- job: Win_py_${{ parameters.EP_NAME }}_Wheels_${{ replace(parameters.PYTHON_VERSION,'.','_') }} - timeoutInMinutes: 240 - workspace: - clean: all - pool: - name: ${{ parameters.MACHINE_POOL }} -# demands: -# - ImageVersionOverride -equals 1.0.367516 - variables: - GRADLE_OPTS: '-Dorg.gradle.daemon=false' - VSGenerator: 'Visual Studio 17 2022' - CUDA_MODULE_LOADING: 'LAZY' - steps: - - checkout: self - clean: true - submodules: recursive - - - template: telemetry-steps.yml - - - task: UsePythonVersion@0 - inputs: - versionSpec: ${{ parameters.PYTHON_VERSION }} - addToPath: true - architecture: 'x64' - - - task: onebranch.pipeline.tsaoptions@1 - displayName: 'OneBranch TSAOptions' - inputs: - tsaConfigFilePath: '$(Build.SourcesDirectory)\.config\tsaoptions.json' - appendSourceBranchName: false - - - task: PythonScript@0 - inputs: - scriptSource: inline - script: | - import sys - np_version = 'numpy==1.21.6' if sys.version_info < (3, 11) else 'numpy==1.24.2' - import subprocess - subprocess.call(['pip', 'install', '-q', 'setuptools', 'wheel', np_version]) - workingDirectory: '$(Build.BinariesDirectory)' - displayName: 'Install python modules' - - - template: download-deps.yml - - - ${{ if ne(parameters.ENV_SETUP_SCRIPT, '') }}: - - template: jobs/set-winenv.yml + +- name: SpecificArtifact + displayName: Use Specific Artifact + type: boolean + default: false + +- name: BuildId + displayName: Specific Artifact's BuildId + type: string + default: '0' + +stages: + - stage: Win_py_${{ parameters.EP_NAME }}_Wheels_${{ replace(parameters.PYTHON_VERSION,'.','_') }}_Build + dependsOn: [] + jobs: + - job: Win_py_${{ parameters.EP_NAME }}_Wheels_${{ replace(parameters.PYTHON_VERSION,'.','_') }}_Build + timeoutInMinutes: 120 + workspace: + clean: all + pool: + name: onnxruntime-Win-CPU-2022 + # demands: + # - ImageVersionOverride -equals 1.0.367516 + variables: + GRADLE_OPTS: '-Dorg.gradle.daemon=false' + VSGenerator: 'Visual Studio 17 2022' + CUDA_MODULE_LOADING: 'LAZY' + steps: + - task: 
mspremier.PostBuildCleanup.PostBuildCleanup-task.PostBuildCleanup@3 + displayName: 'Clean Agent Directories' + condition: always() + + - checkout: self + clean: true + submodules: recursive + + - template: telemetry-steps.yml + + - task: UsePythonVersion@0 + inputs: + versionSpec: ${{ parameters.PYTHON_VERSION }} + addToPath: true + architecture: 'x64' + + - task: onebranch.pipeline.tsaoptions@1 + displayName: 'OneBranch TSAOptions' + inputs: + tsaConfigFilePath: '$(Build.SourcesDirectory)\.config\tsaoptions.json' + appendSourceBranchName: false + + - task: PythonScript@0 + inputs: + scriptSource: inline + script: | + import sys + np_version = 'numpy==1.21.6' if sys.version_info < (3, 11) else 'numpy==1.26' + import subprocess + try: + subprocess.check_call(['pip', 'install', '-q', 'setuptools', 'wheel', np_version]) + except subprocess.CalledProcessError: + sys.exit(1) + workingDirectory: '$(Build.BinariesDirectory)' + displayName: 'Install python modules' + + - template: download-deps.yml + + - ${{ if ne(parameters.ENV_SETUP_SCRIPT, '') }}: + - template: jobs/set-winenv.yml + parameters: + EnvSetupScript: ${{ parameters.ENV_SETUP_SCRIPT }} + ${{ if or(contains(parameters.EP_BUILD_FLAGS, 'use_cuda'), contains(parameters.EP_BUILD_FLAGS, 'use_tensorrt')) }}: + DownloadCUDA: true + + - ${{ if eq(parameters.ENV_SETUP_SCRIPT, '') }}: + - template: jobs/download_win_gpu_library.yml + parameters: + CudaVersion: ${{ parameters.CudaVersion }} + ${{ if or(contains(parameters.EP_BUILD_FLAGS, 'use_cuda'), contains(parameters.EP_BUILD_FLAGS, 'use_tensorrt')) }}: + DownloadCUDA: true + ${{ if contains(parameters.EP_BUILD_FLAGS, 'use_tensorrt') }}: + DownloadTRT: true + + - task: PythonScript@0 + displayName: 'Update deps.txt' + inputs: + scriptPath: $(Build.SourcesDirectory)/tools/ci_build/replace_urls_in_deps.py + arguments: --new_dir $(Build.BinariesDirectory)/deps + workingDirectory: $(Build.BinariesDirectory) + + - task: PowerShell@2 + displayName: 'Install ONNX' + inputs: + filePath: '$(Build.SourcesDirectory)/tools/ci_build/github/windows/install_third_party_deps.ps1' + workingDirectory: '$(Build.BinariesDirectory)' + arguments: -cpu_arch x64 -install_prefix $(Build.BinariesDirectory)\RelWithDebInfo\installed -build_config RelWithDebInfo + + # it could be removed once there's onnx wheel for python 3.12 + - ${{ if eq(parameters.PYTHON_VERSION, '3.12') }}: + - task: PublishPipelineArtifact@1 + displayName: 'Publish Artifact: ONNX python 12 wheel' + inputs: + targetPath: '$(Agent.TempDirectory)\onnx\onnx-1.15.0\dist\' + publishLocation: 'pipeline' + artifactName: onnx_py12_wheel + + - template: set-nightly-build-option-variable-step.yml + + - task: PythonScript@0 + displayName: 'Generate cmake config' + inputs: + scriptPath: '$(Build.SourcesDirectory)\tools\ci_build\build.py' + arguments: > + --config RelWithDebInfo + --build_dir $(Build.BinariesDirectory) + --skip_submodule_sync + --cmake_generator "$(VSGenerator)" + --enable_pybind + --enable_onnx_tests + --parallel --use_binskim_compliant_compile_flags --update + $(TelemetryOption) ${{ parameters.BUILD_PY_PARAMETERS }} ${{ parameters.EP_BUILD_FLAGS }} + workingDirectory: '$(Build.BinariesDirectory)' + + # building with build.py so the parallelization parameters are added to the msbuild command + - task: PythonScript@0 + displayName: 'Build' + inputs: + scriptPath: '$(Build.SourcesDirectory)\tools\ci_build\build.py' + arguments: > + --config RelWithDebInfo + --build_dir $(Build.BinariesDirectory) + --parallel --build + $(TelemetryOption) ${{ 
parameters.BUILD_PY_PARAMETERS }} ${{ parameters.EP_BUILD_FLAGS }} + workingDirectory: '$(Build.BinariesDirectory)' + + # Esrp signing + - template: win-esrp-dll.yml + parameters: + FolderPath: '$(Build.BinariesDirectory)\RelWithDebInfo\RelWithDebInfo\onnxruntime\capi' + DisplayName: 'ESRP - Sign Native dlls' + DoEsrp: true + Pattern: '*.pyd,*.dll' + + - task: PythonScript@0 + displayName: 'Build wheel' + inputs: + scriptPath: '$(Build.SourcesDirectory)\setup.py' + arguments: 'bdist_wheel ${{ parameters.BUILD_PY_PARAMETERS }} $(NightlyBuildOption) --wheel_name_suffix=${{ parameters.EP_NAME }}' + workingDirectory: '$(Build.BinariesDirectory)\RelWithDebInfo\RelWithDebInfo' + + - task: CopyFiles@2 + displayName: 'Copy Python Wheel to: $(Build.ArtifactStagingDirectory)' + inputs: + SourceFolder: '$(Build.BinariesDirectory)\RelWithDebInfo\RelWithDebInfo\dist' + Contents: '*.whl' + TargetFolder: '$(Build.ArtifactStagingDirectory)' + + - task: PublishBuildArtifacts@1 + displayName: 'Publish Artifact: ONNXRuntime python wheel' + inputs: + ArtifactName: onnxruntime_${{ parameters.EP_NAME }} + + - script: | + 7z x *.whl + workingDirectory: '$(Build.ArtifactStagingDirectory)' + displayName: 'unzip the package' + + - task: CredScan@3 + displayName: 'Run CredScan' + inputs: + debugMode: false + continueOnError: true + + - task: BinSkim@4 + displayName: 'Run BinSkim' + inputs: + AnalyzeTargetGlob: '+:file|$(Build.ArtifactStagingDirectory)\**\*.dll;-:file|$(Build.ArtifactStagingDirectory)\**\DirectML.dll' + + - stage: Win_py_${{ parameters.EP_NAME }}_Wheels_${{ replace(parameters.PYTHON_VERSION,'.','_') }}_Tests + dependsOn: Win_py_${{ parameters.EP_NAME }}_Wheels_${{ replace(parameters.PYTHON_VERSION,'.','_') }}_Build + jobs: + - job: Win_py_${{ parameters.EP_NAME }}_Wheels_${{ replace(parameters.PYTHON_VERSION,'.','_') }}_Tests + workspace: + clean: all + pool: + name: onnxruntime-Win2022-GPU-T4 + steps: + - task: mspremier.PostBuildCleanup.PostBuildCleanup-task.PostBuildCleanup@3 + displayName: 'Clean Agent Directories' + condition: always() + + - checkout: self + clean: true + submodules: none + + - task: UsePythonVersion@0 + inputs: + versionSpec: ${{ parameters.PYTHON_VERSION }} + addToPath: true + architecture: 'x64' + + - template: flex-downloadPipelineArtifact.yml parameters: - EnvSetupScript: ${{ parameters.ENV_SETUP_SCRIPT }} - ${{ if or(contains(parameters.EP_BUILD_FLAGS, 'use_cuda'), contains(parameters.EP_BUILD_FLAGS, 'use_tensorrt')) }}: - DownloadCUDA: true + ArtifactName: "onnxruntime_${{ parameters.EP_NAME }}" + StepName: 'Download Pipeline Artifact - Windows GPU Build' + TargetPath: '$(Build.ArtifactStagingDirectory)' + SpecificArtifact: ${{ parameters.SpecificArtifact }} + BuildId: ${{ parameters.BuildId }} + + # It could be removed once there's an onnx wheel for python 3.12 + - ${{ if eq(parameters.PYTHON_VERSION, '3.12') }}: + - template: flex-downloadPipelineArtifact.yml + parameters: + ArtifactName: "onnx_py12_wheel" + StepName: 'Download Pipeline Artifact - Onnx Python12 wheel' + TargetPath: '$(Agent.TempDirectory)\onnx\' + SpecificArtifact: ${{ parameters.SpecificArtifact }} + BuildId: ${{ parameters.BuildId }} + + - powershell: | + python -m pip install --upgrade pip + Get-ChildItem -Path $(Agent.TempDirectory)\onnx\*.whl | foreach {pip --disable-pip-version-check install --upgrade $_.fullname tabulate} + python -m pip install pytest + workingDirectory: '$(Build.SourcesDirectory)' + displayName: 'Install ONNX and pytest' + - ${{ else }}: + - powershell: | + pushd 
onnxruntime/test/python + python -m pip install --upgrade pip + python -m pip install -r requirements.txt + popd + workingDirectory: '$(Build.SourcesDirectory)' + displayName: 'Install ONNX' + + - powershell: | + python -m pip uninstall -y ort-nightly-gpu ort-nightly onnxruntime onnxruntime-gpu -qq + Get-ChildItem -Path $(Build.ArtifactStagingDirectory)/*cp${{ replace(parameters.PYTHON_VERSION,'.','') }}*.whl | foreach {pip --disable-pip-version-check install --upgrade $_.fullname tabulate} + mkdir -p $(Agent.TempDirectory)\ort_test_data + Copy-Item -Path $(Build.sourcesDirectory)/onnxruntime/test/python/onnx_backend_test_series.py -Destination $(Agent.TempDirectory)\ort_test_data + Copy-Item -Recurse -Path $(Build.sourcesDirectory)/onnxruntime/test/testdata -Destination $(Agent.TempDirectory)\ort_test_data + cd $(Agent.TempDirectory)\ort_test_data + python onnx_backend_test_series.py + workingDirectory: '$(Build.sourcesDirectory)' + displayName: 'Run Python Tests' + + - task: TSAUpload@2 + displayName: 'TSA upload' + condition: and (succeeded(), eq(variables['Build.SourceBranch'], 'refs/heads/main')) + inputs: + GdnPublishTsaOnboard: false + GdnPublishTsaConfigFile: '$(Build.sourcesDirectory)\.gdn\.gdntsa' - - ${{ if eq(parameters.ENV_SETUP_SCRIPT, '') }}: - - template: jobs/download_win_gpu_library.yml + - template: component-governance-component-detection-steps.yml parameters: - CudaVersion: ${{ parameters.CudaVersion }} - ${{ if or(contains(parameters.EP_BUILD_FLAGS, 'use_cuda'), contains(parameters.EP_BUILD_FLAGS, 'use_tensorrt')) }}: - DownloadCUDA: true - ${{ if contains(parameters.EP_BUILD_FLAGS, 'use_tensorrt') }}: - DownloadTRT: true - - - task: PythonScript@0 - displayName: 'Update deps.txt' - inputs: - scriptPath: $(Build.SourcesDirectory)/tools/ci_build/replace_urls_in_deps.py - arguments: --new_dir $(Build.BinariesDirectory)/deps - workingDirectory: $(Build.BinariesDirectory) - - - task: PowerShell@2 - displayName: 'Install ONNX' - inputs: - filePath: '$(Build.SourcesDirectory)/tools/ci_build/github/windows/install_third_party_deps.ps1' - workingDirectory: '$(Build.BinariesDirectory)' - arguments: -cpu_arch x64 -install_prefix $(Build.BinariesDirectory)\RelWithDebInfo\installed -build_config RelWithDebInfo - - - template: set-nightly-build-option-variable-step.yml - - - - task: PythonScript@0 - displayName: 'Generate cmake config' - inputs: - scriptPath: '$(Build.SourcesDirectory)\tools\ci_build\build.py' - arguments: > - --config RelWithDebInfo - --build_dir $(Build.BinariesDirectory) - --skip_submodule_sync - --cmake_generator "$(VSGenerator)" - --enable_pybind - --enable_onnx_tests - --parallel --use_binskim_compliant_compile_flags --update - $(TelemetryOption) ${{ parameters.BUILD_PY_PARAMETERS }} ${{ parameters.EP_BUILD_FLAGS }} - workingDirectory: '$(Build.BinariesDirectory)' - - # building with build.py so the parallelization parameters are added to the msbuild command - - task: PythonScript@0 - displayName: 'Build' - inputs: - scriptPath: '$(Build.SourcesDirectory)\tools\ci_build\build.py' - arguments: > - --config RelWithDebInfo - --build_dir $(Build.BinariesDirectory) - --parallel --build - $(TelemetryOption) ${{ parameters.BUILD_PY_PARAMETERS }} ${{ parameters.EP_BUILD_FLAGS }} - workingDirectory: '$(Build.BinariesDirectory)' - - # Esrp signing - - template: win-esrp-dll.yml - parameters: - FolderPath: '$(Build.BinariesDirectory)\RelWithDebInfo\RelWithDebInfo\onnxruntime\capi' - DisplayName: 'ESRP - Sign Native dlls' - DoEsrp: true - Pattern: '*.pyd,*.dll' - - - 
task: PythonScript@0 - displayName: 'Build wheel' - inputs: - scriptPath: '$(Build.SourcesDirectory)\setup.py' - arguments: 'bdist_wheel ${{ parameters.BUILD_PY_PARAMETERS }} $(NightlyBuildOption) --wheel_name_suffix=${{ parameters.EP_NAME }}' - workingDirectory: '$(Build.BinariesDirectory)\RelWithDebInfo\RelWithDebInfo' - - - task: CopyFiles@2 - displayName: 'Copy Python Wheel to: $(Build.ArtifactStagingDirectory)' - inputs: - SourceFolder: '$(Build.BinariesDirectory)\RelWithDebInfo\RelWithDebInfo\dist' - Contents: '*.whl' - TargetFolder: '$(Build.ArtifactStagingDirectory)' - - - task: PublishBuildArtifacts@1 - displayName: 'Publish Artifact: ONNXRuntime python wheel' - inputs: - ArtifactName: onnxruntime_${{ parameters.EP_NAME }} - - - script: | - 7z x *.whl - workingDirectory: '$(Build.ArtifactStagingDirectory)' - displayName: 'unzip the package' - - - task: CredScan@3 - displayName: 'Run CredScan' - inputs: - debugMode: false - continueOnError: true - - - task: BinSkim@4 - displayName: 'Run BinSkim' - inputs: - AnalyzeTargetGlob: '+:file|$(Build.ArtifactStagingDirectory)\**\*.dll;-:file|$(Build.ArtifactStagingDirectory)\**\DirectML.dll' - - - powershell: | - python -m pip uninstall -y ort-nightly-gpu ort-nightly onnxruntime onnxruntime-gpu -qq - Get-ChildItem -Path $(Build.ArtifactStagingDirectory)/*.whl | foreach {pip --disable-pip-version-check install --upgrade $_.fullname tabulate} - Remove-Item -Recurse -Force onnxruntime - python onnx_backend_test_series.py - workingDirectory: '$(Build.BinariesDirectory)\RelWithDebInfo\RelWithDebInfo' - displayName: 'Run Python Tests' - - - task: TSAUpload@2 - displayName: 'TSA upload' - condition: and (succeeded(), eq(variables['Build.SourceBranch'], 'refs/heads/main')) - inputs: - GdnPublishTsaOnboard: false - GdnPublishTsaConfigFile: '$(Build.sourcesDirectory)\.gdn\.gdntsa' - - - template: component-governance-component-detection-steps.yml - parameters: - condition: 'succeeded' + condition: 'succeeded' diff --git a/tools/ci_build/github/azure-pipelines/templates/stages/mac-ios-packaging-build-stage.yml b/tools/ci_build/github/azure-pipelines/templates/stages/mac-ios-packaging-build-stage.yml index ed32c5d0e15be..b1cdb498bb4ae 100644 --- a/tools/ci_build/github/azure-pipelines/templates/stages/mac-ios-packaging-build-stage.yml +++ b/tools/ci_build/github/azure-pipelines/templates/stages/mac-ios-packaging-build-stage.yml @@ -16,10 +16,10 @@ stages: displayName: "Build iOS package for variant: ${{ parameters.packageVariant}}" pool: - vmImage: "macOS-13" + vmImage: "macOS-latest" variables: - xcodeVersion: "14.3" + xcodeVersion: "14.2" ortPodVersion: $[stageDependencies.IosPackaging_SetCommonVariables.j.outputs['SetCommonVariables.ORT_POD_VERSION']] ${{ if eq(parameters.packageVariant, 'Mobile') }}: diff --git a/tools/ci_build/github/linux/docker/Dockerfile.ubuntu_openvino b/tools/ci_build/github/linux/docker/Dockerfile.ubuntu_openvino index a0ba5ea232ca3..45682c797bbb8 100644 --- a/tools/ci_build/github/linux/docker/Dockerfile.ubuntu_openvino +++ b/tools/ci_build/github/linux/docker/Dockerfile.ubuntu_openvino @@ -1,8 +1,8 @@ ARG UBUNTU_VERSION=20.04 FROM ubuntu:${UBUNTU_VERSION} -ARG OPENVINO_VERSION=2023.0.0 -ARG PYTHON_VERSION=3.8 +ARG OPENVINO_VERSION=2024.0.0 +ARG PYTHON_VERSION=3.9 ADD scripts /tmp/scripts RUN /tmp/scripts/install_ubuntu.sh -p ${PYTHON_VERSION} -d EdgeDevice && \ @@ -14,15 +14,14 @@ RUN apt update && apt install -y libnuma1 ocl-icd-libopencl1 && \ ENV INTEL_OPENVINO_DIR /opt/intel/openvino_${OPENVINO_VERSION} ENV 
LD_LIBRARY_PATH $INTEL_OPENVINO_DIR/runtime/lib/intel64:$INTEL_OPENVINO_DIR/runtime/3rdparty/tbb/lib:/usr/local/openblas/lib:$LD_LIBRARY_PATH -ENV InferenceEngine_DIR $INTEL_OPENVINO_DIR/runtime/cmake -ENV ngraph_DIR $INTEL_OPENVINO_DIR/runtime/cmake +ENV OpenVINO_DIR $INTEL_OPENVINO_DIR/runtime/cmake ENV IE_PLUGINS_PATH $INTEL_OPENVINO_DIR/runtime/lib/intel64 ENV DEBIAN_FRONTEND=noninteractive RUN cd /opt && mkdir -p intel && cd intel && \ - wget https://storage.openvinotoolkit.org/repositories/openvino/packages/2023.0/linux/l_openvino_toolkit_ubuntu20_2023.0.0.10926.b4452d56304_x86_64.tgz && \ - tar xzf l_openvino_toolkit_ubuntu20_2023.0.0.10926.b4452d56304_x86_64.tgz && rm -rf l_openvino_toolkit_ubuntu20_2023.0.0.10926.b4452d56304_x86_64.tgz && \ - mv l_openvino_toolkit_ubuntu20_2023.0.0.10926.b4452d56304_x86_64 openvino_2023.0.0 && \ + wget https://storage.openvinotoolkit.org/repositories/openvino/packages/2024.0/linux/l_openvino_toolkit_ubuntu20_2024.0.0.14509.34caeefd078_x86_64.tgz && \ + tar xzf l_openvino_toolkit_ubuntu20_2024.0.0.14509.34caeefd078_x86_64.tgz && rm -rf l_openvino_toolkit_ubuntu20_2024.0.0.14509.34caeefd078_x86_64.tgz && \ + mv l_openvino_toolkit_ubuntu20_2024.0.0.14509.34caeefd078_x86_64 openvino_2024.0.0 && \ cd $INTEL_OPENVINO_DIR/install_dependencies && ./install_openvino_dependencies.sh -y WORKDIR /root diff --git a/tools/ci_build/github/linux/docker/Dockerfile_manylinux2014_openvino_multipython b/tools/ci_build/github/linux/docker/Dockerfile_manylinux2014_openvino_multipython deleted file mode 100644 index bc0b412773286..0000000000000 --- a/tools/ci_build/github/linux/docker/Dockerfile_manylinux2014_openvino_multipython +++ /dev/null @@ -1,83 +0,0 @@ -FROM quay.io/pypa/manylinux2014_x86_64:latest - -ENV PATH /opt/rh/devtoolset-10/root/usr/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin -ADD scripts /tmp/scripts -RUN cd /tmp/scripts && /tmp/scripts/manylinux/install_centos.sh && /tmp/scripts/manylinux/install_deps.sh && rm -rf /tmp/scripts - -ARG PYTHON_VER_PATH="cp38-cp38" -ARG PYTHON_VERSION="3.8" -ARG BUILD_UID=1001 -ARG BUILD_USER=onnxruntimedev -ARG OV_DEVICE_PRECISION="CPU_FP32" -ARG ENABLE_TRAINING=true -ARG ORT_BRANCH="rel-1.13.1" -ARG OV_VERSION="2022.2.0" -RUN adduser --uid $BUILD_UID $BUILD_USER -WORKDIR /home/$BUILD_USER -ENV PYTHON_EXE="/opt/python/$PYTHON_VER_PATH/bin/python$PYTHON_VERSION" - -RUN yum -y install wget git - -# libusb1.0.22 -RUN cd /home/ && wget https://github.com/libusb/libusb/archive/v1.0.22.zip && \ - unzip v1.0.22.zip && rm -rf v1.0.22.zip && cd /home/libusb-1.0.22 && \ -# bootstrap steps - ./bootstrap.sh && \ - ./configure --disable-udev --enable-shared && \ - make -j4 && \ -# configure libusb1.0.22 - cd /home/libusb-1.0.22/libusb && \ - /bin/mkdir -p '/usr/local/lib' && \ - /bin/bash ../libtool --mode=install /usr/bin/install -c libusb-1.0.la '/usr/local/lib' && \ - /bin/mkdir -p '/usr/local/include/libusb-1.0' && \ - /usr/bin/install -c -m 644 libusb.h '/usr/local/include/libusb-1.0' && \ - /bin/mkdir -p '/usr/local/lib/pkgconfig' - -RUN ${PYTHON_EXE} -m pip install onnx numpy wheel -USER $BUILD_USER -RUN cd $WORKDIR && git clone https://github.com/openvinotoolkit/openvino.git && \ - cd openvino && \ - git checkout $OV_VERSION && \ - git submodule init && \ - git submodule update --recursive - -RUN cd $WORKDIR && cd openvino && mkdir build && cd build && \ - cmake .. 
-DCMAKE_BUILD_TYPE=Release -DCMAKE_CXX_FLAGS=-D_GLIBCXX_USE_CXX11_ABI=0 -DENABLE_PYTHON=ON -DPYTHON_EXECUTABLE=$PYTHON_EXE -DCMAKE_INSTALL_PREFIX=/home/onnxruntimedev/openvino_$OV_VERSION && \ - make -j8 && make install - -ENV INTEL_OPENVINO_DIR /home/onnxruntimedev/openvino_$OV_VERSION -ENV LD_LIBRARY_PATH $INTEL_OPENVINO_DIR/runtime/lib/intel64:$INTEL_OPENVINO_DIR/runtime/3rdparty/tbb/lib:/usr/local/openblas/lib:$LD_LIBRARY_PATH -ENV TBB_LIBS $INTEL_OPENVINO_DIR/runtime/3rdparty/tbb/lib -ENV InferenceEngine_DIR $INTEL_OPENVINO_DIR/runtime/cmake -ENV ngraph_DIR $INTEL_OPENVINO_DIR/runtime/cmake -ENV IE_PLUGINS_PATH $INTEL_OPENVINO_DIR/runtime/lib/intel64 -ENV OPENVINO_MANYLINUX 1 - -RUN cd $WORKDIR && \ - git clone --recursive -b $ORT_BRANCH https://github.com/intel/onnxruntime.git -RUN cd onnxruntime/onnxruntime/core/providers/openvino && mkdir scripts - -RUN cp ${IE_PLUGINS_PATH}/libopenvino.so /home/onnxruntimedev/onnxruntime/onnxruntime/core/providers/openvino/scripts/ -RUN cp ${IE_PLUGINS_PATH}/libopenvino_c.so /home/onnxruntimedev/onnxruntime/onnxruntime/core/providers/openvino/scripts/ -RUN cp ${IE_PLUGINS_PATH}/libopenvino_onnx_frontend.so /home/onnxruntimedev/onnxruntime/onnxruntime/core/providers/openvino/scripts/ -RUN cp ${IE_PLUGINS_PATH}/libopenvino_intel_cpu_plugin.so /home/onnxruntimedev/onnxruntime/onnxruntime/core/providers/openvino/scripts/ -RUN cp ${IE_PLUGINS_PATH}/libopenvino_intel_gpu_plugin.so /home/onnxruntimedev/onnxruntime/onnxruntime/core/providers/openvino/scripts/ -RUN cp ${IE_PLUGINS_PATH}/libopenvino_intel_myriad_plugin.so /home/onnxruntimedev/onnxruntime/onnxruntime/core/providers/openvino/scripts/ -RUN cp ${IE_PLUGINS_PATH}/libopenvino_hetero_plugin.so /home/onnxruntimedev/onnxruntime/onnxruntime/core/providers/openvino/scripts/ -RUN cp ${IE_PLUGINS_PATH}/libopenvino_auto_plugin.so /home/onnxruntimedev/onnxruntime/onnxruntime/core/providers/openvino/scripts/ -RUN cp ${IE_PLUGINS_PATH}/plugins.xml /home/onnxruntimedev/onnxruntime/onnxruntime/core/providers/openvino/scripts/ -RUN cp ${IE_PLUGINS_PATH}/usb-ma2x8x.mvcmd /home/onnxruntimedev/onnxruntime/onnxruntime/core/providers/openvino/scripts/ -RUN cp ${TBB_LIBS}/libtbb.so /home/onnxruntimedev/onnxruntime/onnxruntime/core/providers/openvino/scripts/ -RUN cp ${TBB_LIBS}/libtbb.so.2 /home/onnxruntimedev/onnxruntime/onnxruntime/core/providers/openvino/scripts/ -RUN cp ${TBB_LIBS}/libtbbmalloc.so /home/onnxruntimedev/onnxruntime/onnxruntime/core/providers/openvino/scripts/ -RUN cp ${TBB_LIBS}/libtbbmalloc.so.2 /home/onnxruntimedev/onnxruntime/onnxruntime/core/providers/openvino/scripts/ -RUN cd /home/onnxruntimedev/onnxruntime && git pull -RUN if $ENABLE_TRAINING; then \ - ${PYTHON_EXE} ./onnxruntime/tools/ci_build/build.py \ - --build_dir ./onnxruntime/build --use_openvino $(OV_DEVICE_PRECISION) --build_shared_lib \ - --config Release --build_wheel --skip_tests --enable_training ; \ - else \ - ${PYTHON_EXE} ./onnxruntime/tools/ci_build/build.py \ - --build_dir ./onnxruntime/build --use_openvino $(OV_DEVICE_PRECISION) --build_shared_lib \ - --config Release --build_wheel --skip_tests ;\ - fi diff --git a/tools/nuget/generate_nuspec_for_native_nuget.py b/tools/nuget/generate_nuspec_for_native_nuget.py index d5139f00e2f04..31c920c6e4438 100644 --- a/tools/nuget/generate_nuspec_for_native_nuget.py +++ b/tools/nuget/generate_nuspec_for_native_nuget.py @@ -734,7 +734,7 @@ def generate_files(line_list, args): ) if args.execution_provider == "openvino": - openvino_path = get_env_var("INTEL_OPENVINO_DIR") + 
get_env_var("INTEL_OPENVINO_DIR") files_list.append( "' ) - if is_windows(): - dll_list_path = os.path.join(openvino_path, "runtime\\bin\\intel64\\Release\\") - tbb_list_path = os.path.join(openvino_path, "runtime\\3rdparty\\tbb\\bin\\") - - for dll_element in os.listdir(dll_list_path): - if dll_element.endswith("dll"): - files_list.append( - "' - ) - - for tbb_element in os.listdir(tbb_list_path): - if tbb_element.endswith("dll"): - files_list.append( - "' - ) - if args.execution_provider == "cuda" or is_cuda_gpu_win_sub_package and not is_ado_packaging_build: files_list.append( "