diff --git a/cmake/deps.txt b/cmake/deps.txt index aeb7c05080abb..575df832ac8b0 100644 --- a/cmake/deps.txt +++ b/cmake/deps.txt @@ -3,10 +3,15 @@ #The columns are separated by ";" because a list in cmake is just a ";" separated group of strings. #Names should be in lower case. They will be used as variable names in cmake. #URLs can be either https URLs or local file paths in cmake-style(directory separator is a forward slash character). -#SHA1 hashes can be generated by running sha1sum command. +#SHA1 hashes can be generated by running sha1sum command on linux. PowerShell can also be used: +# (Get-FileHash -Algorithm SHA1 ).Hash.ToLower() #If you need to change abseil's version to a different one, you may also want to update external\abseil-cpp.natvis #since the file contains a version string: "lts_20230802". However, the file is for debugging purposes only and would #not affect built binaries. +# +# NOTE: You must run deps_update_and_upload.py when ready to test your changes in a CI. +# See https://microsoft.sharepoint.com/teams/ONNX2/_layouts/OneNote.aspx?id=%2Fteams%2FONNX2%2FShared%20Documents%2FNotebooks%2FONNX%20Ecosystem%20Team%20Notebook&wd=target%28Development.one%7C63D3AB47-51D1-4A62-9965-66882234BD44%2FAdd%20or%20update%20a%20dependency%20in%20deps.txt%7C0E9ED71D-89D5-40FA-B05F-C0123289C591%2F%29 +# abseil_cpp;https://github.com/abseil/abseil-cpp/archive/refs/tags/20230802.0.zip;04271dfbfac59269b6939e1e9d5faf0d18a7ba91 cxxopts;https://github.com/jarro2783/cxxopts/archive/3c73d91c0b04e2b59462f0a741be8c07024c1bc0.zip;6c6ca7f8480b26c8d00476e0e24b7184717fe4f0 date;https://github.com/HowardHinnant/date/archive/refs/tags/v3.0.1.zip;2dac0c81dc54ebdd8f8d073a75c053b04b56e159 @@ -18,7 +23,7 @@ fxdiv;https://github.com/Maratyszcza/FXdiv/archive/63058eff77e11aa15bf531df5dd34 google_benchmark;https://github.com/google/benchmark/archive/refs/tags/v1.7.0.zip;e97c368b176e8614e3f1bf13dd9abcf6a7ad9908 google_nsync;https://github.com/google/nsync/archive/refs/tags/1.26.0.zip;5e7c00ef6bf5b787386fc040067903ec774e2752 googletest;https://github.com/google/googletest/archive/refs/tags/v1.14.0.zip;0ac421f2ec11af38b0fff0f1992184032731a8bc -googlexnnpack;https://github.com/google/XNNPACK/archive/003c580e696a774afdc984996ee909b7c8d8128c.zip;9f192e3f15e1e37ae9c78d53eeea47e45c5eb31c +googlexnnpack;https://github.com/google/XNNPACK/archive/0da379fc4808f9601faef392352018c741c0f297.zip;663883491e380b628e0a5b162b5f2658032fae73 json;https://github.com/nlohmann/json/archive/refs/tags/v3.10.5.zip;f257f8dc27c5b8c085dc887b40cddd18ae1f725c microsoft_gsl;https://github.com/microsoft/GSL/archive/refs/tags/v4.0.0.zip;cf368104cd22a87b4dd0c80228919bb2df3e2a14 microsoft_wil;https://github.com/microsoft/wil/archive/refs/tags/v1.0.230629.1.zip;e4a542a323c070376f7c2d1973d0f7ddbc1d2fa5 @@ -35,7 +40,7 @@ protoc_linux_x86;https://github.com/protocolbuffers/protobuf/releases/download/v protoc_linux_aarch64;https://github.com/protocolbuffers/protobuf/releases/download/v21.12/protoc-21.12-linux-aarch_64.zip;df9d45470b0b8cf939dd2f0ec6b88e9cafc4d617 protoc_mac_universal;https://github.com/protocolbuffers/protobuf/releases/download/v21.12/protoc-21.12-osx-universal_binary.zip;23710c3d1c2036d8d65a6a22234372fa2d7af9ef psimd;https://github.com/Maratyszcza/psimd/archive/072586a71b55b7f8c584153d223e95687148a900.zip;1f5454b01f06f9656b77e4a5e2e31d7422487013 -pthreadpool;https://github.com/Maratyszcza/pthreadpool/archive/1787867f6183f056420e532eec640cba25efafea.zip;e43e80781560c5ab404a4da20f34d846f5f5d101 
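The header comment above describes generating the SHA1 column with `sha1sum` or PowerShell's `Get-FileHash`. For reference, a minimal Python sketch that produces the same lowercase digest for a locally downloaded archive; the file name below is illustrative only:

# Minimal sketch: compute the lowercase SHA1 digest used in the third column of deps.txt.
# Assumes the archive has already been downloaded; the path below is an illustration.
import hashlib

def sha1_of_file(path: str, chunk_size: int = 1 << 20) -> str:
    h = hashlib.sha1()
    with open(path, "rb") as f:
        for chunk in iter(lambda: f.read(chunk_size), b""):
            h.update(chunk)
    return h.hexdigest()  # lowercase, matching sha1sum and (Get-FileHash ...).Hash.ToLower()

print(sha1_of_file("pthreadpool-4fe0e1e183925bf8cfa6aae24237e724a96479b8.zip"))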
+pthreadpool;https://github.com/Maratyszcza/pthreadpool/archive/4fe0e1e183925bf8cfa6aae24237e724a96479b8.zip;07a0aa91dd9bf86f31b95497e00f31d8a261a4bd pybind11;https://github.com/pybind/pybind11/archive/refs/tags/v2.10.1.zip;769b6aa67a77f17a770960f604b727645b6f6a13 pytorch_cpuinfo;https://github.com/pytorch/cpuinfo/archive/959002f82d7962a473d8bf301845f2af720e0aa4.zip;85da3caa60eb2b148613b443fbc2bfdc30689965 re2;https://github.com/google/re2/archive/refs/tags/2022-06-01.zip;aa77313b76e91b531ee7f3e45f004c6a502a5374 diff --git a/cmake/external/xnnpack.cmake b/cmake/external/xnnpack.cmake index 7455584f1a625..e661aa51bfc17 100644 --- a/cmake/external/xnnpack.cmake +++ b/cmake/external/xnnpack.cmake @@ -25,17 +25,23 @@ set(FXDIV_SOURCE_DIR ${fxdiv_SOURCE_DIR}) FetchContent_Declare(pthreadpool URL ${DEP_URL_pthreadpool} URL_HASH SHA1=${DEP_SHA1_pthreadpool}) onnxruntime_fetchcontent_makeavailable(pthreadpool) -FetchContent_Declare(googlexnnpack URL ${DEP_URL_googlexnnpack} URL_HASH SHA1=${DEP_SHA1_googlexnnpack} -PATCH_COMMAND ${Patch_EXECUTABLE} --binary --ignore-whitespace -p1 < ${PROJECT_SOURCE_DIR}/patches/xnnpack/AddEmscriptenAndIosSupport.patch) +FetchContent_Declare(googlexnnpack URL ${DEP_URL_googlexnnpack} URL_HASH SHA1=${DEP_SHA1_googlexnnpack} + PATCH_COMMAND ${Patch_EXECUTABLE} --binary --ignore-whitespace -p1 < ${PROJECT_SOURCE_DIR}/patches/xnnpack/AddEmscriptenAndIosSupport.patch + ) onnxruntime_fetchcontent_makeavailable(googlexnnpack) set(XNNPACK_DIR ${googlexnnpack_SOURCE_DIR}) set(XNNPACK_INCLUDE_DIR ${XNNPACK_DIR}/include) set(onnxruntime_EXTERNAL_LIBRARIES_XNNPACK XNNPACK pthreadpool) + # the XNNPACK CMake setup doesn't include the WASM kernels so we have to manually set those up if(CMAKE_SYSTEM_NAME STREQUAL "Emscripten") + # See source lists in _deps/googlexnnpack-src/BUILD.bazel for wasm_prod_microkernels + message("Adding WebAssembly Source Files to XNNPACK") + set(wasm_srcs "") + file(READ "${XNNPACK_DIR}/BUILD.bazel" xnnpack_bazel_config) # Replace newlines with semicolon so that it is treated as a list by CMake @@ -70,25 +76,23 @@ if(CMAKE_SYSTEM_NAME STREQUAL "Emscripten") set(${target_srcs} ${bazel_srcs} PARENT_SCOPE) endfunction() - GetSrcListFromBazel("PROD_SCALAR_WASM_MICROKERNEL_SRCS" prod_scalar_wasm_srcs) - GetSrcListFromBazel("ALL_WASM_MICROKERNEL_SRCS" all_wasm_srcs) - GetSrcListFromBazel("WASM32_ASM_MICROKERNEL_SRCS" wasm32_asm_srcs) + GetSrcListFromBazel("OPERATOR_SRCS" operator_srcs) + GetSrcListFromBazel("TABLE_SRCS" table_srcs) + list(APPEND wasm_srcs ${operator_srcs} ${table_srcs}) - message(DEBUG "prod_scalar_wasm_srcs: ${prod_scalar_wasm_srcs}\n") - message(DEBUG "all_wasm_srcs: ${all_wasm_srcs}\n") - message(DEBUG "wasm32_asm_srcs: ${wasm32_asm_srcs}\n") + # kernels + list(APPEND wasm_srcs ${XNNPACK_DIR}/src/amalgam/gen/scalar.c) + list(APPEND wasm_srcs ${XNNPACK_DIR}/src/amalgam/gen/wasm.c) - message("Adding WebAssembly Source Files to XNNPACK") - set(wasm_srcs "") - list(APPEND wasm_srcs ${prod_scalar_wasm_srcs}) - list(APPEND wasm_srcs ${all_wasm_srcs}) - list(APPEND wasm_srcs ${wasm32_asm_srcs}) + if(onnxruntime_ENABLE_WEBASSEMBLY_SIMD) + list(APPEND wasm_srcs ${XNNPACK_DIR}/src/amalgam/gen/wasmsimd.c) + target_compile_options(XNNPACK PRIVATE "-msimd128") + endif() + message(DEBUG "wasm_srcs: ${wasm_srcs}\n") target_sources(XNNPACK PRIVATE ${wasm_srcs}) - if(onnxruntime_ENABLE_WEBASSEMBLY_SIMD) - GetSrcListFromBazel("ALL_WASMSIMD_MICROKERNEL_SRCS" all_wasmsimd_srcs) - message(DEBUG "all_wasmsimd_srcs: ${all_wasmsimd_srcs}") - 
target_sources(XNNPACK PRIVATE ${all_wasmsimd_srcs}) - endif() + # add flags from BAZEL.build + target_compile_options(XNNPACK PRIVATE "-fno-fast-math") + target_compile_options(XNNPACK PRIVATE "-fno-math-errno") endif() diff --git a/cmake/onnxruntime.cmake b/cmake/onnxruntime.cmake index 6ccaf00499e95..9d9b006c595bb 100644 --- a/cmake/onnxruntime.cmake +++ b/cmake/onnxruntime.cmake @@ -282,44 +282,77 @@ endif() # Assemble the Apple static framework (iOS and macOS) if(onnxruntime_BUILD_APPLE_FRAMEWORK) + if(${CMAKE_SYSTEM_NAME} STREQUAL "iOS") + set(STATIC_FRAMEWORK_OUTPUT_DIR ${CMAKE_CURRENT_BINARY_DIR}/${CMAKE_BUILD_TYPE}-${CMAKE_OSX_SYSROOT}) + else() # macOS + set(STATIC_FRAMEWORK_OUTPUT_DIR ${CMAKE_CURRENT_BINARY_DIR}) + endif() + + # Setup the various directories required. Remove any existing ones so we start with a clean directory. set(STATIC_LIB_DIR ${CMAKE_CURRENT_BINARY_DIR}/static_libraries) - file(MAKE_DIRECTORY ${STATIC_LIB_DIR}) + set(STATIC_LIB_TEMP_DIR ${STATIC_LIB_DIR}/temp) + add_custom_command(TARGET onnxruntime PRE_BUILD COMMAND ${CMAKE_COMMAND} -E rm -rf ${STATIC_LIB_DIR}) + add_custom_command(TARGET onnxruntime PRE_BUILD COMMAND ${CMAKE_COMMAND} -E make_directory ${STATIC_LIB_DIR}) + add_custom_command(TARGET onnxruntime PRE_BUILD COMMAND ${CMAKE_COMMAND} -E make_directory ${STATIC_LIB_TEMP_DIR}) - # Remove the existing files in the STATIC_LIB_DIR folder - file(GLOB _OLD_STATIC_LIBS ${STATIC_LIB_DIR}/*.a) - file(REMOVE "${_OLD_STATIC_LIBS}") + set(STATIC_FRAMEWORK_DIR ${STATIC_FRAMEWORK_OUTPUT_DIR}/static_framework/onnxruntime.framework) + add_custom_command(TARGET onnxruntime PRE_BUILD COMMAND ${CMAKE_COMMAND} -E rm -rf ${STATIC_FRAMEWORK_DIR}) + add_custom_command(TARGET onnxruntime PRE_BUILD COMMAND ${CMAKE_COMMAND} -E make_directory ${STATIC_FRAMEWORK_DIR}) + + # replicate XCode's Single Object Pre-Link + # link the internal onnxruntime .o files with the external .a files into a single relocatable object + # to enforce symbol visibility. doing it this way limits the symbols included from the .a files to symbols used + # by the ORT .o files. - # Go through all the static libraries, and create symbolic links - foreach(_LIB ${onnxruntime_INTERNAL_LIBRARIES} ${onnxruntime_EXTERNAL_LIBRARIES}) + # If it's an onnxruntime library, extract .o files to a separate directory for each library to avoid any clashes + # with filenames (e.g. 
utils.o) + foreach(_LIB ${onnxruntime_INTERNAL_LIBRARIES} ) GET_TARGET_PROPERTY(_LIB_TYPE ${_LIB} TYPE) if(_LIB_TYPE STREQUAL "STATIC_LIBRARY") - add_custom_command(TARGET onnxruntime POST_BUILD COMMAND ${CMAKE_COMMAND} -E create_symlink $ ${STATIC_LIB_DIR}/$) + set(CUR_STATIC_LIB_OBJ_DIR ${STATIC_LIB_TEMP_DIR}/$) + add_custom_command(TARGET onnxruntime POST_BUILD + COMMAND ${CMAKE_COMMAND} -E make_directory ${CUR_STATIC_LIB_OBJ_DIR}) + + add_custom_command(TARGET onnxruntime POST_BUILD + COMMAND ar ARGS -x $ + WORKING_DIRECTORY ${CUR_STATIC_LIB_OBJ_DIR}) endif() endforeach() - if(${CMAKE_SYSTEM_NAME} STREQUAL "iOS") - set(STATIC_FRAMEWORK_OUTPUT_DIR ${CMAKE_CURRENT_BINARY_DIR}/${CMAKE_BUILD_TYPE}-${CMAKE_OSX_SYSROOT}) - else() # macOS - set(STATIC_FRAMEWORK_OUTPUT_DIR ${CMAKE_CURRENT_BINARY_DIR}) - endif() + # for external libraries we create a symlink to the .a file + foreach(_LIB ${onnxruntime_EXTERNAL_LIBRARIES}) + GET_TARGET_PROPERTY(_LIB_TYPE ${_LIB} TYPE) + if(_LIB_TYPE STREQUAL "STATIC_LIBRARY") + add_custom_command(TARGET onnxruntime POST_BUILD + COMMAND ${CMAKE_COMMAND} -E create_symlink + $ ${STATIC_LIB_DIR}/$) + endif() + endforeach() - # Assemble the static framework - set(STATIC_FRAMEWORK_DIR ${STATIC_FRAMEWORK_OUTPUT_DIR}/static_framework/onnxruntime.framework) - set(STATIC_FRAMEWORK_HEADER_DIR ${STATIC_FRAMEWORK_DIR}/Headers) - file(MAKE_DIRECTORY ${STATIC_FRAMEWORK_DIR}) - # Remove all files under STATIC_FRAMEWORK_DIR (if any) - file(GLOB_RECURSE _OLD_STATIC_FRAMEWORK ${STATIC_FRAMEWORK_DIR}/*.*) - file(REMOVE "${_OLD_STATIC_FRAMEWORK}") + # do the pre-link with `ld -r` to create a single relocatable object with correct symbol visibility + add_custom_command(TARGET onnxruntime POST_BUILD + COMMAND ld ARGS -r -o ${STATIC_LIB_DIR}/prelinked_objects.o */*.o ../*.a + WORKING_DIRECTORY ${STATIC_LIB_TEMP_DIR}) + + # create the static library + add_custom_command(TARGET onnxruntime POST_BUILD + COMMAND libtool -static -o ${STATIC_FRAMEWORK_DIR}/onnxruntime prelinked_objects.o + WORKING_DIRECTORY ${STATIC_LIB_DIR}) + # Assemble the other pieces of the static framework + add_custom_command(TARGET onnxruntime POST_BUILD + COMMAND ${CMAKE_COMMAND} -E + copy_if_different ${INFO_PLIST_PATH} ${STATIC_FRAMEWORK_DIR}/Info.plist) + + # add the framework header files + set(STATIC_FRAMEWORK_HEADER_DIR ${STATIC_FRAMEWORK_DIR}/Headers) file(MAKE_DIRECTORY ${STATIC_FRAMEWORK_HEADER_DIR}) - # copy the header files one by one, and the Info.plist foreach(h_ ${ONNXRUNTIME_PUBLIC_HEADERS}) get_filename_component(HEADER_NAME_ ${h_} NAME) - add_custom_command(TARGET onnxruntime POST_BUILD COMMAND ${CMAKE_COMMAND} -E copy_if_different ${h_} ${STATIC_FRAMEWORK_HEADER_DIR}/${HEADER_NAME_}) + add_custom_command(TARGET onnxruntime POST_BUILD + COMMAND ${CMAKE_COMMAND} -E + copy_if_different ${h_} ${STATIC_FRAMEWORK_HEADER_DIR}/${HEADER_NAME_}) endforeach() - add_custom_command(TARGET onnxruntime POST_BUILD COMMAND ${CMAKE_COMMAND} -E copy_if_different ${INFO_PLIST_PATH} ${STATIC_FRAMEWORK_DIR}/Info.plist) - # link the static library - add_custom_command(TARGET onnxruntime POST_BUILD COMMAND libtool -static -o ${STATIC_FRAMEWORK_DIR}/onnxruntime *.a WORKING_DIRECTORY ${STATIC_LIB_DIR}) endif() diff --git a/cmake/onnxruntime_graph.cmake b/cmake/onnxruntime_graph.cmake index 735c86956ec4f..3f532ec2c3261 100644 --- a/cmake/onnxruntime_graph.cmake +++ b/cmake/onnxruntime_graph.cmake @@ -20,6 +20,8 @@ if (onnxruntime_MINIMAL_BUILD) "${ONNXRUNTIME_ROOT}/core/graph/contrib_ops/onnx_deprecated_operators.cc" 
"${ONNXRUNTIME_ROOT}/core/graph/contrib_ops/onnx_function_util.h" "${ONNXRUNTIME_ROOT}/core/graph/contrib_ops/onnx_function_util.cc" + "${ONNXRUNTIME_ROOT}/core/graph/contrib_ops/shape_inference_functions.h" + "${ONNXRUNTIME_ROOT}/core/graph/contrib_ops/shape_inference_functions.cc" "${ONNXRUNTIME_ROOT}/core/graph/function_template.h" "${ONNXRUNTIME_ROOT}/core/graph/function_utils.h" "${ONNXRUNTIME_ROOT}/core/graph/function_utils.cc" diff --git a/cmake/onnxruntime_providers_tensorrt.cmake b/cmake/onnxruntime_providers_tensorrt.cmake index 6af6fed69f051..686a993de3a4a 100644 --- a/cmake/onnxruntime_providers_tensorrt.cmake +++ b/cmake/onnxruntime_providers_tensorrt.cmake @@ -57,12 +57,14 @@ URL ${DEP_URL_onnx_tensorrt} URL_HASH SHA1=${DEP_SHA1_onnx_tensorrt} ) + if (NOT CUDA_INCLUDE_DIR) + set(CUDA_INCLUDE_DIR ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES}) # onnx-tensorrt repo needs this variable to build + endif() # The onnx_tensorrt repo contains a test program, getSupportedAPITest, which doesn't support Windows. It uses # unistd.h. So we must exclude it from our build. onnxruntime_fetchcontent_makeavailable is for the purpose. onnxruntime_fetchcontent_makeavailable(onnx_tensorrt) include_directories(${onnx_tensorrt_SOURCE_DIR}) set(CMAKE_CXX_FLAGS ${OLD_CMAKE_CXX_FLAGS}) - set(CUDA_INCLUDE_DIR ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES}) # onnx-tensorrt repo needs this variable to build if ( CMAKE_COMPILER_IS_GNUCC ) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-unused-parameter") endif() diff --git a/cmake/onnxruntime_providers_xnnpack.cmake b/cmake/onnxruntime_providers_xnnpack.cmake index 30ae90e6564cc..9c00703ca0846 100644 --- a/cmake/onnxruntime_providers_xnnpack.cmake +++ b/cmake/onnxruntime_providers_xnnpack.cmake @@ -15,7 +15,8 @@ source_group(TREE ${REPO_ROOT} FILES ${onnxruntime_providers_xnnpack_cc_srcs}) onnxruntime_add_static_library(onnxruntime_providers_xnnpack ${onnxruntime_providers_xnnpack_cc_srcs}) onnxruntime_add_include_to_target(onnxruntime_providers_xnnpack - onnxruntime_common onnxruntime_framework onnx onnx_proto ${PROTOBUF_LIB} XNNPACK pthreadpool flatbuffers::flatbuffers Boost::mp11 safeint_interface + onnxruntime_common onnxruntime_framework onnx onnx_proto ${PROTOBUF_LIB} XNNPACK pthreadpool + flatbuffers::flatbuffers Boost::mp11 safeint_interface ) add_dependencies(onnxruntime_providers_xnnpack onnx ${onnxruntime_EXTERNAL_DEPENDENCIES}) @@ -35,4 +36,4 @@ # there are some in builds where sizeof(size_t) != sizeof(int64_t), e.g., in 'ONNX Runtime Web CI Pipeline' if (HAS_SHORTEN_64_TO_32 AND NOT CMAKE_SIZEOF_VOID_P EQUAL 8) target_compile_options(onnxruntime_providers_xnnpack PRIVATE -Wno-error=shorten-64-to-32) - endif() \ No newline at end of file + endif() diff --git a/cmake/onnxruntime_unittests.cmake b/cmake/onnxruntime_unittests.cmake index f5f98066675fb..bdb0230a8ebd0 100644 --- a/cmake/onnxruntime_unittests.cmake +++ b/cmake/onnxruntime_unittests.cmake @@ -41,7 +41,7 @@ function(AddTest) if (MSVC) target_compile_options(${_UT_TARGET} PRIVATE "$<$:SHELL:--compiler-options /wd6330>" "$<$>:/wd6330>") - #Abseil has a lot of C4127/C4324 warnings. + #Abseil has a lot of C4127/C4324 warnings. 
target_compile_options(${_UT_TARGET} PRIVATE "$<$:SHELL:--compiler-options /wd4127>" "$<$>:/wd4127>") target_compile_options(${_UT_TARGET} PRIVATE "$<$:SHELL:--compiler-options /wd4324>" @@ -201,8 +201,18 @@ function(AddTest) list(APPEND TEST_NODE_FLAGS "--experimental-wasm-simd") endif() + # prefer Node from emsdk so the version is more deterministic + if (DEFINED ENV{EMSDK_NODE}) + set(NODE_EXECUTABLE $ENV{EMSDK_NODE}) + else() + # warning as we don't know what node version is being used and whether things like the TEST_NODE_FLAGS + # will be valid. e.g. "--experimental-wasm-simd" is not valid with node v20 or later. + message(WARNING "EMSDK_NODE environment variable was not set. Falling back to system `node`.") + set(NODE_EXECUTABLE node) + endif() + add_test(NAME ${_UT_TARGET} - COMMAND node ${TEST_NODE_FLAGS} ${_UT_TARGET}.js ${TEST_ARGS} + COMMAND ${NODE_EXECUTABLE} ${TEST_NODE_FLAGS} ${_UT_TARGET}.js ${TEST_ARGS} WORKING_DIRECTORY $ ) endif() diff --git a/cmake/onnxruntime_webassembly.cmake b/cmake/onnxruntime_webassembly.cmake index c6510c97a617e..9014089cb6112 100644 --- a/cmake/onnxruntime_webassembly.cmake +++ b/cmake/onnxruntime_webassembly.cmake @@ -192,8 +192,13 @@ else() onnxruntime_util re2::re2 ) + + set(EXPORTED_RUNTIME_METHODS "'stackAlloc','stackRestore','stackSave','UTF8ToString','stringToUTF8','lengthBytesUTF8'") + if (onnxruntime_USE_XNNPACK) target_link_libraries(onnxruntime_webassembly PRIVATE XNNPACK) + string(APPEND EXPORTED_RUNTIME_METHODS ",'addFunction'") + target_link_options(onnxruntime_webassembly PRIVATE "SHELL:-s ALLOW_TABLE_GROWTH=1") endif() if(onnxruntime_USE_WEBNN) @@ -204,7 +209,6 @@ else() target_link_libraries(onnxruntime_webassembly PRIVATE tensorboard) endif() - set(EXPORTED_RUNTIME_METHODS "['stackAlloc','stackRestore','stackSave','UTF8ToString','stringToUTF8','lengthBytesUTF8']") if (onnxruntime_USE_JSEP) set(EXPORTED_FUNCTIONS "_malloc,_free,_JsepOutput,_JsepGetNodeName") else() @@ -212,7 +216,7 @@ else() endif() target_link_options(onnxruntime_webassembly PRIVATE - "SHELL:-s EXPORTED_RUNTIME_METHODS=${EXPORTED_RUNTIME_METHODS}" + "SHELL:-s EXPORTED_RUNTIME_METHODS=[${EXPORTED_RUNTIME_METHODS}]" "SHELL:-s EXPORTED_FUNCTIONS=${EXPORTED_FUNCTIONS}" "SHELL:-s MAXIMUM_MEMORY=4294967296" "SHELL:-s EXIT_RUNTIME=0" diff --git a/cmake/patches/xnnpack/AddEmscriptenAndIosSupport.patch b/cmake/patches/xnnpack/AddEmscriptenAndIosSupport.patch index 37bdbf9fb53f6..460b4d97c499b 100644 --- a/cmake/patches/xnnpack/AddEmscriptenAndIosSupport.patch +++ b/cmake/patches/xnnpack/AddEmscriptenAndIosSupport.patch @@ -1,66 +1,27 @@ diff --git a/CMakeLists.txt b/CMakeLists.txt -index d53c48aa1..77c3cf983 100755 +index dba9b4687..bcaa18ad7 100755 --- a/CMakeLists.txt +++ b/CMakeLists.txt -@@ -105,22 +105,12 @@ ENDIF() - +@@ -122,7 +122,7 @@ ENDIF() + # ---[ Build flags IF(NOT CMAKE_SYSTEM_NAME) MESSAGE(FATAL_ERROR "CMAKE_SYSTEM_NAME not defined") --ELSEIF(NOT CMAKE_SYSTEM_NAME MATCHES "^(Darwin|Linux|Android|Windows|CYGWIN|MSYS)$") -+ELSEIF(NOT CMAKE_SYSTEM_NAME MATCHES "^(Darwin|Linux|Android|Windows|CYGWIN|MSYS|Emscripten|iOS)$") - MESSAGE(FATAL_ERROR "Unrecognized CMAKE_SYSTEM_NAME = ${CMAKE_SYSTEM_NAME}") +-ELSEIF(NOT CMAKE_SYSTEM_NAME MATCHES "^(Android|Darwin|iOS|Linux|Windows|CYGWIN|MSYS|QURT)$") ++ELSEIF(NOT CMAKE_SYSTEM_NAME MATCHES "^(Android|Darwin|iOS|Linux|Windows|CYGWIN|MSYS|QURT|Emscripten|iOS)$") + MESSAGE(FATAL_ERROR "Unrecognized CMAKE_SYSTEM_NAME value \"${CMAKE_SYSTEM_NAME}\"") ENDIF() - - # ---[ Download deps - IF(NOT XNNPACK_USE_SYSTEM_LIBS) -- 
IF(NOT DEFINED CLOG_SOURCE_DIR) -- MESSAGE(STATUS "Downloading clog to ${CMAKE_BINARY_DIR}/clog-source (define CLOG_SOURCE_DIR to avoid it)") -- CONFIGURE_FILE(cmake/DownloadCLog.cmake "${CMAKE_BINARY_DIR}/clog-download/CMakeLists.txt") -- EXECUTE_PROCESS(COMMAND "${CMAKE_COMMAND}" -G "${CMAKE_GENERATOR}" . -- WORKING_DIRECTORY "${CMAKE_BINARY_DIR}/clog-download") -- EXECUTE_PROCESS(COMMAND "${CMAKE_COMMAND}" --build . -- WORKING_DIRECTORY "${CMAKE_BINARY_DIR}/clog-download") -- SET(CLOG_SOURCE_DIR "${CMAKE_BINARY_DIR}/clog-source" CACHE STRING "clog source directory") -- ENDIF() -- - IF(NOT DEFINED CPUINFO_SOURCE_DIR) - MESSAGE(STATUS "Downloading cpuinfo to ${CMAKE_BINARY_DIR}/cpuinfo-source (define CPUINFO_SOURCE_DIR to avoid it)") - CONFIGURE_FILE(cmake/DownloadCpuinfo.cmake "${CMAKE_BINARY_DIR}/cpuinfo-download/CMakeLists.txt") -@@ -7108,6 +7098,10 @@ IF(MSVC) - SET_PROPERTY(SOURCE ${ALL_MICROKERNEL_SRCS} APPEND_STRING PROPERTY COMPILE_FLAGS "$<$>: /O2 >") - SET_PROPERTY(SOURCE ${HOT_SRCS} APPEND_STRING PROPERTY COMPILE_FLAGS "$<$>: /O2 >") - SET_PROPERTY(SOURCE ${COLD_SRCS} APPEND_STRING PROPERTY COMPILE_FLAGS "$<$>: /O1 >") -+ELSEIF(CMAKE_GENERATOR STREQUAL Xcode) -+ TARGET_COMPILE_OPTIONS(all_microkernels PRIVATE $<$>: -O2 >) -+ TARGET_COMPILE_OPTIONS(XNNPACK PRIVATE $<$>: -O2 >) -+ TARGET_COMPILE_OPTIONS(XNNPACK PRIVATE $<$>: -Os >) - ELSE() - SET_PROPERTY(SOURCE ${ALL_MICROKERNEL_SRCS} APPEND_STRING PROPERTY COMPILE_FLAGS "$<$>: -O2 >") - SET_PROPERTY(SOURCE ${HOT_SRCS} APPEND_STRING PROPERTY COMPILE_FLAGS "$<$>: -O2 >") -@@ -7142,26 +7136,6 @@ IF(LIBM) - TARGET_LINK_LIBRARIES(indirection PRIVATE ${LIBM}) + IF(CMAKE_SYSTEM_NAME MATCHES "Windows") +@@ -534,7 +534,12 @@ IF(XNNPACK_BUILD_LIBRARY) + TARGET_LINK_LIBRARIES(operator-utils PRIVATE logging) + TARGET_LINK_LIBRARIES(post-operation PRIVATE logging) + TARGET_LINK_LIBRARIES(subgraph PRIVATE allocator logging memory mutex operators operator-run) +- TARGET_LINK_LIBRARIES(XNNPACK PRIVATE allocator cache hardware-config indirection jit logging memory microkernel-utils microparams-init mutex normalization operators operator-run operator-utils packing post-operation microkernels-prod subgraph) ++ IF(CMAKE_SYSTEM_NAME STREQUAL "Emscripten") ++ # omit microkernels-prod as the list is manually created by ORT in cmake/external/xnnpack.cmake ++ TARGET_LINK_LIBRARIES(XNNPACK PRIVATE allocator cache hardware-config indirection jit logging memory microkernel-utils microparams-init mutex normalization operators operator-run operator-utils packing post-operation subgraph) ++ ELSE() ++ TARGET_LINK_LIBRARIES(XNNPACK PRIVATE allocator cache hardware-config indirection jit logging memory microkernel-utils microparams-init mutex normalization operators operator-run operator-utils packing post-operation microkernels-prod subgraph) ++ ENDIF() + SET_TARGET_PROPERTIES(XNNPACK PROPERTIES C_EXTENSIONS YES) ENDIF() - --# ---[ Configure clog --IF(NOT TARGET clog) -- IF(NOT XNNPACK_USE_SYSTEM_LIBS) -- SET(CLOG_BUILD_TESTS OFF CACHE BOOL "") -- SET(CLOG_RUNTIME_TYPE "${CPUINFO_RUNTIME_TYPE}" CACHE STRING "") -- ADD_SUBDIRECTORY( -- "${CLOG_SOURCE_DIR}/deps/clog" -- "${CMAKE_BINARY_DIR}/clog") -- # We build static version of clog but a dynamic library may indirectly depend on it -- SET_PROPERTY(TARGET clog PROPERTY POSITION_INDEPENDENT_CODE ON) -- ELSE() -- ADD_LIBRARY(clog STATIC IMPORTED) -- FIND_LIBRARY(CLOG_LIBRARY clog) -- IF(NOT CLOG_LIBRARY) -- MESSAGE(FATAL_ERROR "Cannot find clog") -- ENDIF() -- SET_PROPERTY(TARGET clog PROPERTY IMPORTED_LOCATION 
"${CLOG_LIBRARY}") -- ENDIF() --ENDIF() -- - # ---[ Configure cpuinfo - IF(NOT TARGET cpuinfo) - IF(NOT XNNPACK_USE_SYSTEM_LIBS) + IF(NOT MSVC) diff --git a/csharp/OnnxRuntime.CSharp.proj b/csharp/OnnxRuntime.CSharp.proj index 0288d752d8749..69bfd9896f1e4 100644 --- a/csharp/OnnxRuntime.CSharp.proj +++ b/csharp/OnnxRuntime.CSharp.proj @@ -17,9 +17,13 @@ CMake creates a target to this project x64 false - false + true + true None + + true + .. ..\tools\nuget\generate_nuspec_for_native_nuget.py @@ -30,13 +34,15 @@ CMake creates a target to this project ..\build\Linux $(OnnxRuntimeBuildDirectory)\packages $(OnnxRuntimeBuildDirectory)\$(Configuration) + python3 - + ..\build\Windows $(OnnxRuntimeBuildDirectory)\packages $(OnnxRuntimeBuildDirectory)\$(Configuration)\$(Configuration) + python @@ -86,28 +92,48 @@ CMake creates a target to this project - + + + Properties="NoBuild=true;Platform=AnyCPU;PackageVersion=$(PackageVersion);OrtPackageId=$(OrtPackageId);IncludeMobileTargets=$(IncludeMobileTargets)"/> - - + + + - - + + + - + + + + - diff --git a/csharp/src/Microsoft.ML.OnnxRuntime/Microsoft.ML.OnnxRuntime.csproj b/csharp/src/Microsoft.ML.OnnxRuntime/Microsoft.ML.OnnxRuntime.csproj index 29ccf55f081d5..0c74a23204d4f 100644 --- a/csharp/src/Microsoft.ML.OnnxRuntime/Microsoft.ML.OnnxRuntime.csproj +++ b/csharp/src/Microsoft.ML.OnnxRuntime/Microsoft.ML.OnnxRuntime.csproj @@ -4,66 +4,53 @@ Microsoft.ML.OnnxRuntime - - PreNet6 - netstandard2.0;netcoreapp3.1;net6.0 + true + netstandard2.0 + - - - xamarinios10;monoandroid11.0 + + + false - - monoandroid11.0 - + + NOTE: We include in a build of the managed package when creating Microsoft.ML.OnnxRuntime.Gpu as both + the CPU and GPU packaging pipelines can publish Microsoft.ML.OnnxRuntime.Managed, and we need the targets + to be consistent in both. 
+ --> - net6.0;net6.0-android;net6.0-ios;net6.0-macos + '$(OrtPackageId)' == 'Microsoft.ML.OnnxRuntime.Gpu') AND + '$(IncludeMobileTargets)' == 'true' AND + Exists('$(MSBuildExtensionsPath)\Xamarin\Android') AND + Exists('$(MSBuildExtensionsPath)\Xamarin\iOS')"> + xamarinios10;monoandroid11.0 - - net6.0;net6.0-android + + monoandroid11.0 - - $(BaseTargets);$(XamarinTargets);$(XamarinTargetsForTraining) + + + $(MobileTargets);net6.0-android;net6.0-ios - - $(Net6Targets);$(Net6TargetsForTrainingPackage) + + $(MobileTargets);net6.0-android - - - $(BaseTargets);$(XamarinTargets);$(XamarinTargetsForTraining);$(Net6Targets);$(Net6TargetsForTrainingPackage) + + $(BaseTargets);$(MobileTargets) - AnyCPU;x86 default @@ -204,8 +191,9 @@ $(DefineConstants);$(OrtConstants) - + @@ -214,7 +202,6 @@ - --> + + + netstandard2.0 + $(OnnxRuntimeBuildDirectory)/NativeNuget.nuspec + + diff --git a/js/web/docs/webgpu-operators.md b/js/web/docs/webgpu-operators.md index 5b94a4a510934..0b82a9c031baa 100644 --- a/js/web/docs/webgpu-operators.md +++ b/js/web/docs/webgpu-operators.md @@ -20,15 +20,15 @@ Do not modify directly.* | Asinh | ai.onnx(9+) | | | Atan | ai.onnx(7+) | | | Atanh | ai.onnx(9+) | | -| AveragePool | ai.onnx(7-9,10,11+); com.ms.internal.nhwc(11+) | need perf optimization; need implementing activation | +| AveragePool | ai.onnx(7-9,10,11+); com.ms.internal.nhwc(7-9,10,11+) | need perf optimization; need implementing activation | | BiasAdd | com.microsoft(1+) | | | BiasSplitGelu | com.microsoft(1+) | | | Cast | ai.onnx(6-8,9-12,13-18,19+) | | | Ceil | ai.onnx(6-12,13+) | | | Clip | ai.onnx(6-10,11,12,13+) | | | Concat | ai.onnx(1-3,4-10,11-12,13+) | | -| Conv | ai.onnx(1-10,11+); com.ms.internal.nhwc(11+) | need perf optimization; conv3d is not supported; need implementing activation | -| ConvTranspose | ai.onnx(1-10,11+); com.ms.internal.nhwc(11+) | need perf optimization; ConvTranspose3d is not supported; need implementing activation | +| Conv | ai.onnx(1-10,11+); com.ms.internal.nhwc(1-10,11+) | need perf optimization; conv3d is not supported; need implementing activation | +| ConvTranspose | ai.onnx(1-10,11+); com.ms.internal.nhwc(1-10,11+) | need perf optimization; ConvTranspose3d is not supported; need implementing activation | | Cos | ai.onnx(7+) | | | Cosh | ai.onnx(9+) | | | Div | ai.onnx(7-12,13,14+) | | @@ -57,7 +57,7 @@ Do not modify directly.* | LessOrEqual | ai.onnx(12-15,16+) | | | Log | ai.onnx(6-12,13+) | | | MatMul | ai.onnx(1-12,13+) | | -| MaxPool | ai.onnx(1-7,8-9,10,11,12+); com.ms.internal.nhwc(11,12+) | need perf optimization; need implementing activation | +| MaxPool | ai.onnx(1-7,8-9,10,11,12+); com.ms.internal.nhwc(1-7,8-9,10,11,12+) | need perf optimization; need implementing activation | | MemcpyFromHost | ai.onnx(1+) | | | MemcpyToHost | ai.onnx(1+) | | | Mul | ai.onnx(7-12,13,14+) | | @@ -79,7 +79,7 @@ Do not modify directly.* | ReduceSumSquare | ai.onnx(1-10,11-12,13-17,18+) | | | Relu | ai.onnx(6-12,13,14+) | | | Reshape | ai.onnx(5-12,13,14+) | no GPU kernel | -| Resize | ai.onnx(10,11-12,13-17,18,19+); com.ms.internal.nhwc(11-12,13-17,18,19+) | CoordinateTransformMode align_corners is not supported with downsampling | +| Resize | ai.onnx(10,11-12,13-17,18,19+); com.ms.internal.nhwc(10,11-12,13-17,18,19+) | CoordinateTransformMode align_corners is not supported with downsampling | | Shape | ai.onnx(1-12,13-14,15+) | no GPU kernel; an ORT warning is generated - need to fix | | Sigmoid | ai.onnx(6-12,13+) | | | Sin | ai.onnx(7+) | | diff --git 
a/js/web/lib/wasm/jsep/webgpu/ops/unary-op.ts b/js/web/lib/wasm/jsep/webgpu/ops/unary-op.ts index bead3e72f63c7..4238449f9246f 100644 --- a/js/web/lib/wasm/jsep/webgpu/ops/unary-op.ts +++ b/js/web/lib/wasm/jsep/webgpu/ops/unary-op.ts @@ -29,12 +29,12 @@ const createElementwiseProgramShader = const output = outputVariable('outputData', outputDataType, [vecSize], 4); return ` - ${shaderHelper.declareVariables(input, output)} + ${shaderHelper.registerUniform('vec_size', 'u32').declareVariables(input, output)} ${additionalImplementation ?? ''} ${shaderHelper.mainStart()} - ${shaderHelper.guardAgainstOutOfBoundsWorkgroupSizes(vecSize)} + ${shaderHelper.guardAgainstOutOfBoundsWorkgroupSizes('uniforms.vec_size')} let a = ${input.getByOffset('global_idx')}; ${output.setByOffset('global_idx', expression)} @@ -45,13 +45,16 @@ const createElementwiseProgramInfo = (input: TensorView, name: string, funcCall: ElementwiseFunctionCall, additionalImplementation?: string, cacheKey?: string, outputDataType: number = input.dataType): ProgramInfo => ({ name, - shaderCache: {hint: cacheKey}, + shaderCache: {hint: cacheKey, inputDependencies: ['type']}, getShaderSource: shaderHelper => createElementwiseProgramShader( shaderHelper, ShapeUtil.size(input.dims), input.dataType, outputDataType, funcCall, additionalImplementation), getRunData: (inputTensors) => ({ outputs: [{dims: input.dims, dataType: outputDataType}], dispatchGroup: - {x: Math.ceil(ShapeUtil.size(inputTensors[0].dims) / 64 /* workgroup size */ / 4 /* vec size */)} + {x: Math.ceil(ShapeUtil.size(inputTensors[0].dims) / 64 /* workgroup size */ / 4 /* vec size */)}, + programUniforms: [ + {type: 'uint32', data: Math.ceil(ShapeUtil.size(input.dims) / 4)}, + ], }) }); diff --git a/js/web/test/test-runner.ts b/js/web/test/test-runner.ts index 628e5408150f8..29acc07e118f9 100644 --- a/js/web/test/test-runner.ts +++ b/js/web/test/test-runner.ts @@ -164,7 +164,10 @@ async function initializeSession( session = await ort.InferenceSession.create(modelFilePath, sessionConfig); } } catch (e) { - Logger.error('TestRunner', `Failed to load model from file: ${modelFilePath}. Error: ${inspect(e)}`); + Logger.error( + 'TestRunner', + `Failed to load model from file: ${modelFilePath}. ` + + `Error: ${e.message} @ ${e.fileName}:${e.lineNumber}`); throw e; } diff --git a/onnxruntime/contrib_ops/cpu/quantization/blockwise_quant_block.h b/onnxruntime/contrib_ops/cpu/quantization/blockwise_quant_block.h deleted file mode 100644 index 11b5447d65ed2..0000000000000 --- a/onnxruntime/contrib_ops/cpu/quantization/blockwise_quant_block.h +++ /dev/null @@ -1,129 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT License. 
- -#pragma once - -#include -#include -#include - -namespace onnxruntime { -namespace contrib { - -#if defined(_MSC_VER) -#define FORCEINLINE __forceinline -#else -#define FORCEINLINE __attribute__((always_inline)) inline -#endif - -template -struct alignas(1) BlockwiseQuantBlock { - static_assert(block_size % 8 == 0); - - uint8_t blob_data[block_size / 8 * bits]; - - FORCEINLINE void dequant(T* dst, T scale, int32_t k_idx, int32_t K) const; - FORCEINLINE void dequant(T* dst, T scale, uint8_t zp, int32_t k_idx, int32_t K) const; - - FORCEINLINE void quant(const T* src, T& scale, int32_t k_idx, int32_t K, int32_t N); - FORCEINLINE void quant(const T* src, T& scale, uint8_t& zp, int32_t k_idx, int32_t K, int32_t N); -}; - -template -struct alignas(1) BlockwiseQuantBlock { - static_assert(block_size % 8 == 0); - - uint8_t blob_data[block_size / 2]; - - FORCEINLINE void dequant(T* dst, T scale, uint8_t zp, int32_t k_idx, int32_t K) const { - for (int i = 0; i < block_size; i += 2) { - T zp_t = static_cast(float(zp)); - if (k_idx + i < K) { - T x0 = static_cast(float(blob_data[i / 2] & 0xF)); - dst[i] = scale * (x0 - zp_t); - } - if (k_idx + i + 1 < K) { - T x1 = static_cast(float(blob_data[i / 2] >> 4)); - dst[i + 1] = scale * (x1 - zp_t); - } - } - } - - FORCEINLINE void dequant(T* dst, T scale, int32_t k_idx, int32_t K) const { - constexpr uint8_t zp = 8; - dequant(dst, scale, zp, k_idx, K); - } - - FORCEINLINE void quant(const T* src, T& scale_block, uint8_t& zp, int32_t k_idx, int32_t K, int32_t N) { - float min = static_cast(*src); - float max = static_cast(*src); - int32_t klen = std::min(block_size, K - k_idx); - for (int32_t kk = 0; kk < klen; kk++) { - const float v = static_cast(src[N * kk]); - if (v < min) min = v; - if (v > max) max = v; - } - min = std::min(min, 0.0f); - max = std::max(max, 0.0f); - - const float scale = (max - min) / ((1 << 4) - 1); - scale_block = static_cast(scale); - - const float reciprocal_scale = scale ? 1.0f / scale : 0.0f; - float zero_point_fp = min; - if (scale != 0.0f) { - zero_point_fp = 0.f - min / scale; - } - - // Handle any clamping - if (zero_point_fp < 0.0f) { - zp = 0; - } else if (zero_point_fp > 15.0f) { - zp = 15; - } else { - zp = (uint8_t)roundf(zero_point_fp); - } - - for (int32_t kk = 0; kk < klen; kk += 2) { - const float v0 = static_cast(src[N * kk]); - const uint8_t vi0 = (uint8_t)std::min(15.0f, std::max(0.0f, roundf(v0 * reciprocal_scale + zp))); - - const float v1 = static_cast((kk + 1 < klen) ? src[N * (kk + 1)] : 0.f); - const uint8_t vi1 = (uint8_t)std::min(15.0f, std::max(0.0f, roundf(v1 * reciprocal_scale + zp))); - - blob_data[kk / 2] = vi0 | (vi1 << 4); - } - } - - FORCEINLINE void quant(const T* src, T& scale_block, int32_t k_idx, int32_t K, int32_t N) { - float amax = 0.0f; // abs(max) - float max = 0.0f; - - int32_t klen = std::min(block_size, K - k_idx); - - for (int32_t kk = 0; kk < klen; kk++) { - const float v = static_cast(src[N * kk]); - if (amax < fabsf(v)) { - amax = fabsf(v); - max = v; - } - } - - const float scale = max / (-8.f); - scale_block = static_cast(scale); - const float reciprocal_scale = scale ? 1.0f / scale : 0.0f; - - for (int32_t kk = 0; kk < klen; kk += 2) { - const float v0 = src[N * kk] * reciprocal_scale; - const uint8_t vi0 = (uint8_t)std::min(15.0f, std::max(0.0f, roundf(v0 + 8.f))); - - const float v1 = (kk + 1 < klen) ? 
src[N * (kk + 1)] * reciprocal_scale : 0; - const uint8_t vi1 = (uint8_t)std::min(15.0f, std::max(0.0f, roundf(v1 + 8.f))); - - blob_data[kk / 2] = vi0 | (vi1 << 4); - } - } -}; - -} // namespace contrib -} // namespace onnxruntime diff --git a/onnxruntime/contrib_ops/cpu/quantization/dequantize_blockwise.h b/onnxruntime/contrib_ops/cpu/quantization/dequantize_blockwise.h deleted file mode 100644 index 8811e5649fc19..0000000000000 --- a/onnxruntime/contrib_ops/cpu/quantization/dequantize_blockwise.h +++ /dev/null @@ -1,174 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT License. - -#pragma once - -#include "blockwise_quant_block.h" - -#include - -#include "core/common/safeint.h" -#include "core/framework/float16.h" -#include "core/platform/threadpool.h" -#include - -namespace onnxruntime { -namespace contrib { - -template -void QuantizeBlockwise( - uint8_t* dst, // shape: [ N, block_per_K, block_blob_size ] - const T* src, // shape: [K, N] - T* scale, // shape: [N * block_per_K] - uint8_t* zero_points, // shape: [N * block_per_K] if bits > 4 else [(N *block_per_K + 1) / 2] - int32_t N, - int32_t K, - onnxruntime::concurrency::ThreadPool* thread_pool) { - BlockwiseQuantBlock* dst_blob = - reinterpret_cast*>(dst); - - int32_t block_per_K = (K + block_size - 1) / block_size; - int32_t total_block_count = N * block_per_K; - - std::vector zero_points_tmp; // to avoid race condition - (void)zero_points_tmp; - uint8_t* zero_points_tmp_ptr = zero_points; - if (bits <= 4 && zero_points != nullptr) { - zero_points_tmp.resize(total_block_count, 0); - zero_points_tmp_ptr = zero_points_tmp.data(); - } - - concurrency::ThreadPool::TryBatchParallelFor( - thread_pool, - total_block_count, - [&](ptrdiff_t block_idx) { - int32_t n = static_cast(block_idx / block_per_K); - int32_t k_block_idx = static_cast(block_idx % block_per_K); - int32_t k = k_block_idx * block_size; - BlockwiseQuantBlock* blob_ptr = dst_blob + block_idx; - size_t offset = SafeInt(k) * N + n; - if (nullptr != zero_points_tmp_ptr) { - blob_ptr->quant(src + offset, scale[block_idx], zero_points_tmp_ptr[block_idx], k, K, N); - } else { - blob_ptr->quant(src + offset, scale[block_idx], k, K, N); - } - }, - 0); - - if (bits <= 4 && zero_points != nullptr) { // compact zero points - for (int32_t zp_idx = 0; zp_idx < total_block_count / 2; zp_idx++) { - zero_points[zp_idx] = ((zero_points_tmp[zp_idx * 2]) | (zero_points_tmp[zp_idx * 2 + 1] << 4)); - } - if (total_block_count & 1) { - zero_points[total_block_count / 2] = (zero_points[total_block_count / 2] & 0xf0) | zero_points_tmp[total_block_count - 1]; - } - } -} - -#define QuantizeBlockwise4Bits(block_size) \ - QuantizeBlockwise(dst, src, scale, zero_points, N, K, thread_pool); - -template -void QuantizeBlockwise( - uint8_t* dst, // shape: [ N, block_per_K, block_blob_size ] - const T* src, // shape: [K, N] - T* scale, // shape: [N, block_per_K] - uint8_t* zero_points, // shape: [N, block_per_K] - int32_t block_size, - int32_t bits, - int32_t N, - int32_t K, - onnxruntime::concurrency::ThreadPool* thread_pool) { - ORT_ENFORCE(bits == 4, "only 4 bits is supported now"); - - if (16 == block_size) { - QuantizeBlockwise4Bits(16); - } else if (32 == block_size) { - QuantizeBlockwise4Bits(32); - } else if (64 == block_size) { - QuantizeBlockwise4Bits(64); - } else if (128 == block_size) { - QuantizeBlockwise4Bits(128); - } else if (256 == block_size) { - QuantizeBlockwise4Bits(256); - } else { - ORT_NOT_IMPLEMENTED("only block size 16, 32, 64, 128, 
256 are supported."); - } -} - -#undef QuantizeBlockwise4Bits - -template -void DequantizeBlockwise( - T* dst, // shape: [N, K] - const uint8_t* src, // shape: [N, block_per_K, block_blob_size] - const T* scale, // shape: [N, block_per_K] - const uint8_t* zero_points, // shape: [N, block_per_K] if bits > 4 else [N, (block_per_K + 1) / 2] - int32_t N, - int32_t K, - onnxruntime::concurrency::ThreadPool* thread_pool) { - int32_t block_per_K = (K + block_size - 1) / block_size; - int32_t task_count = N * block_per_K; - - const BlockwiseQuantBlock* src_blob = - reinterpret_cast*>(src); - - concurrency::ThreadPool::TryBatchParallelFor( - thread_pool, - task_count, - [&](ptrdiff_t task_idx) { - int32_t n = static_cast(task_idx / block_per_K); - int32_t k_block_idx = static_cast(task_idx % block_per_K); - int32_t k = k_block_idx * block_size; - const BlockwiseQuantBlock* blob_ptr = src_blob + task_idx; - size_t offset = SafeInt(n) * K + k; - if (nullptr != zero_points) { - if constexpr (bits > 4) { // zero point is stored with a byte - blob_ptr->dequant(dst + offset, scale[task_idx], zero_points[task_idx], k, K); - } else { // zero points is stored with 4bits - uint8_t zp = zero_points[task_idx / 2]; - zp = (task_idx & 1) ? (zp >> 4) : (zp & 0xf); - blob_ptr->dequant(dst + offset, scale[task_idx], zp, k, K); - } - } else { - blob_ptr->dequant(dst + offset, scale[task_idx], k, K); - } - }, - 0); -} - -#define DequantizeBlockwise4Bits(block_size) \ - DequantizeBlockwise(dst, src, scale, zero_points, N, K, thread_pool); - -template -void DequantizeBlockwise( - T* dst, // [N, K] - const uint8_t* src, // [N, block_per_K, block_blob_size] - const T* scale, // [N, block_per_K] - const uint8_t* zero_points, // [N, block_per_K] - int32_t block_size, - int32_t bits, - int32_t N, - int32_t K, - onnxruntime::concurrency::ThreadPool* thread_pool) { - ORT_ENFORCE(bits == 4, "only 4 bits is supported now"); - - if (16 == block_size) { - DequantizeBlockwise4Bits(16); - } else if (32 == block_size) { - DequantizeBlockwise4Bits(32); - } else if (64 == block_size) { - DequantizeBlockwise4Bits(64); - } else if (128 == block_size) { - DequantizeBlockwise4Bits(128); - } else if (256 == block_size) { - DequantizeBlockwise4Bits(256); - } else { - ORT_NOT_IMPLEMENTED("only block size 16, 32, 64, 128, 256 are supported."); - } -} - -#undef DequantizeBlockwise4Bits - -} // namespace contrib -} // namespace onnxruntime diff --git a/onnxruntime/contrib_ops/cpu/quantization/matmul_nbits.cc b/onnxruntime/contrib_ops/cpu/quantization/matmul_nbits.cc index 57aada94be39c..c72d811170a27 100644 --- a/onnxruntime/contrib_ops/cpu/quantization/matmul_nbits.cc +++ b/onnxruntime/contrib_ops/cpu/quantization/matmul_nbits.cc @@ -5,8 +5,7 @@ #include "core/framework/op_kernel.h" #include "core/providers/cpu/math/matmul_helper.h" #include "core/providers/common.h" -#include "dequantize_blockwise.h" -#include "core/mlas/inc/mlas.h" +#include "core/mlas/inc/mlas_q4.h" namespace onnxruntime { namespace contrib { @@ -18,6 +17,9 @@ class MatMulNBits final : public OpKernel { ORT_ENFORCE(Status::OK() == info.GetAttr("N", &N_)); ORT_ENFORCE(Status::OK() == info.GetAttr("block_size", &block_size_)); ORT_ENFORCE(Status::OK() == info.GetAttr("bits", &nbits_)); + ORT_ENFORCE(nbits_ == 4, + "Only 4b quantization is supported for MatMulNBits op," + " additional bits support is planned."); } Status Compute(OpKernelContext* context) const override; @@ -27,6 +29,7 @@ class MatMulNBits final : public OpKernel { int64_t N_; int64_t block_size_; int64_t 
nbits_; + bool column_wise_quant_{true}; }; Status MatMulNBits::Compute(OpKernelContext* ctx) const { @@ -46,15 +49,18 @@ Status MatMulNBits::Compute(OpKernelContext* ctx) const { auto status = ctx->GetTempSpaceAllocator(&allocator); ORT_RETURN_IF_ERROR(status); auto tmp_b_data_ptr = IAllocator::MakeUniquePtr(allocator, SafeInt(K_) * N_); - DequantizeBlockwise(tmp_b_data_ptr.get(), - b_data, - scales_data, - zero_points_data, - static_cast(block_size_), - static_cast(nbits_), - static_cast(N_), - static_cast(K_), - thread_pool); + + // dequantize b, only 4b quantization is supported for now + MlasDequantizeBlockwise( + tmp_b_data_ptr.get(), // dequantized output + b_data, // quantized input + scales_data, // quantization scales + zero_points_data, // quantization zero points + static_cast(block_size_), // quantization block size + column_wise_quant_, // columnwise quantization or row-wise + static_cast(K_), // number of rows in quantized input + static_cast(N_), // number of columns in quantized input + thread_pool); #if 0 // for debug auto tm_b_data_ptr_trans = IAllocator::MakeUniquePtr(allocator, SafeInt(K_) * N_); diff --git a/onnxruntime/contrib_ops/cuda/quantization/dequantize_blockwise.cu b/onnxruntime/contrib_ops/cuda/quantization/dequantize_blockwise.cu index 8c328d00b44d0..7921315ab52e1 100644 --- a/onnxruntime/contrib_ops/cuda/quantization/dequantize_blockwise.cu +++ b/onnxruntime/contrib_ops/cuda/quantization/dequantize_blockwise.cu @@ -18,6 +18,7 @@ namespace onnxruntime { namespace contrib { namespace cuda { + __device__ __forceinline__ void DequantizeEightElements(uint32_t values_quant, half scale, half zp, half* output) { half2 scale_half2 = {scale, scale}; half zp_adjust = -scale * __short2half_rn(zp); @@ -61,15 +62,19 @@ __global__ void Dequantize4BitsKernel( const T* scale_data, const uint8_t* zero_points, int block_size, + int blocks_per_K, int blocks_per_threadblock, int shift) { int block_id = blockIdx.x * blocks_per_threadblock + ((threadIdx.x * 8) >> shift); + int n_idx = block_id / blocks_per_K; + int kb_idx = block_id % blocks_per_K; int element_offset = block_id * block_size + ((threadIdx.x * 8) & ((1 << shift) - 1)); uint32_t quant_value = *(reinterpret_cast(quant_data + element_offset / 2)); T scale = *(scale_data + block_id); uint8_t zp = 8; if (zero_points) { - zp = (block_id & 0x01) ? (zero_points[block_id / 2] >> 4) : (zero_points[block_id / 2] & 0x0f); + zp = zero_points[n_idx * ((blocks_per_K + 1)/2) + kb_idx / 2]; + zp = (kb_idx & 0x01) ? (zp >> 4) : (zp & 0x0f); } output = output + element_offset; @@ -100,6 +105,7 @@ Status Dequantize4Bits( scales_data, zero_points, block_size, + blocks_per_K, blocks_per_threadblock, shift); @@ -126,6 +132,244 @@ template Status Dequantize4Bits( int block_size, cudaStream_t stream); + +/////////////////////////////////////////////////////////////////////////////// +// A more general block-wise dequantization implementation that supports +// different block sizes and block orientations (row-wise/column-wise). + +template < + int Row_, ///< rows of a matrix + int Column_ ///< columns of a matrix + > +struct Shape2D { + static int const kRow = Row_; ///< rows of a matrix + static int const kColumn = Column_; ///< columns of a matrix + static int const kCount = Row_ * Column_; ///< total number of elements in a matrix +}; + +/** + * @brief Blockwise quantization constants + * @tparam ElementT source data type, e.g. 
fp32/fp16 + * @tparam block_size number of elemenets quantized together + * @tparam qbits number of bits in each quantized element + * @tparam Columnwise true: elements in a block come from one single column + * false: elements in a block come from one single row + */ +template < + typename ElementT, + int32_t block_size, + int32_t qbits, + bool Columnwise> +struct BlkQuantTraits { + // number of qbit elements to pack into whole bytes + static constexpr int kPackSize = (qbits == 8) ? 1 : (qbits == 4) ? 2 : (qbits == 2) ? 4 : 0; + static_assert(kPackSize != 0, "Packing to whole bytes not supported for this qbits!"); + + using QuantBlk = std::conditional_t, Shape2D<1, block_size>>; + using ThreadBlk = Shape2D; +}; + +template < + typename ElementT, + int32_t block_size, + int32_t qbits, + bool Columnwise> +__global__ +void dequantizeThread(ElementT* dst, + const uint8_t* weights, + const ElementT* scales, + const uint8_t* zero_points, + int rows, + int columns, + int thrd_row_blks) { + using QuantBlk = typename BlkQuantTraits::QuantBlk; + using ThreadBlk = typename BlkQuantTraits::ThreadBlk; + + // !! 4b specific code + static_assert(qbits == 4, "Only 4b block quantization is supported!"); + + const auto block_idx = blockIdx.x * blockDim.x + threadIdx.x; + const auto row_blks = (rows + QuantBlk::kRow - 1) / QuantBlk::kRow; + + const auto meta_rows = (rows + QuantBlk::kRow - 1) / QuantBlk::kRow; + + // quantized matrix is stored in column major, packed by column + const auto q_rows = (meta_rows * QuantBlk::kRow * qbits + 7) / 8; + + int32_t r_blk_idx = static_cast(block_idx % thrd_row_blks); + int32_t c_blk_idx = static_cast(block_idx / thrd_row_blks); + + int32_t r = r_blk_idx * ThreadBlk::kRow; + int32_t c = c_blk_idx * ThreadBlk::kColumn; + + int32_t r_end = std::min(r + ThreadBlk::kRow, rows); + int32_t c_end = std::min(c + ThreadBlk::kColumn, columns); + + // for 4b quant, kPackSize = 2, so we have 2 scales and 2 offsets + const ElementT scale_buf[2] = { + scales[(c / QuantBlk::kColumn) * row_blks + r / QuantBlk::kRow], + ((r/QuantBlk::kRow) < (meta_rows - 1)) + ? scales[(c / QuantBlk::kColumn) * row_blks + r / QuantBlk::kRow + 1] + : static_cast(0.0f)}; + const uint8_t zp_pair = (zero_points == nullptr) + ? 
0x88 + : zero_points[(c / QuantBlk::kColumn) * ((row_blks + 1) / 2) + (r / QuantBlk::kRow) / 2]; + const uint16_t zp_buf[2] = {(uint16_t)(zp_pair & 0x0f), (uint16_t)((zp_pair >> 4) & 0x0f)}; + const ElementT adjust_buf[2] = {(-scale_buf[0]) * static_cast(zp_buf[0]), + (-scale_buf[1]) * static_cast(zp_buf[1])}; + + for (int32_t j = c; j < c_end; ++j) { + const uint8_t* q_ptr = weights + j * q_rows; + for (int32_t i = r; i < (r_end - 1); i += 2) { + const auto scale0 = scale_buf[(i - r) / QuantBlk::kRow]; + const auto adjust0 = adjust_buf[(i - r) / QuantBlk::kRow]; + + const auto scale1 = scale_buf[(i + 1 - r) / QuantBlk::kRow];; + const auto adjust1 = adjust_buf[(i + 1 - r) / QuantBlk::kRow]; + + const auto vi = q_ptr[i / 2]; + + if constexpr (std::is_same::value) { + half2 scale_half2 = {scale0, scale1}; + half2 zp_adjust2 = {adjust0, adjust1}; + + half2 v = {__ushort2half_rn(vi & 0xF), __ushort2half_rn((vi >> 4) & 0xF)}; + half2 results = v * scale_half2 + zp_adjust2; + + dst[j * rows + i] = results.x; + dst[j * rows + (i + 1)] = results.y; + } else { + static_assert(std::is_same::value, "Only float and half are supported!"); + const uint8_t vi0 = vi & 0xf; + const uint8_t vi1 = vi >> 4; + dst[j * rows + i] = static_cast(vi0) * scale0 + adjust0;; + dst[j * rows + (i + 1)] = static_cast(vi1) * scale1 + adjust1; + } + } + + if ((r_end & 1) && (r_end > r)) { + const auto scale0 = scale_buf[(r_end - 1 - r) / QuantBlk::kRow]; + const auto adjust0 = adjust_buf[(r_end - 1 - r) / QuantBlk::kRow]; + + const auto vi = q_ptr[(r_end - 1) / 2]; + const uint8_t vi0 = vi & 0xf; + + dst[j * rows + (r_end - 1)] = static_cast(vi0) * scale0 + adjust0; + } + } +} + +template < + typename ElementT, + int32_t block_size, + int32_t qbits, + bool Columnwise> +static void dequantize(ElementT* dst, const uint8_t* weights, const ElementT* scales, + const uint8_t* zero_points, int32_t rows, int32_t columns, + cudaStream_t stream) { + using QuantBlk = typename BlkQuantTraits::QuantBlk; + using ThreadBlk = typename BlkQuantTraits::ThreadBlk; + + // Thread partitioning + const auto thrd_row_blks = (rows + ThreadBlk::kRow - 1) / ThreadBlk::kRow; + const auto thrd_col_blks = (columns + ThreadBlk::kColumn - 1) / ThreadBlk::kColumn; + const auto total_thrd_blks = thrd_row_blks * thrd_col_blks; + + const auto grids = (total_thrd_blks + GridDim::maxThreadsPerBlock - 1) / GridDim::maxThreadsPerBlock; + dequantizeThread<<>>( + dst, + weights, + scales, + zero_points, + rows, + columns, + thrd_row_blks); +} + + +template +Status +DequantizeBlockwise4b( + T* dst, + const uint8_t* src, + const T* scales, + const uint8_t* zero_points, + int block_size, + bool columnwise, + int rows, + int columns, + cudaStream_t stream) { + switch (block_size) { + case 16: + if (columnwise) { + dequantize(dst, src, scales, zero_points, rows, columns, stream); + } else { + dequantize(dst, src, scales, zero_points, rows, columns, stream); + } + return Status::OK(); + case 32: + if (columnwise) { + dequantize(dst, src, scales, zero_points, rows, columns, stream); + } else { + dequantize(dst, src, scales, zero_points, rows, columns, stream); + } + return Status::OK(); + case 64: + if (columnwise) { + dequantize(dst, src, scales, zero_points, rows, columns, stream); + } else { + dequantize(dst, src, scales, zero_points, rows, columns, stream); + } + return Status::OK(); + case 128: + if (columnwise) { + dequantize(dst, src, scales, zero_points, rows, + columns, stream); + } else { + dequantize(dst, src, scales, zero_points, + rows, columns, stream); + 
} + return Status::OK(); + case 256: + if (columnwise) { + dequantize(dst, src, scales, zero_points, rows, + columns, stream); + } else { + dequantize(dst, src, scales, zero_points, + rows, columns, stream); + } + return Status::OK(); + default: + // Only block size 16, 32, 64, 128, 256 are supported. + return Status(::onnxruntime::common::ONNXRUNTIME, ::onnxruntime::common::FAIL, + "Unsupported block size for blockwise quantization."); + } +} + +template +Status DequantizeBlockwise4b( + float* dst, + const uint8_t* src, + const float* scales, + const uint8_t* zero_points, + int block_size, + bool columnwise, + int rows, + int columns, + cudaStream_t stream); + +template +Status DequantizeBlockwise4b( + half* dst, + const uint8_t* src, + const half* scales, + const uint8_t* zero_points, + int block_size, + bool columnwise, + int rows, + int columns, + cudaStream_t stream); + } // namespace cuda } // namespace contrib } // namespace onnxruntime diff --git a/onnxruntime/contrib_ops/cuda/quantization/dequantize_blockwise.cuh b/onnxruntime/contrib_ops/cuda/quantization/dequantize_blockwise.cuh index 741ce1e735b42..f9c09c55fd893 100644 --- a/onnxruntime/contrib_ops/cuda/quantization/dequantize_blockwise.cuh +++ b/onnxruntime/contrib_ops/cuda/quantization/dequantize_blockwise.cuh @@ -18,6 +18,33 @@ Status Dequantize4Bits( int block_size, cudaStream_t stream); + +/** + * @brief Dequantize a block-wise quantized matrix, and store the result in a + * column major matrix for use in subsequent GEMM. This implementation supports + * columnwise and rowwise block orientation. + * @param[out] dst pointer to the dequantized matrix, column major: [columns, rows] + * @param[in] qelements pointer to the quantized elements, column major: [columns, rows] + * @param[in] scales pointer to the scales of quantized blocks, column major layout + * @param[in] zero_points pointer to the zero points of quantized blocks, packed column major + * scales + * @param[in] block_size size of the quantized block + * @param[in] columnwise whether the quantized matrix is columnwise or rowwise quantized + * @param[in] rows + * @param[in] columns + */ +template +Status DequantizeBlockwise4b( + T* dst, + const uint8_t* qelements, + const T* scales, + const uint8_t* zero_points, + int block_size, + bool columnwise, + int rows, + int columns, + cudaStream_t stream); + } // namespace cuda } // namespace contrib } // namespace onnxruntime diff --git a/onnxruntime/contrib_ops/cuda/quantization/matmul_nbits.cc b/onnxruntime/contrib_ops/cuda/quantization/matmul_nbits.cc index 14a8163fef500..5b0e61e197014 100644 --- a/onnxruntime/contrib_ops/cuda/quantization/matmul_nbits.cc +++ b/onnxruntime/contrib_ops/cuda/quantization/matmul_nbits.cc @@ -27,6 +27,9 @@ class MatMulNBits final : public CudaKernel { ORT_ENFORCE(Status::OK() == info.GetAttr("N", &N_)); ORT_ENFORCE(Status::OK() == info.GetAttr("block_size", &block_size_)); ORT_ENFORCE(Status::OK() == info.GetAttr("bits", &nbits_)); + ORT_ENFORCE(nbits_ == 4, + "Only 4b quantization is supported for MatMulNBits op," + " additional bits support is planned."); } Status ComputeInternal(OpKernelContext* context) const override; @@ -36,6 +39,7 @@ class MatMulNBits final : public CudaKernel { int64_t N_; int64_t block_size_; int64_t nbits_; + bool column_wise_quant_blk_{true}; }; template @@ -50,8 +54,6 @@ Status MatMulNBits::ComputeInternal(OpKernelContext* ctx) const { const auto* scales_data = scales->Data(); const auto* zero_points_data = zero_points == nullptr ? 
nullptr : zero_points->Data(); - ORT_ENFORCE(nbits_ == 4, "only 4 bits is supported now"); - typedef typename ToCudaType::MappedType CudaT; constexpr bool transa = false; @@ -81,14 +83,32 @@ Status MatMulNBits::ComputeInternal(OpKernelContext* ctx) const { int64_t K_padded = (K_ + block_size_ - 1) / block_size_ * block_size_; IAllocatorUniquePtr b_data_ptr = GetScratchBuffer(N_ * K_padded, ctx->GetComputeStream()); auto* b_data = b_data_ptr.get(); - ORT_RETURN_IF_ERROR(Dequantize4Bits(reinterpret_cast(b_data), - blob_data, - reinterpret_cast(scales_data), - zero_points_data, - SafeInt(K_padded), - SafeInt(N_), - SafeInt(block_size_), - static_cast(ctx->GetComputeStream()->GetHandle()))); + if (column_wise_quant_blk_) { + // column-wise block + ORT_RETURN_IF_ERROR(Dequantize4Bits( + reinterpret_cast(b_data), + blob_data, + reinterpret_cast(scales_data), + zero_points_data, + SafeInt(K_padded), + SafeInt(N_), + SafeInt(block_size_), + static_cast(ctx->GetComputeStream()->GetHandle()))); + } else { + // row-wise block + K_padded = K_; + + ORT_RETURN_IF_ERROR(DequantizeBlockwise4b( + reinterpret_cast(b_data), + blob_data, + reinterpret_cast(scales_data), + zero_points_data, + SafeInt(block_size_), + column_wise_quant_blk_, + SafeInt(K_), + SafeInt(N_), + static_cast(ctx->GetComputeStream()->GetHandle()))); + } #if 0 cudaStreamSynchronize(static_cast(ctx->GetComputeStream()->GetHandle())); T* b_data_cpu = new T[K_ * N_]; diff --git a/onnxruntime/contrib_ops/cuda/quantization/matmul_nbits.cu b/onnxruntime/contrib_ops/cuda/quantization/matmul_nbits.cu index 4c3c345076416..f2600a506285d 100644 --- a/onnxruntime/contrib_ops/cuda/quantization/matmul_nbits.cu +++ b/onnxruntime/contrib_ops/cuda/quantization/matmul_nbits.cu @@ -96,6 +96,9 @@ __global__ void MatMulFloatInt4Kernel( constexpr int k_per_iter = 256; int k_iter = k / k_per_iter; + // blocks_per_k is the number of scales and zero points on the k dim + const int b_zp_k = (blocks_per_K + 1)/ 2; + extern __shared__ char shared_buffer[]; // load scale to shared buffer @@ -105,30 +108,39 @@ __global__ void MatMulFloatInt4Kernel( for (int i = thread_id; i < kColsPerThreadBlock * blocks_per_K; i += kColsPerThreadBlock * kWarpSize) { b_scale_vec[i] = scales_data[offset + i]; } - for (int i = thread_id; i < kColsPerThreadBlock * blocks_per_K / 2; i += kColsPerThreadBlock * kWarpSize) { - b_zp_vec[i] = zero_points != nullptr ? zero_points[offset / 2 + i] : uint8_t(0x88); + + int zp_offset = n_block_id * kColsPerThreadBlock * b_zp_k; + for (int i = thread_id; i < kColsPerThreadBlock * b_zp_k; i += kColsPerThreadBlock * kWarpSize) { + b_zp_vec[i] = zero_points != nullptr ? zero_points[zp_offset + i] : uint8_t(0x88); } __syncthreads(); a_data += m_id * k; b_data_quant += n_id * blocks_per_K * (block_size / 2); + const int scale_col_offset = warp_id * blocks_per_K; + const int zp_col_offset = warp_id * b_zp_k; + float sum = 0.f; int k_id = 0; for (; k_id < (k & 0xffffff00); k_id += k_per_iter) { - uint32_t value = *(reinterpret_cast(b_data_quant + (k_id >> 1) + lane_id * 4)); - int32_t block_idx = warp_id * blocks_per_K + (k_id + lane_id * 8) / block_size; - T scale = b_scale_vec[block_idx]; - uint8_t zp = (block_idx & 0x01) ? 
(b_zp_vec[block_idx / 2] >> 4) : (b_zp_vec[block_idx / 2] & 0x0f); + const int t_k = k_id + (lane_id << 3); // k index for this thread + const int t_meta_k = t_k / block_size; // k index for this thread, points to the scale and zero point + uint32_t value = *(reinterpret_cast(b_data_quant + (t_k >> 1))); + T scale = b_scale_vec[scale_col_offset + t_meta_k]; + uint8_t zp = b_zp_vec[zp_col_offset + t_meta_k/2]; + zp = (t_meta_k & 0x01) ? (zp >> 4) : (zp & 0x0f); sum += AccumulateEightElements(value, scale, zp, a_data + k_id + (lane_id << 3)); } // handle reminder if (k_id + lane_id * 8 < k) { + const int t_k = k_id + (lane_id << 3); // k index for this thread + const int t_meta_k = t_k / block_size; // k index for this thread, points to the scale and zero point uint32_t value = *(reinterpret_cast(b_data_quant + k_iter * 128 + lane_id * 4)); - int32_t block_idx = warp_id * blocks_per_K + (k_id + lane_id * 8) / block_size; - T scale = b_scale_vec[block_idx]; - uint8_t zp = (block_idx & 0x01) ? (b_zp_vec[block_idx / 2] >> 4) : (b_zp_vec[block_idx / 2] & 0x0f); + T scale = b_scale_vec[scale_col_offset + t_meta_k]; + uint8_t zp = b_zp_vec[zp_col_offset + t_meta_k/2]; + zp = (t_meta_k & 0x01) ? (zp >> 4) : (zp & 0x0f); sum += AccumulateEightElements(value, scale, zp, a_data + k_id + (lane_id << 3)); } diff --git a/onnxruntime/core/framework/kernel_registry_manager.cc b/onnxruntime/core/framework/kernel_registry_manager.cc index c4eef5b27c1bb..b2ef853119588 100644 --- a/onnxruntime/core/framework/kernel_registry_manager.cc +++ b/onnxruntime/core/framework/kernel_registry_manager.cc @@ -62,8 +62,13 @@ Status KernelRegistryManager::SearchKernelRegistry(const Node& node, auto create_error_message = [&node, &status](const std::string& prefix) { std::ostringstream errormsg; - errormsg << prefix << node.OpType() << "(" << node.SinceVersion() << ")"; - errormsg << " (node:'" << node.Name() << "' ep:'" << node.GetExecutionProviderType() << "'). "; + errormsg << prefix; + const auto& domain = node.Domain(); + if (!domain.empty()) { + errormsg << domain << "."; + } + errormsg << node.OpType() << "(" << node.SinceVersion() << ")" + << " (node:'" << node.Name() << "' ep:'" << node.GetExecutionProviderType() << "'). "; if (!status.IsOK()) errormsg << status.ErrorMessage(); diff --git a/onnxruntime/core/graph/contrib_ops/internal_nhwc_onnx_schemas.cc b/onnxruntime/core/graph/contrib_ops/internal_nhwc_onnx_schemas.cc index d3fc5873cb274..03ad95260c0ad 100644 --- a/onnxruntime/core/graph/contrib_ops/internal_nhwc_onnx_schemas.cc +++ b/onnxruntime/core/graph/contrib_ops/internal_nhwc_onnx_schemas.cc @@ -90,12 +90,16 @@ void RegisterNHWCSchemaWithActivation(const RegistrationFunc& f, ::ONNX_NAMESPAC void OpSet_Internal_NHWC_ONNX::ForEachSchema(const std::function& fn) { // if the operator may be fused with an activation, use the WITH_ACTIVATION variant to add optional attributes // for the activation parameters. - // For now we only register operators from opset 11 on. Models can easily have their opset updated using ONNX tools + // We mainly register operators from opset 11 on . Models can easily have their opset updated using ONNX tools // so supporting older opsets is unnecessary. + // Older opsets are included on a per-operator basis as needed. 
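Stepping back to the MatMulFloatInt4Kernel change earlier in this diff: the new indexing derives a per-thread meta index t_meta_k = t_k / block_size and then selects the low or high nibble of the packed zero-point byte for that block. A standalone C++ sketch of that unpacking convention (not part of the patch; it assumes the same low-nibble-first packing and the 0x88 default used when no zero points are provided):

#include <cstdint>
#include <cstdio>
#include <vector>

// Dequantize one 4-bit element given its block scale and the packed zero points.
// Convention assumed from the kernel above: two 4-bit values per byte, low nibble first,
// and a missing zero_points buffer behaves like bytes of 0x88 (i.e. zero point 8).
float DequantizeInt4(uint8_t packed_pair, bool high_nibble, float scale,
                     const uint8_t* packed_zero_points, int meta_k) {
  uint8_t q = high_nibble ? (packed_pair >> 4) : (packed_pair & 0x0f);
  uint8_t zp_byte = packed_zero_points ? packed_zero_points[meta_k / 2] : uint8_t(0x88);
  uint8_t zp = (meta_k & 0x01) ? (zp_byte >> 4) : (zp_byte & 0x0f);
  return (static_cast<float>(q) - static_cast<float>(zp)) * scale;
}

int main() {
  // One block of 8 weights quantized with scale 0.5 and the default zero point of 8.
  std::vector<uint8_t> packed = {0x10, 0x32, 0x54, 0x76};  // quantized values 0..7
  for (int i = 0; i < 8; ++i) {
    float w = DequantizeInt4(packed[i / 2], (i & 1) != 0, 0.5f, nullptr, /*meta_k*/ 0);
    printf("%d -> %.2f\n", i, w);  // (q - 8) * 0.5
  }
  return 0;
}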
// NOTE: This should be in sync with GetLayoutSensitiveOps in // /onnxruntime/core/optimizer/transpose_optimization/transpose_optimizer.cc + REGISTER_NHWC_SCHEMA_WITH_ACTIVATION(fn, AveragePool, 7); + REGISTER_NHWC_SCHEMA_WITH_ACTIVATION(fn, AveragePool, 10); REGISTER_NHWC_SCHEMA_WITH_ACTIVATION(fn, AveragePool, 11); + REGISTER_NHWC_SCHEMA_WITH_ACTIVATION(fn, AveragePool, 19); REGISTER_NHWC_SCHEMA_WITH_ACTIVATION(fn, BatchNormalization, 9); REGISTER_NHWC_SCHEMA_WITH_ACTIVATION(fn, BatchNormalization, 14); @@ -106,16 +110,18 @@ void OpSet_Internal_NHWC_ONNX::ForEachSchema(const std::functionsecond; + } + } + auto domain_map = allow_official_onnx_release_only_final ? schema_registry->GetLastReleasedOpsetVersions(false) : schema_registry->GetLatestOpsetVersions(false); diff --git a/onnxruntime/core/mlas/inc/mlas_q4.h b/onnxruntime/core/mlas/inc/mlas_q4.h index f3bc2a2434ab3..7c7b729117e4a 100644 --- a/onnxruntime/core/mlas/inc/mlas_q4.h +++ b/onnxruntime/core/mlas/inc/mlas_q4.h @@ -39,7 +39,7 @@ typedef enum { * @brief Computes the number of bytes required to pack and int4-quantize * a weight matrix * @param QType type of block quantization - * @param N the number of columns of matrix B. + * @param N the number of columns of matrix B. * @param K the number of rows of matrix B. * @return size of the packing buffer, 0 if the operation is not yet supported. */ @@ -53,11 +53,11 @@ MlasQ4GemmPackBSize( /** * @brief Prepack and Quantize fp32 weight tensor to int4 blocks - * + * * @param QType type of block quantization * @param PackedBuf destination buffer * @param FpData the pointer to fp32 matrix - * @param N the number of columns of matrix B. + * @param N the number of columns of matrix B. * @param K the number of rows of matrix B. * @param ldb leading dimension of B */ @@ -257,14 +257,14 @@ MlasBlockwiseQuantMetaShape( * matrix shape [rows, columns], compute the shape of the * quantized matrix [q_rows, q_cols]. The quantized matrix * is in column major layout, with bits packed on the column. - * - * @tparam T - * @param block_size - * @param columnwise - * @param rows - * @param columns - * @param q_rows - * @param q_cols + * + * @tparam T + * @param block_size + * @param columnwise + * @param rows + * @param columns + * @param q_rows + * @param q_cols */ template void @@ -283,21 +283,22 @@ MlasBlockwiseQuantizedShape( * parameters (scales, zero points) are packed into separate matrices * all in column major layout for faster access during subsequent matrix * multiplication. 
- * + * * @tparam ElementT type of the input matrix element, usually floating point - * + * @tparam qbits number of bits used for quantization, 4 for int4 + * * @param dst points to the quantized matrix, shape [rows, columns] column major - * @param scales points to the scales matrix, column major + * @param scales points to the scales matrix, column major * @param zero_points points to the zero_points matrix, column major * @param src points to the floating point matrix, to be quantized, row major shape [rows, columns] * @param block_size size of the block to quantize, elements from the same block share the same scale and zero point * @param columnwise true when elements in a block are from the same column, false when elements in a block are from the same row - * @param rows - * @param columns - * @param leading_dimension - * @param thread_pool + * @param rows + * @param columns + * @param leading_dimension + * @param thread_pool */ -template +template void MlasQuantizeBlockwise( uint8_t* dst, @@ -318,19 +319,21 @@ MlasQuantizeBlockwise( * parameters (scales, zero points) are from separate matrices packed * in column major layout. Output is a floating point matrix in column * major layout for faster access during subsequent matrix multiplication. - * + * * @tparam ElementT type of the dequantized matrix element, usually floating point + * @tparam qbits number of bits used for quantization, 4 for int4 + * * @param dst points to dequantized matrix shape [rows, columns] column major * @param src points to quantized matrix, column major * @param scales points to quantization scales, column major * @param zero_points points to quantization zero points, column major * @param block_size size of the block to quantize, elements from the same block share the same scale and zero point * @param columnwise true when elements in a block are from the same column, false when elements in a block are from the same row - * @param rows - * @param columns - * @param thread_pool + * @param rows + * @param columns + * @param thread_pool */ -template +template void MlasDequantizeBlockwise( ElementT* dst, diff --git a/onnxruntime/core/mlas/lib/q4_dq.cpp b/onnxruntime/core/mlas/lib/q4_dq.cpp index 24a2212ba0714..fbd1030de8ab7 100644 --- a/onnxruntime/core/mlas/lib/q4_dq.cpp +++ b/onnxruntime/core/mlas/lib/q4_dq.cpp @@ -364,7 +364,7 @@ range2scalezp(float min, float max, ScaleT& scale, uint8_t& zp) } else { zp = (uint8_t)roundf(zero_point_fp); } - scale = static_cast(scale_f); + scale = ScaleT(scale_f); } template @@ -377,7 +377,7 @@ range2scale(float min, float max, ScaleT& scale) max = fabsf(max) > fabsf(min) ? 
max : min; - scale = static_cast(max / mid_fp); + scale = ScaleT(max / mid_fp); }; @@ -773,7 +773,7 @@ MlasBlockwiseQuantizedShape( ); -template +template void MlasQuantizeBlockwise( uint8_t* dst, @@ -791,50 +791,50 @@ MlasQuantizeBlockwise( switch (block_size) { case 16: if (columnwise) { - BlockwiseQuantizer::quantizeAndTranspose( + BlockwiseQuantizer::quantizeAndTranspose( dst, scales, zero_points, src, rows, columns, leading_dimension, thread_pool); } else { - BlockwiseQuantizer::quantizeAndTranspose( + BlockwiseQuantizer::quantizeAndTranspose( dst, scales, zero_points, src, rows, columns, leading_dimension, thread_pool); } break; case 32: if (columnwise) { - BlockwiseQuantizer::quantizeAndTranspose( + BlockwiseQuantizer::quantizeAndTranspose( dst, scales, zero_points, src, rows, columns, leading_dimension, thread_pool); } else { - BlockwiseQuantizer::quantizeAndTranspose( + BlockwiseQuantizer::quantizeAndTranspose( dst, scales, zero_points, src, rows, columns, leading_dimension, thread_pool); } break; case 64: if (columnwise) { - BlockwiseQuantizer::quantizeAndTranspose( + BlockwiseQuantizer::quantizeAndTranspose( dst, scales, zero_points, src, rows, columns, leading_dimension, thread_pool); } else { - BlockwiseQuantizer::quantizeAndTranspose( + BlockwiseQuantizer::quantizeAndTranspose( dst, scales, zero_points, src, rows, columns, leading_dimension, thread_pool); } break; case 128: if (columnwise) { - BlockwiseQuantizer::quantizeAndTranspose( + BlockwiseQuantizer::quantizeAndTranspose( dst, scales, zero_points, src, rows, columns, leading_dimension, thread_pool); } else { - BlockwiseQuantizer::quantizeAndTranspose( + BlockwiseQuantizer::quantizeAndTranspose( dst, scales, zero_points, src, rows, columns, leading_dimension, thread_pool); } break; case 256: if (columnwise) { - BlockwiseQuantizer::quantizeAndTranspose( + BlockwiseQuantizer::quantizeAndTranspose( dst, scales, zero_points, src, rows, columns, leading_dimension, thread_pool); } else { - BlockwiseQuantizer::quantizeAndTranspose( + BlockwiseQuantizer::quantizeAndTranspose( dst, scales, zero_points, src, rows, columns, leading_dimension, thread_pool); } break; @@ -847,7 +847,7 @@ MlasQuantizeBlockwise( template void -MlasQuantizeBlockwise( +MlasQuantizeBlockwise( uint8_t* dst, float* scales, uint8_t* zero_points, @@ -860,8 +860,23 @@ MlasQuantizeBlockwise( MLAS_THREADPOOL* thread_pool ); +template +void +MlasQuantizeBlockwise( + uint8_t* dst, + MLAS_FP16* scales, + uint8_t* zero_points, + const MLAS_FP16* src, + int block_size, + bool columnwise, + int rows, + int columns, + int leading_dimension, + MLAS_THREADPOOL* thread_pool + ); + -template +template void MlasDequantizeBlockwise( T* dst, @@ -878,46 +893,46 @@ MlasDequantizeBlockwise( switch (block_size) { case 16: if (columnwise) { - BlockwiseQuantizer::dequantize(dst, src, scales, zero_points, rows, + BlockwiseQuantizer::dequantize(dst, src, scales, zero_points, rows, columns, thread_pool); } else { - BlockwiseQuantizer::dequantize(dst, src, scales, zero_points, rows, + BlockwiseQuantizer::dequantize(dst, src, scales, zero_points, rows, columns, thread_pool); } break; case 32: if (columnwise) { - BlockwiseQuantizer::dequantize(dst, src, scales, zero_points, rows, + BlockwiseQuantizer::dequantize(dst, src, scales, zero_points, rows, columns, thread_pool); } else { - BlockwiseQuantizer::dequantize(dst, src, scales, zero_points, rows, + BlockwiseQuantizer::dequantize(dst, src, scales, zero_points, rows, columns, thread_pool); } break; case 64: if (columnwise) { - 
BlockwiseQuantizer::dequantize(dst, src, scales, zero_points, rows, + BlockwiseQuantizer::dequantize(dst, src, scales, zero_points, rows, columns, thread_pool); } else { - BlockwiseQuantizer::dequantize(dst, src, scales, zero_points, rows, + BlockwiseQuantizer::dequantize(dst, src, scales, zero_points, rows, columns, thread_pool); } break; case 128: if (columnwise) { - BlockwiseQuantizer::dequantize(dst, src, scales, zero_points, rows, + BlockwiseQuantizer::dequantize(dst, src, scales, zero_points, rows, columns, thread_pool); } else { - BlockwiseQuantizer::dequantize(dst, src, scales, zero_points, + BlockwiseQuantizer::dequantize(dst, src, scales, zero_points, rows, columns, thread_pool); } break; case 256: if (columnwise) { - BlockwiseQuantizer::dequantize(dst, src, scales, zero_points, rows, + BlockwiseQuantizer::dequantize(dst, src, scales, zero_points, rows, columns, thread_pool); } else { - BlockwiseQuantizer::dequantize(dst, src, scales, zero_points, + BlockwiseQuantizer::dequantize(dst, src, scales, zero_points, rows, columns, thread_pool); } break; @@ -929,7 +944,7 @@ MlasDequantizeBlockwise( template void -MlasDequantizeBlockwise( +MlasDequantizeBlockwise( float* dst, const uint8_t* src, const float* scales, diff --git a/onnxruntime/core/optimizer/graph_transformer_utils.cc b/onnxruntime/core/optimizer/graph_transformer_utils.cc index 86b126f2c7c31..c1397e92d9d26 100644 --- a/onnxruntime/core/optimizer/graph_transformer_utils.cc +++ b/onnxruntime/core/optimizer/graph_transformer_utils.cc @@ -51,6 +51,7 @@ #include "core/optimizer/matmul_scale_fusion.h" #include "core/optimizer/matmul_transpose_fusion.h" #include "core/optimizer/matmul_bn_fusion.h" +#include "core/optimizer/pad_fusion.h" #include "core/optimizer/nchwc_transformer.h" #include "core/optimizer/noop_elimination.h" #include "core/optimizer/not_where_fusion.h" @@ -128,6 +129,7 @@ InlinedVector> GenerateRewriteRules( rules.push_back(std::make_unique()); rules.push_back(std::make_unique()); rules.push_back(std::make_unique()); + rules.push_back(std::make_unique()); rules.push_back(std::make_unique()); rules.push_back(std::make_unique()); rules.push_back(std::make_unique()); diff --git a/onnxruntime/core/optimizer/layout_transformation/layout_transformation.cc b/onnxruntime/core/optimizer/layout_transformation/layout_transformation.cc index 290380cabb036..4505d4afdf1e0 100644 --- a/onnxruntime/core/optimizer/layout_transformation/layout_transformation.cc +++ b/onnxruntime/core/optimizer/layout_transformation/layout_transformation.cc @@ -66,17 +66,6 @@ bool ConvertNodeLayout(const api::NodeRef& node) { const auto& layout_sensitive_ops = GetORTLayoutSensitiveOps(); // handle special cases -#if defined(USE_XNNPACK) - if (node.GetExecutionProviderType() == kXnnpackExecutionProvider) { - if (node.OpType() == "Resize") { - // XNNPACK supports NCHW and NHWC for Resize so we don't need to use the internal NHWC domain and wrap the Resize - // with Transpose nodes. EPAwareHandleResize will allow an NCHW <-> NHWC Transpose to be pushed through - // the Resize during transpose optimization. 
-      return false;
-    }
-  }
-#endif
-
 #if defined(USE_JSEP)
   // TODO(fs-eire): Remove special case handing of JSEP once NHWC Resize implementation is fixed
   if (node.GetExecutionProviderType() == kJsExecutionProvider) {
diff --git a/onnxruntime/core/optimizer/pad_fusion.cc b/onnxruntime/core/optimizer/pad_fusion.cc
new file mode 100644
index 0000000000000..b25e7618802dd
--- /dev/null
+++ b/onnxruntime/core/optimizer/pad_fusion.cc
@@ -0,0 +1,128 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
+#include "core/optimizer/pad_fusion.h"
+#include "core/graph/graph_utils.h"
+#include "core/optimizer/initializer.h"
+#include "core/optimizer/utils.h"
+
+namespace onnxruntime {
+
+/*
+ * This rule matches the following pattern:
+ *        Pad
+ *         |
+ *   Conv/MaxPool
+ */
+bool PadFusion::SatisfyCondition(const Graph& graph, const Node& node, const logging::Logger&) const {
+  // If the Pad node has the optional 'axes' input, don't fuse it.
+  if (!graph_utils::IsSupportedOptypeVersionAndDomain(node, "Pad", {1, 2, 11, 13, 18, 19}) ||
+      node.GetOutputEdgesCount() != 1 ||
+      node.InputDefs().size() > 3) {
+    return false;
+  }
+
+  if (graph.NodeProducesGraphOutput(node)) {
+    return false;
+  }
+
+  const Node& child_node = *node.OutputNodesBegin();
+  if (!graph_utils::IsSupportedOptypeVersionAndDomain(child_node, "Conv", {1, 11}) &&
+      !graph_utils::IsSupportedOptypeVersionAndDomain(child_node, "MaxPool", {1, 8, 10, 11, 12})) {
+    return false;
+  }
+
+  // Don't fuse if MaxPool has the optional output indices tensor, because the output indices
+  // do not incorporate the pad values. If we allowed the fusion, the dimensions of the input
+  // tensor seen by MaxPool would be smaller than without the fusion, so the range of values
+  // in the output indices tensor would be smaller than it should be.
+  if (child_node.OutputDefs().size() > 1) {
+    return false;
+  }
+
+  // The Conv or MaxPool node must use explicit padding (auto_pad == "NOTSET") to perform this fusion.
+  if (child_node.GetAttributes().find("auto_pad") != child_node.GetAttributes().end() &&
+      child_node.GetAttributes().at("auto_pad").s() != "NOTSET") {
+    return false;
+  }
+
+  const NodeAttributes& pad_attributes = node.GetAttributes();
+  if (pad_attributes.find("mode") != pad_attributes.end() &&
+      pad_attributes.at("mode").s() != "constant") {
+    return false;
+  }
+
+  // Since opset 11, pads and constant_value are provided as inputs instead of attributes.
+  // Both of these must be initializers because we have to verify their values.
+  if (node.SinceVersion() >= 11) {
+    if (!graph_utils::NodeArgIsConstant(graph, *node.InputDefs()[1]) ||
+        (node.InputDefs().size() > 2 && !graph_utils::NodeArgIsConstant(graph, *node.InputDefs()[2]))) {
+      return false;
+    }
+
+    // constant_value must be zero because Conv and MaxPool only support 0 as the padding value.
+    if (node.InputDefs().size() > 2) {
+      const auto* pad_constant_value_proto = graph_utils::GetConstantInitializer(graph, node.InputDefs()[2]->Name());
+      Initializer pad_constant_value{*pad_constant_value_proto, graph.ModelPath()};
+      if (std::any_of(pad_constant_value.DataAsByteSpan().begin(), pad_constant_value.DataAsByteSpan().end(), [](const uint8_t byte) { return byte != 0; })) {
+        return false;
+      }
+    }
+  } else {
+    if (pad_attributes.find("value") != pad_attributes.end() &&
+        pad_attributes.at("value").f() != 0.0) {
+      return false;
+    }
+  }
+
+  return true;
+}
+
+/*
+ * - The pads values for the first two dimensions (batch and channel) must be zero, and the
+ *   remaining pads values must be >= 0.
+ */
+Status PadFusion::Apply(Graph& graph, Node& pad_node, RewriteRuleEffect& rule_effect, const logging::Logger&) const {
+  std::vector<int64_t> pads_values;
+
+  if (pad_node.SinceVersion() >= 11) {
+    const auto* pads_proto = graph_utils::GetConstantInitializer(graph, pad_node.InputDefs()[1]->Name());
+    Initializer pads{*pads_proto, graph.ModelPath()};
+    pads_values.assign(pads.DataAsSpan<int64_t>().begin(), pads.DataAsSpan<int64_t>().end());
+  } else {
+    pads_values.assign(pad_node.GetAttributes().at("pads").ints().begin(), pad_node.GetAttributes().at("pads").ints().end());
+  }
+
+  assert(static_cast<uint32_t>(pads_values.size()) == (2 * static_cast<uint32_t>(pad_node.InputDefs()[0]->Shape()->dim_size())));
+
+  uint32_t pads_size = static_cast<uint32_t>(pads_values.size());
+  // Only fuse if padding is applied exclusively to the spatial dims (not the batch or channel dims).
+  if (pads_values[0] != 0 || pads_values[1] != 0 || pads_values[pads_size / 2] != 0 ||
+      pads_values[pads_size / 2 + 1] != 0) {
+    return Status::OK();
+  }
+
+  // Only fuse if all padding values are non-negative.
+  if (std::any_of(pads_values.begin(), pads_values.end(), [](int64_t value) { return value < 0; })) {
+    return Status::OK();
+  }
+
+  Node& child_node = *graph.GetNode(pad_node.OutputNodesBegin()->Index());
+  auto child_pads = child_node.GetMutableAttributes()["pads"].mutable_ints();
+  uint32_t child_pads_size = static_cast<uint32_t>(child_pads->size());
+
+  for (uint32_t pads_index = 2, child_index = 0; pads_index < pads_size / 2; pads_index++, child_index++) {
+    child_pads->Set(child_index, child_pads->Get(child_index) + pads_values[pads_index]);
+    uint32_t mirrored_child_index = child_index + (child_pads_size / 2);
+    uint32_t mirrored_pad_index = pads_index + (pads_size / 2);
+    child_pads->Set(mirrored_child_index, child_pads->Get(mirrored_child_index) + pads_values[mirrored_pad_index]);
+  }
+
+  graph_utils::RemoveNodeOutputEdges(graph, pad_node);
+  graph_utils::ReplaceNodeInput(child_node, 0, *pad_node.MutableInputDefs()[0]);
+  graph.RemoveNode(pad_node.Index());
+  rule_effect = RewriteRuleEffect::kRemovedCurrentNode;
+  return Status::OK();
+}
+}  // namespace onnxruntime
\ No newline at end of file
diff --git a/onnxruntime/core/optimizer/pad_fusion.h b/onnxruntime/core/optimizer/pad_fusion.h
new file mode 100644
index 0000000000000..a1b6978a83d1e
--- /dev/null
+++ b/onnxruntime/core/optimizer/pad_fusion.h
@@ -0,0 +1,27 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
+#pragma once
+
+#include "core/optimizer/rewrite_rule.h"
+
+namespace onnxruntime {
+/*
+ * This fusion merges a Pad operator into its child
+ * Conv or MaxPool operator, if and only if PadFusion::SatisfyCondition()
+ * is true.
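+ *
+ * For example (illustrative values, not taken from a real model): with an NCHW input, a
+ * constant-mode Pad with pads = {0, 0, 1, 2, 0, 0, 3, 4} followed by a Conv with
+ * pads = {1, 1, 1, 1} is rewritten to a single Conv with pads = {2, 3, 4, 5}: the Pad's
+ * spatial begin/end amounts are added to the Conv's begin/end pads, and the zero
+ * batch/channel pads are dropped.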
+ */ +class PadFusion : public RewriteRule { + public: + PadFusion() : RewriteRule("Pad_Fusion") {} + + std::vector TargetOpTypes() const noexcept override { + return {"Pad"}; + } + + private: + bool SatisfyCondition(const Graph& graph, const Node& node, const logging::Logger& logger) const override; + + Status Apply(Graph& graph, Node& matmul_node, RewriteRuleEffect& rule_effect, const logging::Logger& logger) const override; +}; +} // namespace onnxruntime \ No newline at end of file diff --git a/onnxruntime/core/optimizer/transpose_optimization/ort_transpose_optimization.cc b/onnxruntime/core/optimizer/transpose_optimization/ort_transpose_optimization.cc index f4f3505128737..8eaac3d34c3af 100644 --- a/onnxruntime/core/optimizer/transpose_optimization/ort_transpose_optimization.cc +++ b/onnxruntime/core/optimizer/transpose_optimization/ort_transpose_optimization.cc @@ -17,7 +17,7 @@ static bool EPAwareHandleResize(HandlerArgs& args) { // layout. Due to that, only push a Transpose through a Resize once it is assigned and we know it's being handled // by an EP that supports multiple layouts. Currently that's the CPU and XNNPACK EPs. const auto ep_type = args.node.GetExecutionProviderType(); - if (ep_type == kCpuExecutionProvider || ep_type == kXnnpackExecutionProvider) { + if (ep_type == kCpuExecutionProvider) { // allow NCHW <-> NHWC for now. not clear any other sort of transpose has a valid usage in a real model int64_t rank_int = gsl::narrow_cast(args.perm.size()); if (rank_int == 4) { diff --git a/onnxruntime/core/providers/cpu/generator/random.cc b/onnxruntime/core/providers/cpu/generator/random.cc index b63c0d2161ad5..dfa27f1f44d5a 100644 --- a/onnxruntime/core/providers/cpu/generator/random.cc +++ b/onnxruntime/core/providers/cpu/generator/random.cc @@ -428,4 +428,14 @@ template Status MultinomialComputeShared(AllocatorPtr& alloc, std::default_random_engine& generator, Tensor& Y); +#if !defined(DISABLE_CONTRIB_OPS) +// used by onnxruntime/contrib_ops/cpu/transformers/sampling_cpu_helper.h +template Status MultinomialComputeShared(AllocatorPtr& alloc, + const Tensor& X, + const int64_t batch_size, + const int64_t num_classes, + const int64_t num_samples, + std::default_random_engine& generator, + Tensor& Y); +#endif } // namespace onnxruntime diff --git a/onnxruntime/core/providers/cpu/math/element_wise_ops.cc b/onnxruntime/core/providers/cpu/math/element_wise_ops.cc index 3192c8573c5c0..1d524a90302e7 100644 --- a/onnxruntime/core/providers/cpu/math/element_wise_ops.cc +++ b/onnxruntime/core/providers/cpu/math/element_wise_ops.cc @@ -967,7 +967,7 @@ Status Xor::Compute(OpKernelContext* context) const { }, [](BroadcastHelper& per_iter_bh) { per_iter_bh.OutputEigen() = - per_iter_bh.EigenInput0().array() ^ per_iter_bh.EigenInput1().array(); + per_iter_bh.EigenInput0().array() != per_iter_bh.EigenInput1().array(); }}; UntypedBroadcastTwo(*context, funcs, 1.0); diff --git a/onnxruntime/core/providers/cpu/tensor/upsamplebase.h b/onnxruntime/core/providers/cpu/tensor/upsamplebase.h index 99522cdf0759a..0b3ce6f477843 100644 --- a/onnxruntime/core/providers/cpu/tensor/upsamplebase.h +++ b/onnxruntime/core/providers/cpu/tensor/upsamplebase.h @@ -352,7 +352,7 @@ class UpsampleBase { (scales.size() == 4 && scales[0] == 1 && scales[3] == 1) || scales.size() == 3 || (scales.size() == 5 && scales[0] == 1 && scales[1] == 1), - "'Linear' mode only support:\n" + "'Linear' mode only supports:\n" " * 2-D inputs or\n" " * 3-D inputs ('Bilinear', 'Trilinear') or\n" " * 4-D inputs with the corresponding 
outermost 2 scale values being 1" diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlRuntimeFusedGraphKernel.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlRuntimeFusedGraphKernel.cpp index 1db22ac92e527..5c7b7bff1e370 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlRuntimeFusedGraphKernel.cpp +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlRuntimeFusedGraphKernel.cpp @@ -93,7 +93,7 @@ namespace Dml onnxruntime::Status Compute(onnxruntime::OpKernelContext* kernelContext) const override { - ORT_THROW_HR_IF(E_UNEXPECTED, m_subgraphInputs.size() != kernelContext->InputCount()); + ORT_THROW_HR_IF(E_UNEXPECTED, static_cast(m_subgraphInputs.size()) != kernelContext->InputCount()); bool recompileNeeded = m_compiledExecutionPlanOperator == nullptr; @@ -159,7 +159,7 @@ namespace Dml if (iter != m_inferredInputShapes.end()) { auto tensorShape = *nodeArg->Shape(); - ORT_THROW_HR_IF(E_UNEXPECTED, tensorShape.dim_size() != iter->second.NumDimensions()); + ORT_THROW_HR_IF(E_UNEXPECTED, tensorShape.dim_size() != static_cast(iter->second.NumDimensions())); for (int i = 0; i < tensorShape.dim_size(); ++i) { diff --git a/onnxruntime/core/providers/js/js_execution_provider.cc b/onnxruntime/core/providers/js/js_execution_provider.cc index 8dbd552dd0550..798244d7cb75b 100644 --- a/onnxruntime/core/providers/js/js_execution_provider.cc +++ b/onnxruntime/core/providers/js/js_execution_provider.cc @@ -238,18 +238,40 @@ class ONNX_OPERATOR_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 16, Whe class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 1, 12, Transpose); class ONNX_OPERATOR_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 13, Transpose); +class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 1, 10, Conv); +class ONNX_OPERATOR_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 11, Conv); +class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kJsExecutionProvider, kMSInternalNHWCDomain, 1, 10, Conv); class ONNX_OPERATOR_KERNEL_CLASS_NAME(kJsExecutionProvider, kMSInternalNHWCDomain, 11, Conv); + +class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 1, 10, ConvTranspose); +class ONNX_OPERATOR_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 11, ConvTranspose); +class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kJsExecutionProvider, kMSInternalNHWCDomain, 1, 10, ConvTranspose); class ONNX_OPERATOR_KERNEL_CLASS_NAME(kJsExecutionProvider, kMSInternalNHWCDomain, 11, ConvTranspose); + +class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 1, 7, MaxPool); +class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 8, 9, MaxPool); +class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 10, 10, MaxPool); +class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 11, 11, MaxPool); +class ONNX_OPERATOR_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 12, MaxPool); +class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kJsExecutionProvider, kMSInternalNHWCDomain, 1, 7, MaxPool); +class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kJsExecutionProvider, kMSInternalNHWCDomain, 8, 9, MaxPool); +class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kJsExecutionProvider, kMSInternalNHWCDomain, 10, 10, MaxPool); class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kJsExecutionProvider, kMSInternalNHWCDomain, 11, 11, MaxPool); class 
ONNX_OPERATOR_KERNEL_CLASS_NAME(kJsExecutionProvider, kMSInternalNHWCDomain, 12, MaxPool); + +class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 7, 9, AveragePool); +class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 10, 10, AveragePool); +class ONNX_OPERATOR_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 11, AveragePool); +class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kJsExecutionProvider, kMSInternalNHWCDomain, 7, 9, AveragePool); +class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kJsExecutionProvider, kMSInternalNHWCDomain, 10, 10, AveragePool); class ONNX_OPERATOR_KERNEL_CLASS_NAME(kJsExecutionProvider, kMSInternalNHWCDomain, 11, AveragePool); + +class ONNX_OPERATOR_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 1, GlobalAveragePool); class ONNX_OPERATOR_KERNEL_CLASS_NAME(kJsExecutionProvider, kMSInternalNHWCDomain, 1, GlobalAveragePool); + +class ONNX_OPERATOR_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 1, GlobalMaxPool); class ONNX_OPERATOR_KERNEL_CLASS_NAME(kJsExecutionProvider, kMSInternalNHWCDomain, 1, GlobalMaxPool); -class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 1, 10, Conv); -class ONNX_OPERATOR_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 11, Conv); -class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 1, 10, ConvTranspose); -class ONNX_OPERATOR_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 11, ConvTranspose); class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 7, 8, Gemm); class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 9, 10, Gemm); class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 11, 12, Gemm); @@ -257,17 +279,6 @@ class ONNX_OPERATOR_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 13, Gem class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 1, 12, MatMul); class ONNX_OPERATOR_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 13, MatMul); -class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 7, 9, AveragePool); -class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 10, 10, AveragePool); -class ONNX_OPERATOR_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 11, AveragePool); -class ONNX_OPERATOR_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 1, GlobalAveragePool); -class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 1, 7, MaxPool); -class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 8, 9, MaxPool); -class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 10, 10, MaxPool); -class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 11, 11, MaxPool); -class ONNX_OPERATOR_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 12, MaxPool); -class ONNX_OPERATOR_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 1, GlobalMaxPool); - class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 1, 10, float, ArgMax); class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 11, 12, float, ArgMax); class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 13, float, ArgMax); @@ -291,11 +302,17 @@ class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomai class 
ONNX_OPERATOR_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 18, Split); class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 8, 12, Expand); class ONNX_OPERATOR_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 13, Expand); + class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 10, 10, Resize); class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 11, 12, Resize); class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 13, 17, Resize); class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 18, 18, Resize); class ONNX_OPERATOR_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 19, Resize); +class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kJsExecutionProvider, kMSInternalNHWCDomain, 10, 10, Resize); +class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kJsExecutionProvider, kMSInternalNHWCDomain, 11, 12, Resize); +class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kJsExecutionProvider, kMSInternalNHWCDomain, 13, 17, Resize); +class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kJsExecutionProvider, kMSInternalNHWCDomain, 18, 18, Resize); +class ONNX_OPERATOR_KERNEL_CLASS_NAME(kJsExecutionProvider, kMSInternalNHWCDomain, 19, Resize); class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 1, 10, Gather); class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 11, 12, Gather); @@ -304,11 +321,6 @@ class ONNX_OPERATOR_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 13, Gat class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 11, 12, GatherElements); class ONNX_OPERATOR_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 13, GatherElements); -class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kJsExecutionProvider, kMSInternalNHWCDomain, 11, 12, Resize); -class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kJsExecutionProvider, kMSInternalNHWCDomain, 13, 17, Resize); -class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kJsExecutionProvider, kMSInternalNHWCDomain, 18, 18, Resize); -class ONNX_OPERATOR_KERNEL_CLASS_NAME(kJsExecutionProvider, kMSInternalNHWCDomain, 19, Resize); - class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 1, 9, Slice); class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 10, 10, Slice); class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 11, 12, Slice); @@ -322,8 +334,9 @@ class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomai class ONNX_OPERATOR_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 13, Tile); class ONNX_OPERATOR_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 17, LayerNormalization); -class ONNX_OPERATOR_KERNEL_CLASS_NAME(kJsExecutionProvider, kMSInternalNHWCDomain, 6, InstanceNormalization); + class ONNX_OPERATOR_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 6, InstanceNormalization); +class ONNX_OPERATOR_KERNEL_CLASS_NAME(kJsExecutionProvider, kMSInternalNHWCDomain, 6, InstanceNormalization); class ONNX_OPERATOR_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 11, Range); @@ -508,18 +521,40 @@ std::unique_ptr RegisterKernels() { BuildKernelCreateInfo, BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, BuildKernelCreateInfo, + + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, BuildKernelCreateInfo, + + BuildKernelCreateInfo, + 
BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, BuildKernelCreateInfo, BuildKernelCreateInfo, + + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, BuildKernelCreateInfo, + + BuildKernelCreateInfo, BuildKernelCreateInfo, + + BuildKernelCreateInfo, BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, BuildKernelCreateInfo, BuildKernelCreateInfo, BuildKernelCreateInfo, @@ -527,17 +562,6 @@ std::unique_ptr RegisterKernels() { BuildKernelCreateInfo, BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, BuildKernelCreateInfo, BuildKernelCreateInfo, @@ -575,7 +599,7 @@ std::unique_ptr RegisterKernels() { BuildKernelCreateInfo, BuildKernelCreateInfo, BuildKernelCreateInfo, - + BuildKernelCreateInfo, BuildKernelCreateInfo, BuildKernelCreateInfo, BuildKernelCreateInfo, @@ -594,8 +618,9 @@ std::unique_ptr RegisterKernels() { BuildKernelCreateInfo, BuildKernelCreateInfo, - BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, BuildKernelCreateInfo, diff --git a/onnxruntime/core/providers/js/operators/conv.cc b/onnxruntime/core/providers/js/operators/conv.cc index 68336c996a863..474fd260880ce 100644 --- a/onnxruntime/core/providers/js/operators/conv.cc +++ b/onnxruntime/core/providers/js/operators/conv.cc @@ -25,6 +25,13 @@ ONNX_OPERATOR_KERNEL_EX( (*KernelDefBuilder::Create()).TypeConstraint("T", JsepSupportedFloatTypes()), Conv); +ONNX_OPERATOR_VERSIONED_KERNEL_EX( + Conv, + kMSInternalNHWCDomain, + 1, 10, + kJsExecutionProvider, + (*KernelDefBuilder::Create()).TypeConstraint("T", JsepSupportedFloatTypes()), + Conv); ONNX_OPERATOR_VERSIONED_KERNEL_EX( Conv, kOnnxDomain, diff --git a/onnxruntime/core/providers/js/operators/conv_transpose.cc b/onnxruntime/core/providers/js/operators/conv_transpose.cc index f7f0ab22b7006..2aaf438f30d4d 100644 --- a/onnxruntime/core/providers/js/operators/conv_transpose.cc +++ b/onnxruntime/core/providers/js/operators/conv_transpose.cc @@ -24,6 +24,13 @@ ONNX_OPERATOR_KERNEL_EX( (*KernelDefBuilder::Create()).TypeConstraint("T", JsepSupportedFloatTypes()), ConvTranspose); +ONNX_OPERATOR_VERSIONED_KERNEL_EX( + ConvTranspose, + kMSInternalNHWCDomain, + 1, 10, + kJsExecutionProvider, + (*KernelDefBuilder::Create()).TypeConstraint("T", JsepSupportedFloatTypes()), + ConvTranspose); ONNX_OPERATOR_VERSIONED_KERNEL_EX( ConvTranspose, kOnnxDomain, diff --git a/onnxruntime/core/providers/js/operators/pool.cc b/onnxruntime/core/providers/js/operators/pool.cc index 7fdb4e5d114ea..7df1e483f52a1 100644 --- a/onnxruntime/core/providers/js/operators/pool.cc +++ b/onnxruntime/core/providers/js/operators/pool.cc @@ -52,15 +52,20 @@ namespace js { Pool); POOLING_KERNEL_VERSIONED(AveragePool, kOnnxDomain, false, AveragePool, 7, 9) +POOLING_KERNEL_VERSIONED(AveragePool, kMSInternalNHWCDomain, true, AveragePool, 7, 9) POOLING_KERNEL_VERSIONED(AveragePool, kOnnxDomain, false, AveragePool, 10, 10) +POOLING_KERNEL_VERSIONED(AveragePool, kMSInternalNHWCDomain, true, AveragePool, 10, 10) POOLING_KERNEL(AveragePool, kOnnxDomain, false, AveragePool, 11) 
POOLING_KERNEL(AveragePool, kMSInternalNHWCDomain, true, AveragePool, 11) POOLING_KERNEL(GlobalAveragePool, kOnnxDomain, false, AveragePool, 1) POOLING_KERNEL(GlobalAveragePool, kMSInternalNHWCDomain, true, AveragePool, 1) POOLING_KERNEL_VERSIONED(MaxPool, kOnnxDomain, false, MaxPool<1>, 1, 7) +POOLING_KERNEL_VERSIONED(MaxPool, kMSInternalNHWCDomain, true, MaxPool<1>, 1, 7) POOLING_KERNEL_VERSIONED_WITH_INDICES(MaxPool, kOnnxDomain, false, MaxPool<8>, 8, 9) +POOLING_KERNEL_VERSIONED_WITH_INDICES(MaxPool, kMSInternalNHWCDomain, true, MaxPool<8>, 8, 9) POOLING_KERNEL_VERSIONED_WITH_INDICES(MaxPool, kOnnxDomain, false, MaxPool<8>, 10, 10) +POOLING_KERNEL_VERSIONED_WITH_INDICES(MaxPool, kMSInternalNHWCDomain, true, MaxPool<8>, 10, 10) POOLING_KERNEL_VERSIONED_WITH_INDICES(MaxPool, kOnnxDomain, false, MaxPool<8>, 11, 11) POOLING_KERNEL_VERSIONED_WITH_INDICES(MaxPool, kMSInternalNHWCDomain, true, MaxPool<8>, 11, 11) POOLING_KERNEL_WITH_INDICES(MaxPool, kOnnxDomain, false, MaxPool<8>, 12) diff --git a/onnxruntime/core/providers/js/operators/resize.cc b/onnxruntime/core/providers/js/operators/resize.cc index 5b2e385777a37..2514ab75dff26 100644 --- a/onnxruntime/core/providers/js/operators/resize.cc +++ b/onnxruntime/core/providers/js/operators/resize.cc @@ -51,6 +51,7 @@ namespace js { REGISTER_RESIZE_KERNEL(domain, 19); REGISTER_RESIZE_VERSIONED_10_10_KERNEL(kOnnxDomain); +REGISTER_RESIZE_VERSIONED_10_10_KERNEL(kMSInternalNHWCDomain); REGISTER_RESIZE_KERNEL_DOMAIN(kOnnxDomain); REGISTER_RESIZE_KERNEL_DOMAIN(kMSInternalNHWCDomain); diff --git a/onnxruntime/core/providers/qnn/builder/opbuilder/pad_op_builder.cc b/onnxruntime/core/providers/qnn/builder/opbuilder/pad_op_builder.cc index 2dfdfffe5fa54..fc8c5c357682c 100644 --- a/onnxruntime/core/providers/qnn/builder/opbuilder/pad_op_builder.cc +++ b/onnxruntime/core/providers/qnn/builder/opbuilder/pad_op_builder.cc @@ -202,16 +202,8 @@ Status PadOpBuilder::ProcessAttributesAndOutputs(QnnModelWrapper& qnn_model_wrap // Qnn format is begin_0, end_0, begin_1, end_1, ... 
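The comment above states the target layout; a minimal standalone sketch of the interleaving that ReArranagePads (called just below) presumably performs, assuming the ONNX ordering [begin_0, ..., begin_{n-1}, end_0, ..., end_{n-1}] and an unsigned element type:

#include <cstdint>
#include <vector>

// Reorder ONNX pad amounts [b0, ..., b(n-1), e0, ..., e(n-1)] into the interleaved
// QNN layout [b0, e0, b1, e1, ...]. The real helper may do this in place.
std::vector<uint32_t> InterleavePads(const std::vector<uint32_t>& onnx_pads) {
  const size_t rank = onnx_pads.size() / 2;
  std::vector<uint32_t> qnn_pads;
  qnn_pads.reserve(onnx_pads.size());
  for (size_t i = 0; i < rank; ++i) {
    qnn_pads.push_back(onnx_pads[i]);         // begin_i
    qnn_pads.push_back(onnx_pads[rank + i]);  // end_i
  }
  return qnn_pads;
}

// e.g. {1, 2, 3, 4} (begins 1,2 / ends 3,4) -> {1, 3, 2, 4}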
ReArranagePads(pad_amount); - std::vector pad_amount_dim{static_cast(pad_amount.size() / 2), static_cast(2)}; - QnnParamWrapper multiples_param(node_unit.Index(), node_unit.Name(), QNN_OP_PAD_PARAM_PAD_AMOUNT, std::move(pad_amount_dim), - std::move(pad_amount)); - param_tensor_names.push_back(multiples_param.GetParamTensorName()); - qnn_model_wrapper.AddParamWrapper(std::move(multiples_param)); - - // Process optional input constant_value - if (node_unit.Inputs().size() > 2) { - ORT_RETURN_IF_ERROR(ProcessConstantValue(qnn_model_wrapper, param_tensor_names, node_unit, inputs[2])); - } // constant_value + std::vector input_shape; + ORT_RETURN_IF_NOT(qnn_model_wrapper.GetOnnxShape(inputs[0].node_arg, input_shape), "Cannot get shape of input 0."); NodeAttrHelper node_helper(node_unit); std::string mode = node_helper.Get("mode", "constant"); @@ -220,6 +212,10 @@ Status PadOpBuilder::ProcessAttributesAndOutputs(QnnModelWrapper& qnn_model_wrap if ("constant" == mode) { mode_qnn_scalar.uint32Value = QNN_OP_PAD_SCHEME_CONSTANT; } else if ("reflect" == mode) { + for (size_t i = 0; i < input_shape.size(); i++) { + ORT_RETURN_IF(pad_amount[i * 2] > input_shape[i] - 1 || pad_amount[(i * 2) + 1] > input_shape[i] - 1, + "Pad amount should not be greater than shape(input[0])[i] - 1"); + } mode_qnn_scalar.uint32Value = QNN_OP_PAD_SCHEME_MIRROR_REFLECT; } else if ("edge" == mode) { mode_qnn_scalar.uint32Value = QNN_OP_PAD_SCHEME_EDGE; @@ -227,10 +223,21 @@ Status PadOpBuilder::ProcessAttributesAndOutputs(QnnModelWrapper& qnn_model_wrap return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "Pad mode only support constant."); } + std::vector pad_amount_dim{static_cast(pad_amount.size() / 2), static_cast(2)}; QnnParamWrapper mode_param(node_unit.Index(), node_unit.Name(), QNN_OP_PAD_PARAM_SCHEME, mode_qnn_scalar); param_tensor_names.push_back(mode_param.GetParamTensorName()); qnn_model_wrapper.AddParamWrapper(std::move(mode_param)); + QnnParamWrapper multiples_param(node_unit.Index(), node_unit.Name(), QNN_OP_PAD_PARAM_PAD_AMOUNT, + std::move(pad_amount_dim), std::move(pad_amount)); + param_tensor_names.push_back(multiples_param.GetParamTensorName()); + qnn_model_wrapper.AddParamWrapper(std::move(multiples_param)); + + // Process optional input constant_value + if (node_unit.Inputs().size() > 2) { + ORT_RETURN_IF_ERROR(ProcessConstantValue(qnn_model_wrapper, param_tensor_names, node_unit, inputs[2])); + } // constant_value + ORT_RETURN_IF_ERROR(ProcessOutputs(qnn_model_wrapper, node_unit, std::move(input_names), std::move(param_tensor_names), diff --git a/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.cc b/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.cc index a1fc67ff60b6f..7f5ab3a772305 100644 --- a/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.cc +++ b/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.cc @@ -365,6 +365,46 @@ std::unique_lock TensorrtExecutionProvider::GetApiLock() const { return std::unique_lock(singleton); } +Status GetShapeOfShapeTensor(Ort::ConstValue& input_tensor, + std::vector& shape_values, + nvinfer1::ICudaEngine* trt_engine, + int binding_index, + cudaStream_t stream) { + auto tensor_info = input_tensor.GetTensorTypeAndShapeInfo(); + const auto tensor_shapes = tensor_info.GetShape(); + const auto tensor_type = tensor_info.GetElementType(); + nvinfer1::Dims dims = trt_engine->getBindingDimensions(static_cast(binding_index)); + int nb_dims = dims.nbDims; + int shape_size = nb_dims == 0 ? 
1 : static_cast(tensor_shapes[0]); // The shape of the "shape tensor" is either zero dimension (scalar) or 1-dimension + shape_values.resize(shape_size, 1); + + switch (tensor_type) { + case ONNX_TENSOR_ELEMENT_DATA_TYPE_INT32: { + auto input = std::make_unique(shape_size); + CUDA_RETURN_IF_ERROR(cudaMemcpyAsync(input.get(), input_tensor.GetTensorData(), shape_size * sizeof(int32_t), cudaMemcpyDeviceToHost, stream)); + CUDA_RETURN_IF_ERROR(cudaStreamSynchronize(stream)); + for (int j = 0; j < shape_size; ++j) { + shape_values[j] = input[j]; + } + break; + } + case ONNX_TENSOR_ELEMENT_DATA_TYPE_INT64: { + auto input = std::make_unique(shape_size); + CUDA_RETURN_IF_ERROR(cudaMemcpyAsync(input.get(), input_tensor.GetTensorData(), shape_size * sizeof(int64_t), cudaMemcpyDeviceToHost, stream)); + CUDA_RETURN_IF_ERROR(cudaStreamSynchronize(stream)); + for (int j = 0; j < shape_size; ++j) { + shape_values[j] = static_cast(input[j]); + } + break; + } + default: { + return ORT_MAKE_STATUS(ONNXRUNTIME, EP_FAIL, + "TensorRT shape tensor data type: " + std::to_string(tensor_type) + " not supported."); + } + } + return Status::OK(); +} + /* * Apply TensorRT optimization profile shapes from provider options. * @@ -404,7 +444,7 @@ bool ApplyProfileShapesFromProviderOptions(std::vectorisShapeTensor()) { - auto shape_size = nb_dims; + int shape_size = nb_dims == 0 ? 1 : static_cast(profile_min_shapes[input_name][i].size()); std::vector shapes_min(shape_size), shapes_opt(shape_size), shapes_max(shape_size); LOGS_DEFAULT(VERBOSE) << "[TensorRT EP] shape size of this shape tensor is " << shape_size; @@ -2758,7 +2798,17 @@ common::Status TensorrtExecutionProvider::Compile(const std::vectorisShapeBinding(binding_index)) { - trt_context->setInputShapeBinding(binding_index, &tensor_shape_values[input_name][0]); + // Get shape of the shape tensor + std::vector shape_values; + if (!tensor_shape_values[input_name].empty()) { + shape_values = tensor_shape_values[input_name]; + } else { + auto status = GetShapeOfShapeTensor(input_tensor, shape_values, trt_engine, binding_index, stream); + if (status != Status::OK()) { + return ORT_MAKE_STATUS(ONNXRUNTIME, EP_FAIL, status.ErrorMessage()); + } + } + trt_context->setInputShapeBinding(binding_index, &shape_values[0]); } else { for (int j = 0, end = nb_dims; j < end; ++j) { dimensions.d[j] = static_cast(tensor_shapes[j]); diff --git a/onnxruntime/core/providers/xnnpack/detail/node_support_checker.cc b/onnxruntime/core/providers/xnnpack/detail/node_support_checker.cc index ea5c75a955cc4..8e7e228f974e6 100644 --- a/onnxruntime/core/providers/xnnpack/detail/node_support_checker.cc +++ b/onnxruntime/core/providers/xnnpack/detail/node_support_checker.cc @@ -7,22 +7,22 @@ #include "core/common/common.h" #include "core/framework/op_node_proto_helper.h" -#include "core/graph/graph_viewer.h" #include "core/graph/graph_utils.h" +#include "core/graph/graph_viewer.h" #include "core/providers/common.h" #include "core/providers/cpu/nn/pool_attributes.h" -#include "core/providers/xnnpack/detail/utils.h" #include "core/providers/shared/node_unit/node_unit.h" +#include "core/providers/xnnpack/detail/utils.h" // each operator provides a helper to check if supported -#include "core/providers/xnnpack/nn/conv.h" -#include "core/providers/xnnpack/nn/conv_transpose.h" -#include "core/providers/xnnpack/nn/max_pool.h" #include "core/providers/xnnpack/math/gemm.h" #include "core/providers/xnnpack/math/matmul.h" +#include "core/providers/xnnpack/math/softmax.h" #include 
"core/providers/xnnpack/nn/average_pool.h" -#include "core/providers/xnnpack/nn/resize.h" -#include "core/providers/xnnpack/nn/softmax.h" +#include "core/providers/xnnpack/nn/conv.h" +#include "core/providers/xnnpack/nn/conv_transpose.h" +#include "core/providers/xnnpack/nn/max_pool.h" +#include "core/providers/xnnpack/tensor/resize.h" namespace onnxruntime { namespace xnnpack { diff --git a/onnxruntime/core/providers/xnnpack/detail/utils.cc b/onnxruntime/core/providers/xnnpack/detail/utils.cc index baca4eef537d7..1a32612981120 100644 --- a/onnxruntime/core/providers/xnnpack/detail/utils.cc +++ b/onnxruntime/core/providers/xnnpack/detail/utils.cc @@ -25,7 +25,7 @@ const char* OpTypeToString(OpComputeType opCtype) { case op_compute_type_fp16: return "fp16"; case op_compute_type_qs8_per_channel: - return "qc8"; + return "qs8_qc8w"; case op_compute_type_qs8: return "qs8"; case op_compute_type_qu8: diff --git a/onnxruntime/core/providers/xnnpack/math/gemm.cc b/onnxruntime/core/providers/xnnpack/math/gemm.cc index 24c233e2415ca..f7b736b0ff903 100644 --- a/onnxruntime/core/providers/xnnpack/math/gemm.cc +++ b/onnxruntime/core/providers/xnnpack/math/gemm.cc @@ -78,7 +78,7 @@ bool Gemm::IsOnnxNodeSupported(const NodeUnit& node_unit, const GraphViewer& gra return supported; } -Gemm::Gemm(const OpKernelInfo& info) : GemmBase(info), XnnpackKernel(info) { +Gemm::Gemm(const OpKernelInfo& info) : GemmBase(info), XnnpackKernel(info, /*enable_caches*/ true) { const auto& node{Node()}; info.GetAttrOrDefault("alpha", &alpha_, 1.f); @@ -146,14 +146,9 @@ Status Gemm::PrePack(const Tensor& tensor, int input_idx, AllocatorPtr, trans_B_ == CblasNoTrans ? B_->Shape()[1] : B_->Shape()[0], // size_t output_stride, B_->Data(), // const float* kernel, bias_Data, // const float* bias, - output_min, - output_max, + output_min, output_max, flags, -#ifdef XNN_CACHE_ENABLE - &xnn_caches_, -#else - 0, -#endif + GetCodeCache(), GetWeightsCache(), &p); if (status != xnn_status_success) { @@ -165,20 +160,25 @@ Status Gemm::PrePack(const Tensor& tensor, int input_idx, AllocatorPtr, } Status Gemm::Compute(OpKernelContext* context) const { - pthreadpool_t t_pool = GetThreadPool(); + pthreadpool_t threadpool = GetThreadPool(); const auto* A = context->Input(0); auto Y = context->Output(0, {M_, N_}); // if input is empty tensor, return as nothing need to be calculated and we've set the shape for the output - if (M_ == 0 || N_ == 0) + if (M_ == 0 || N_ == 0) { return Status::OK(); + } + + xnn_status status = xnn_reshape_fully_connected_nc_f32(op0_.get(), + // Number of rows to multiply + trans_A_ == CblasNoTrans ? M_ : K_, + threadpool); + + if (status != xnn_status_success) { + return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "xnn_reshape_fully_connected_nc_f32 returned ", status); + } - xnn_status status = xnn_setup_fully_connected_nc_f32( - op0_.get(), - trans_A_ == CblasNoTrans ? 
M_ : K_, // Number of rows to multiply - A->Data(), - Y->MutableData(), - t_pool); + status = xnn_setup_fully_connected_nc_f32(op0_.get(), A->Data(), Y->MutableData()); if (status != xnn_status_success) { return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "xnn_setup_fully_connected_nc_f32 returned ", status); @@ -192,7 +192,15 @@ Status Gemm::Compute(OpKernelContext* context) const { return Status::OK(); } -ONNX_OPERATOR_VERSIONED_KERNEL_EX(Gemm, kOnnxDomain, 7, 12, kXnnpackExecutionProvider, +ONNX_OPERATOR_VERSIONED_KERNEL_EX(Gemm, kOnnxDomain, 7, 8, kXnnpackExecutionProvider, + KernelDefBuilder().TypeConstraint("T", DataTypeImpl::GetTensorType()), + Gemm); + +ONNX_OPERATOR_VERSIONED_KERNEL_EX(Gemm, kOnnxDomain, 9, 10, kXnnpackExecutionProvider, + KernelDefBuilder().TypeConstraint("T", DataTypeImpl::GetTensorType()), + Gemm); + +ONNX_OPERATOR_VERSIONED_KERNEL_EX(Gemm, kOnnxDomain, 11, 12, kXnnpackExecutionProvider, KernelDefBuilder().TypeConstraint("T", DataTypeImpl::GetTensorType()), Gemm); diff --git a/onnxruntime/core/providers/xnnpack/math/gemm.h b/onnxruntime/core/providers/xnnpack/math/gemm.h index 9191ba204bc25..6d11a8531c20f 100644 --- a/onnxruntime/core/providers/xnnpack/math/gemm.h +++ b/onnxruntime/core/providers/xnnpack/math/gemm.h @@ -41,14 +41,6 @@ class Gemm : protected GemmBase, public XnnpackKernel { float alpha_; float beta_; - -#ifdef XNN_CACHE_ENABLE -#if XNN_PLATFORM_JIT - xnn_code_cache code_cache_; -#endif - xnn_caches xnn_caches_ = {0, 0}; - xnn_weights_cache weights_cache_; -#endif }; } // namespace xnnpack diff --git a/onnxruntime/core/providers/xnnpack/math/matmul.cc b/onnxruntime/core/providers/xnnpack/math/matmul.cc index fc7335c79b603..e90aa11c9d087 100644 --- a/onnxruntime/core/providers/xnnpack/math/matmul.cc +++ b/onnxruntime/core/providers/xnnpack/math/matmul.cc @@ -62,7 +62,7 @@ bool MatMul::IsOnnxNodeSupported(const NodeUnit& node_unit, const GraphViewer& g return supported; } -MatMul::MatMul(const OpKernelInfo& info) : XnnpackKernel(info) {} +MatMul::MatMul(const OpKernelInfo& info) : XnnpackKernel(info, /*enable_caches*/ true) {} Status MatMul::PrePack(const Tensor& tensor, int input_idx, AllocatorPtr alloc, /*out*/ bool& is_packed, @@ -99,9 +99,11 @@ Status MatMul::PrePack(const Tensor& tensor, int input_idx, AllocatorPtr alloc, output_max, flags, #ifdef XNN_CACHE_ENABLE - &xnn_caches_, + GetCodeCache(), + GetWeightsCache(), #else - 0, + nullptr, + nullptr, #endif &p); @@ -116,7 +118,7 @@ Status MatMul::PrePack(const Tensor& tensor, int input_idx, AllocatorPtr alloc, Status MatMul::Compute(OpKernelContext* ctx) const { const Tensor* a = ctx->Input(0); - pthreadpool_t t_pool = GetThreadPool(); + pthreadpool_t threadpool = GetThreadPool(); MatMulComputeHelper helper; ORT_RETURN_IF_ERROR(helper.Compute(a->Shape(), b_shape_)); Tensor* y = ctx->Output(0, helper.OutputShape()); @@ -126,13 +128,12 @@ Status MatMul::Compute(OpKernelContext* ctx) const { auto* y_data = y->MutableData(); - xnn_status status = xnn_setup_fully_connected_nc_f32( - op0_.get(), - a->Shape()[0], - a->Data(), - y_data, - t_pool); + xnn_status status = xnn_reshape_fully_connected_nc_f32(op0_.get(), a->Shape()[0], threadpool); + if (status != xnn_status_success) { + return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "xnn_reshape_fully_connected_nc_f32 returned ", status); + } + status = xnn_setup_fully_connected_nc_f32(op0_.get(), a->Data(), y_data); if (status != xnn_status_success) { return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "xnn_setup_fully_connected_nc_f32 returned ", status); } @@ -144,7 +145,11 @@ 
Status MatMul::Compute(OpKernelContext* ctx) const { return Status::OK(); } -ONNX_OPERATOR_VERSIONED_KERNEL_EX(MatMul, kOnnxDomain, 1, 12, kXnnpackExecutionProvider, +ONNX_OPERATOR_VERSIONED_KERNEL_EX(MatMul, kOnnxDomain, 1, 8, kXnnpackExecutionProvider, + KernelDefBuilder().TypeConstraint("T", DataTypeImpl::GetTensorType()), + MatMul); + +ONNX_OPERATOR_VERSIONED_KERNEL_EX(MatMul, kOnnxDomain, 9, 12, kXnnpackExecutionProvider, KernelDefBuilder().TypeConstraint("T", DataTypeImpl::GetTensorType()), MatMul); diff --git a/onnxruntime/core/providers/xnnpack/math/matmul.h b/onnxruntime/core/providers/xnnpack/math/matmul.h index f4ed92c6146fb..b76e42c4d3729 100644 --- a/onnxruntime/core/providers/xnnpack/math/matmul.h +++ b/onnxruntime/core/providers/xnnpack/math/matmul.h @@ -32,14 +32,6 @@ class MatMul : public XnnpackKernel { AllocatorPtr myAlloc; XnnpackOperator op0_ = nullptr; - -#ifdef XNN_CACHE_ENABLE -#if XNN_PLATFORM_JIT - xnn_code_cache code_cache_; -#endif - xnn_caches xnn_caches_ = {0, 0}; - xnn_weights_cache weights_cache_; -#endif }; } // namespace xnnpack diff --git a/onnxruntime/core/providers/xnnpack/nn/softmax.cc b/onnxruntime/core/providers/xnnpack/math/softmax.cc similarity index 80% rename from onnxruntime/core/providers/xnnpack/nn/softmax.cc rename to onnxruntime/core/providers/xnnpack/math/softmax.cc index bca84317ad891..87440b7814176 100644 --- a/onnxruntime/core/providers/xnnpack/nn/softmax.cc +++ b/onnxruntime/core/providers/xnnpack/math/softmax.cc @@ -1,7 +1,7 @@ // Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. -#include "core/providers/xnnpack/nn/softmax.h" +#include "core/providers/xnnpack/math/softmax.h" #include @@ -25,6 +25,7 @@ bool IsQuantSoftmaxSupported(const NodeUnit& node_unit, const GraphViewer& graph output_type != TensorTypeUint8) { break; } + // to ensure its output scale and zp are 1/256 and 0, otherwise xnnpack EP has to do extra requantization // idealy, QlinearSoftmax or QDQSoftmax will keep this output scale and zp, but we have to handle some // qdq models converted from other framework @@ -33,6 +34,7 @@ bool IsQuantSoftmaxSupported(const NodeUnit& node_unit, const GraphViewer& graph if (fabs(q_scale.DataAsSpan()[0] - 1.0f / 256.0f) > 0.0001f) { break; } + if (zero_tensor) { Initializer q_zp(*zero_tensor, node_unit.ModelPath()); if (q_zp.DataAsSpan()[0] != 0) { @@ -57,6 +59,7 @@ bool Softmax::IsOnnxNodeSupported(const NodeUnit& node_unit, IsQuantSoftmaxSupported(node_unit, graph) == false) { return false; } + // use do {} while(false) so it's easier to set a breakpoint on the return do { // SoftMax has 1 input. 
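The Gemm and MatMul changes above, and the Softmax changes that follow, all move to the newer XNNPACK calling sequence in which the old single setup call is split into a reshape call (which takes the row count and thread pool) and a setup call (which only binds the input/output pointers) before running the operator. A condensed sketch of that flow using the same f32 fully-connected calls that appear in this diff (error handling trimmed; assumes the operator was created earlier with xnn_create_fully_connected_nc_f32):

#include <xnnpack.h>
#include <pthreadpool.h>

// reshape -> setup -> run, mirroring Gemm::Compute / MatMul::Compute in this PR.
xnn_status RunFullyConnectedF32(xnn_operator_t op, size_t rows,
                                const float* input, float* output,
                                pthreadpool_t threadpool) {
  xnn_status status = xnn_reshape_fully_connected_nc_f32(op, rows, threadpool);
  if (status != xnn_status_success) return status;  // sizes/threading decided here
  status = xnn_setup_fully_connected_nc_f32(op, input, output);  // bind pointers only
  if (status != xnn_status_success) return status;
  return xnn_run_operator(op, threadpool);
}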
@@ -133,6 +136,7 @@ Softmax::Softmax(const OpKernelInfo& info) : XnnpackKernel{info} { ORT_ENFORCE(status.IsOK(), "opset must be existed in attributes of QlinearSoftmax"); opset_ = gsl::narrow_cast(opset); } + int64_t axis = -1; Status status = info.GetAttr("axis", &axis); // our op checker function has ensured that axis must be the last dim @@ -162,23 +166,22 @@ Softmax::Softmax(const OpKernelInfo& info) : XnnpackKernel{info} { if (op_type_ == OpComputeType::op_compute_type_qu8) { // the order of input tensor, x,x_scale, x_zp, y_scale, y_zp OpQuantParam quant_param = ParseQuantParamForOp(info, x_dtype, 1); - xstatus = xnn_create_softmax_nc_qu8( - channels, - channels, - channels, - quant_param[0].first[0], // x_scale - quant_param[1].second, // y_zp - quant_param[1].first[0], // y_scale - 0, // flags, - &p); + xstatus = xnn_create_softmax_nc_qu8(channels, + channels, + channels, + quant_param[0].first[0], // x_scale + quant_param[1].second, // y_zp + quant_param[1].first[0], // y_scale + 0, // flags, + &p); } else if (op_type_ == OpComputeType::op_compute_type_fp32) { - xstatus = xnn_create_softmax_nc_f32( - channels, - channels, - channels, - 0, // flags, - &p); + xstatus = xnn_create_softmax_nc_f32(channels, + channels, + channels, + 0, // flags, + &p); } + ORT_ENFORCE(xstatus == xnn_status_success, "xnn_create_softmax_nc_", OpTypeToString(op_type_), " failed. Status:", xstatus); op0_.reset(p); @@ -194,39 +197,48 @@ Status Softmax::Compute(OpKernelContext* ctx) const { if (X_shape.Size() == 0) { return Status::OK(); } - pthreadpool_t t_pool = GetThreadPool(); + + pthreadpool_t threadpool = GetThreadPool(); const size_t N = X_shape.SizeToDimension(axis_); // const size_t D = X_shape.SizeFromDimension(axis_); // the step D is 1 xnn_status status = xnn_status_invalid_state; + + auto reshape_fn = op_type_ == OpComputeType::op_compute_type_qu8 ? 
xnn_reshape_softmax_nc_qu8 + : xnn_reshape_softmax_nc_f32; + status = reshape_fn(op0_.get(), N, threadpool); + + if (status != xnn_status_success) { + return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "xnn_reshape_softmax_nc_", OpTypeToString(op_type_), + " returned ", status); + } + if (op_type_ == OpComputeType::op_compute_type_qu8) { - status = xnn_setup_softmax_nc_qu8( - op0_.get(), - N, - X->Data(), - Y->MutableData(), - t_pool); + status = xnn_setup_softmax_nc_qu8(op0_.get(), X->Data(), Y->MutableData()); } else { - status = xnn_setup_softmax_nc_f32( - op0_.get(), - N, - X->Data(), - Y->MutableData(), - t_pool); + status = xnn_setup_softmax_nc_f32(op0_.get(), X->Data(), Y->MutableData()); } + if (status != xnn_status_success) { - return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "xnn_setup_softmax_nc_", - OpTypeToString(op_type_), " returned ", status); + return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "xnn_setup_softmax_nc_", OpTypeToString(op_type_), + " returned ", status); } - status = xnn_run_operator(op0_.get(), t_pool); + + status = xnn_run_operator(op0_.get(), threadpool); if (status != xnn_status_success) { return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "xnn_run_operator returned ", status); } + return Status::OK(); } -ONNX_OPERATOR_VERSIONED_KERNEL_EX(Softmax, kOnnxDomain, 1, 12, kXnnpackExecutionProvider, +ONNX_OPERATOR_VERSIONED_KERNEL_EX(Softmax, kOnnxDomain, 1, 10, kXnnpackExecutionProvider, KernelDefBuilder().TypeConstraint("T", DataTypeImpl::GetTensorType()), Softmax); + +ONNX_OPERATOR_VERSIONED_KERNEL_EX(Softmax, kOnnxDomain, 11, 12, kXnnpackExecutionProvider, + KernelDefBuilder().TypeConstraint("T", DataTypeImpl::GetTensorType()), + Softmax); + ONNX_OPERATOR_KERNEL_EX(Softmax, kOnnxDomain, 13, kXnnpackExecutionProvider, KernelDefBuilder().TypeConstraint("T", DataTypeImpl::GetTensorType()), Softmax); diff --git a/onnxruntime/core/providers/xnnpack/nn/softmax.h b/onnxruntime/core/providers/xnnpack/math/softmax.h similarity index 100% rename from onnxruntime/core/providers/xnnpack/nn/softmax.h rename to onnxruntime/core/providers/xnnpack/math/softmax.h diff --git a/onnxruntime/core/providers/xnnpack/nn/average_pool.cc b/onnxruntime/core/providers/xnnpack/nn/average_pool.cc index 767218fbfd20b..58c209a13cd0c 100644 --- a/onnxruntime/core/providers/xnnpack/nn/average_pool.cc +++ b/onnxruntime/core/providers/xnnpack/nn/average_pool.cc @@ -2,10 +2,13 @@ // Licensed under the MIT License. #include "core/providers/xnnpack/nn/average_pool.h" +#include + #include "core/common/status.h" #include "core/graph/graph.h" #include "core/providers/utils.h" #include "core/framework/tensorprotoutils.h" +#include "core/providers/xnnpack/xnnpack_init.h" #include "core/providers/xnnpack/detail/utils.h" namespace onnxruntime { @@ -90,6 +93,10 @@ bool AveragePool::IsOnnxNodeSupported(const NodeUnit& node_unit, const auto& inputs = node_unit.Inputs(); // use do {} while(false) so it's easier to set a breakpoint on the return do { + if (node_unit.SinceVersion() < 7) { + break; + } + // AveragePool has 1 input. 
const auto& x_arg = inputs[0].node_arg; @@ -141,6 +148,11 @@ bool AveragePool::IsOnnxNodeSupported(const NodeUnit& node_unit, break; } + // need dilations to all be 1 + if (!pool_attrs.default_dilations) { + break; + } + supported = true; } while (false); @@ -221,24 +233,47 @@ Status AveragePool::Compute(OpKernelContext* context) const { return Status::OK(); } - pthreadpool_t t_pool = GetThreadPool(); - xnn_status status = xnn_status_invalid_state; + pthreadpool_t threadpool = GetThreadPool(); + + // setup allocator/automated dellocate for workspace + size_t workspace_size = 0; + size_t workspace_alignment = 0; + xnn_allocator* allocator = GetStoredAllocator().second; + auto deallocator = [allocator](void* ptr) { allocator->aligned_deallocate(allocator->context, ptr); }; + + std::unique_ptr workspace(nullptr, deallocator); + + auto reshape_fn = (avgpool_type_ == OpComputeType::op_compute_type_fp32) + ? xnn_reshape_average_pooling2d_nhwc_f32 + : xnn_reshape_average_pooling2d_nhwc_qu8; + + auto status = reshape_fn(op0_.get(), N, H, W, + &workspace_size, &workspace_alignment, + /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, + threadpool); + + if (status != xnn_status_success) { + return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "xnn_reshape_average_pooling2d_nhwc_", OpTypeToString(avgpool_type_), + " returned ", status); + } + + workspace.reset(allocator->aligned_allocate(allocator->context, XNN_ALLOCATION_ALIGNMENT, workspace_size)); + if (avgpool_type_ == OpComputeType::op_compute_type_fp32) { - status = xnn_setup_average_pooling2d_nhwc_f32(op0_.get(), N, H, W, - X.Data(), Y.MutableData(), - t_pool /*threadpool */); + status = xnn_setup_average_pooling2d_nhwc_f32(op0_.get(), workspace.get(), + X.Data(), Y.MutableData()); + } else if (avgpool_type_ == OpComputeType::op_compute_type_qu8) { - status = xnn_setup_average_pooling2d_nhwc_qu8(op0_.get(), N, H, W, - X.Data(), Y.MutableData(), - t_pool /*threadpool */); + status = xnn_setup_average_pooling2d_nhwc_qu8(op0_.get(), workspace.get(), + X.Data(), Y.MutableData()); } if (status != xnn_status_success) { - return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "xnn_setup_average_pooling2d_nhwc_", - OpTypeToString(avgpool_type_), " returned ", status); + return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "xnn_setup_average_pooling2d_nhwc_", OpTypeToString(avgpool_type_), + " returned ", status); } - status = xnn_run_operator(op0_.get(), t_pool); + status = xnn_run_operator(op0_.get(), threadpool); if (status != xnn_status_success) { return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "xnn_run_operator returned ", status); } @@ -246,8 +281,26 @@ Status AveragePool::Compute(OpKernelContext* context) const { return Status::OK(); } +ONNX_OPERATOR_VERSIONED_KERNEL_EX( + AveragePool, kMSInternalNHWCDomain, 7, 9, + kXnnpackExecutionProvider, + KernelDefBuilder().TypeConstraint("T", DataTypeImpl::GetTensorType()), + AveragePool); + +ONNX_OPERATOR_VERSIONED_KERNEL_EX( + AveragePool, kMSInternalNHWCDomain, 10, 10, + kXnnpackExecutionProvider, + KernelDefBuilder().TypeConstraint("T", DataTypeImpl::GetTensorType()), + AveragePool); + +ONNX_OPERATOR_VERSIONED_KERNEL_EX( + AveragePool, kMSInternalNHWCDomain, 11, 18, + kXnnpackExecutionProvider, + KernelDefBuilder().TypeConstraint("T", DataTypeImpl::GetTensorType()), + AveragePool); + ONNX_OPERATOR_KERNEL_EX( - AveragePool, kMSInternalNHWCDomain, 11, + AveragePool, kMSInternalNHWCDomain, 19, kXnnpackExecutionProvider, KernelDefBuilder().TypeConstraint("T", DataTypeImpl::GetTensorType()), AveragePool); diff --git 
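The pooling, convolution and resize kernels additionally thread a scratch workspace through the new API: xnn_reshape_* reports the required workspace size and alignment, the kernel allocates it through the EP's stored XNNPACK allocator, and xnn_setup_* receives the workspace pointer in place of the N/H/W arguments. A sketch of that flow for the float AveragePool path shown above (the unique_ptr deleter spelling is an assumption; <memory> is required and error checks are omitted):

    size_t workspace_size = 0;
    size_t workspace_alignment = 0;
    xnn_allocator* allocator = GetStoredAllocator().second;
    auto deallocate = [allocator](void* ptr) { allocator->aligned_deallocate(allocator->context, ptr); };
    std::unique_ptr<void, decltype(deallocate)> workspace(nullptr, deallocate);

    // reshape computes output extents and reports the scratch memory needed for N x H x W
    xnn_status status = xnn_reshape_average_pooling2d_nhwc_f32(op0_.get(), N, H, W,
                                                               &workspace_size, &workspace_alignment,
                                                               /*output_height_out=*/nullptr,
                                                               /*output_width_out=*/nullptr,
                                                               threadpool);
    // allocate the workspace with the alignment defined in xnnpack_init.h and hand it to setup
    workspace.reset(allocator->aligned_allocate(allocator->context, XNN_ALLOCATION_ALIGNMENT, workspace_size));
    status = xnn_setup_average_pooling2d_nhwc_f32(op0_.get(), workspace.get(),
                                                  X.Data<float>(), Y.MutableData<float>());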
a/onnxruntime/core/providers/xnnpack/nn/conv.cc b/onnxruntime/core/providers/xnnpack/nn/conv.cc index 0772dec59e30e..0cdb9c840aa2d 100644 --- a/onnxruntime/core/providers/xnnpack/nn/conv.cc +++ b/onnxruntime/core/providers/xnnpack/nn/conv.cc @@ -3,12 +3,13 @@ #include "conv.h" +#include "core/common/gsl.h" #include "core/common/inlined_containers_fwd.h" +#include "core/framework/tensorprotoutils.h" #include "core/framework/transpose_helper.h" #include "core/providers/utils.h" +#include "core/providers/xnnpack/xnnpack_init.h" #include "core/providers/xnnpack/detail/utils.h" -#include "core/framework/tensorprotoutils.h" -#include "core/common/gsl.h" namespace onnxruntime { namespace xnnpack { @@ -64,21 +65,48 @@ Status Conv::Compute(OpKernelContext* context) const { if (Y->Shape().Size() == 0) { return Status::OK(); } - pthreadpool_t t_pool = GetThreadPool(); - xnn_status status = xnn_status_invalid_state; + pthreadpool_t threadpool = GetThreadPool(); + + // setup allocator/automated dellocate for workspace + size_t workspace_size = 0; + size_t workspace_alignment = 0; + xnn_allocator* allocator = GetStoredAllocator().second; + auto deallocator = [allocator](void* ptr) { allocator->aligned_deallocate(allocator->context, ptr); }; + std::unique_ptr workspace(nullptr, deallocator); + + auto reshape_fn = xnn_reshape_convolution2d_nhwc_f32; + if (conv_type_ == OpComputeType::op_compute_type_qs8) { + reshape_fn = xnn_reshape_convolution2d_nhwc_qs8; + } else if (conv_type_ == OpComputeType::op_compute_type_qu8) { + reshape_fn = xnn_reshape_convolution2d_nhwc_qu8; + } else if (conv_type_ == OpComputeType::op_compute_type_qs8_per_channel) { + reshape_fn = xnn_reshape_convolution2d_nhwc_qs8_qc8w; + } + + auto status = reshape_fn(op0_.get(), N, H, W, + &workspace_size, &workspace_alignment, + /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, + threadpool); + if (status != xnn_status_success) { + return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "xnn_reshape_convolution2d_nhwc_", OpTypeToString(conv_type_), + "returned ", status); + } + + workspace.reset(allocator->aligned_allocate(allocator->context, XNN_ALLOCATION_ALIGNMENT, workspace_size)); + if (conv_type_ == OpComputeType::op_compute_type_fp32) { - status = xnn_setup_convolution2d_nhwc_f32(op0_.get(), N, H, W, X.Data(), Y->MutableData(), - t_pool /*threadpool*/); + status = xnn_setup_convolution2d_nhwc_f32(op0_.get(), workspace.get(), X.Data(), + Y->MutableData()); } else if (conv_type_ == OpComputeType::op_compute_type_qs8) { - status = xnn_setup_convolution2d_nhwc_qs8(op0_.get(), N, H, W, X.Data(), Y->MutableData(), - t_pool /*threadpool*/); + status = xnn_setup_convolution2d_nhwc_qs8(op0_.get(), workspace.get(), X.Data(), + Y->MutableData()); } else if (conv_type_ == OpComputeType::op_compute_type_qu8) { - status = xnn_setup_convolution2d_nhwc_qu8(op0_.get(), N, H, W, X.Data(), Y->MutableData(), - t_pool /*threadpool*/); + status = xnn_setup_convolution2d_nhwc_qu8(op0_.get(), workspace.get(), X.Data(), + Y->MutableData()); } else if (conv_type_ == OpComputeType::op_compute_type_qs8_per_channel) { - status = xnn_setup_convolution2d_nhwc_qc8(op0_.get(), N, H, W, X.Data(), Y->MutableData(), - t_pool /*threadpool*/); + status = xnn_setup_convolution2d_nhwc_qs8_qc8w(op0_.get(), workspace.get(), X.Data(), + Y->MutableData()); } if (status != xnn_status_success) { @@ -86,7 +114,7 @@ Status Conv::Compute(OpKernelContext* context) const { OpTypeToString(conv_type_), "returned ", status); } - status = xnn_run_operator(op0_.get(), t_pool); + status = 
xnn_run_operator(op0_.get(), threadpool); if (status != xnn_status_success) { return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "xnn_run_operator returned ", status); } @@ -94,6 +122,10 @@ Status Conv::Compute(OpKernelContext* context) const { return Status::OK(); } +ONNX_OPERATOR_VERSIONED_KERNEL_EX(Conv, kMSInternalNHWCDomain, 1, 10, kXnnpackExecutionProvider, + KernelDefBuilder().TypeConstraint("T", DataTypeImpl::GetTensorType()), + Conv); + ONNX_OPERATOR_KERNEL_EX(Conv, kMSInternalNHWCDomain, 11, kXnnpackExecutionProvider, KernelDefBuilder().TypeConstraint("T", DataTypeImpl::GetTensorType()), Conv); diff --git a/onnxruntime/core/providers/xnnpack/nn/conv_base.cc b/onnxruntime/core/providers/xnnpack/nn/conv_base.cc index b692f373ff4ce..d21014569234e 100644 --- a/onnxruntime/core/providers/xnnpack/nn/conv_base.cc +++ b/onnxruntime/core/providers/xnnpack/nn/conv_base.cc @@ -23,7 +23,8 @@ Status CreateXnnpackKernel(const ConvAttributes* conv_attrs_ptr, const std::optional>& clip_min_max, const Tensor& Weight, const Tensor* Bias, XnnpackOperator& op_uptr, - xnn_caches_t caches_t, + xnn_code_cache_t code_cache, + xnn_weights_cache_t weights_cache, const OpQuantParam& quant_param, OpComputeType conv_type, bool is_transpose = false) { @@ -75,7 +76,7 @@ Status CreateXnnpackKernel(const ConvAttributes* conv_attrs_ptr, C, M, // input channel stride, output channel stride Weight.Data(), B_data, foutput_min, foutput_max, flags, - caches_t, + code_cache, weights_cache, &p); } else if (conv_type == OpComputeType::op_compute_type_qs8) { const float output_scale = quant_param[2].first[0]; @@ -99,7 +100,7 @@ Status CreateXnnpackKernel(const ConvAttributes* conv_attrs_ptr, quant_param[2].second, quant_param[2].first[0], output_min, output_max, flags, - caches_t, + code_cache, weights_cache, &p); } else if (conv_type == OpComputeType::op_compute_type_qs8_per_channel) { auto* B_data = Bias ? Bias->Data() : nullptr; @@ -107,7 +108,7 @@ Status CreateXnnpackKernel(const ConvAttributes* conv_attrs_ptr, const int8_t output_zero_point = quant_param[2].second; const int8_t output_min = xnn_u8s8_quantize(foutput_min, output_scale, output_zero_point); const int8_t output_max = xnn_u8s8_quantize(foutput_max, output_scale, output_zero_point); - status = xnn_create_convolution2d_nhwc_qc8( + status = xnn_create_convolution2d_nhwc_qs8_qc8w( input_padding_top, input_padding_right, input_padding_bottom, input_padding_left, kernel_height, kernel_width, subsampling_height, subsampling_width, @@ -123,7 +124,7 @@ Status CreateXnnpackKernel(const ConvAttributes* conv_attrs_ptr, quant_param[2].second, quant_param[2].first[0], output_min, output_max, flags, - caches_t, + code_cache, weights_cache, &p); } else if (conv_type == OpComputeType::op_compute_type_qu8) { const auto* B_data = Bias ? Bias->Data() : nullptr; @@ -148,15 +149,17 @@ Status CreateXnnpackKernel(const ConvAttributes* conv_attrs_ptr, quant_param[2].second, quant_param[2].first[0], output_min, output_max, flags, - caches_t, + code_cache, weights_cache, &p); } + if (status != xnn_status_success) { return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "Failed to create xnnpack kernel. xnn_create_", is_transpose ? 
"deconvolution2d" : "convolution2d", "_nhwc_", OpTypeToString(conv_type), " returned ", status); } + op_uptr.reset(p); return Status::OK(); } @@ -296,6 +299,11 @@ bool ConvBase::IsOnnxNodeSupported(const NodeUnit& node_unit, const GraphViewer& const onnxruntime::Node& node = node_unit.GetNode(); // use do {} while(false) so it's easier to set a breakpoint on the return do { + // Internal NHWC domain starts at opset 11 + if (node_unit.SinceVersion() < 11) { + break; + } + // Conv has at least 2 inputs. const auto& inputs = node_unit.Inputs(); const auto& x_arg = inputs[0].node_arg; @@ -367,7 +375,7 @@ bool ConvBase::IsOnnxNodeSupported(const NodeUnit& node_unit, const GraphViewer& } ConvBase::ConvBase(const OpKernelInfo& info, bool is_transpose) - : XnnpackKernel(info), + : XnnpackKernel(info, /*enable_caches*/ true), conv_attrs_(info), conv_transpose_attrs_(info), convbase_attrs_ref_(is_transpose ? conv_transpose_attrs_ : conv_attrs_), @@ -383,16 +391,7 @@ ConvBase::ConvBase(const OpKernelInfo& info, bool is_transpose) } } } - // xnnpack cache_code, unfortunately these definitions are only available in xnnpack/cache.h, -#ifdef XNN_CACHE_ENABLE -#if XNN_PLATFORM_JIT - xnn_init_code_cache(&code_cache_); - xnn_caches_.code_cache = &code_cache_; -#endif - // TODO(Jicwen) enable weight-cache and code-cache - xnn_init_weights_cache(&weights_cache_); - xnn_caches_.weights_cache = &weights_cache_; -#endif + const auto& node{Node()}; const auto& input_defs = node.InputDefs(); const NodeArg& X = *input_defs[0]; @@ -477,11 +476,7 @@ ConvBase::ConvBase(const OpKernelInfo& info, bool is_transpose) Status ConvBase::CreateKernel() { auto ret = CreateXnnpackKernel(&convbase_attrs_ref_, C_, M_, kernel_shape_, clip_min_max_, packed_w_, B_, op0_, -#ifdef XNN_CACHE_ENABLE - &xnn_caches_, -#else - 0, -#endif + GetCodeCache(), GetWeightsCache(), quant_param_, conv_type_, is_transpose_); return ret; } diff --git a/onnxruntime/core/providers/xnnpack/nn/conv_base.h b/onnxruntime/core/providers/xnnpack/nn/conv_base.h index d3501a56ea24c..53ad51378c6be 100644 --- a/onnxruntime/core/providers/xnnpack/nn/conv_base.h +++ b/onnxruntime/core/providers/xnnpack/nn/conv_base.h @@ -39,14 +39,6 @@ class ConvBase : public XnnpackKernel { std::optional> clip_min_max_; XnnpackOperator op0_ = nullptr; - // we can't have the definition here because we can't import xnnpack/cache.h -#ifdef XNN_CACHE_ENABLE -#if XNN_PLATFORM_JIT - xnn_code_cache code_cache_; -#endif - xnn_caches xnn_caches_ = {0, 0}; - xnn_weights_cache weights_cache_; -#endif OpQuantParam quant_param_; OpComputeType conv_type_ = OpComputeType::op_compute_type_invalid; }; diff --git a/onnxruntime/core/providers/xnnpack/nn/conv_transpose.cc b/onnxruntime/core/providers/xnnpack/nn/conv_transpose.cc index 61d8f7f488547..8698c0739509d 100644 --- a/onnxruntime/core/providers/xnnpack/nn/conv_transpose.cc +++ b/onnxruntime/core/providers/xnnpack/nn/conv_transpose.cc @@ -81,29 +81,34 @@ Status ConvTranspose::Compute(OpKernelContext* context) const { if (Y->Shape().Size() == 0) { return Status::OK(); } - pthreadpool_t t_pool = GetThreadPool(); + pthreadpool_t threadpool = GetThreadPool(); auto output_pad_0 = gsl::narrow_cast(conv_transpose_attrs_.output_padding[0]); auto output_pad_1 = gsl::narrow_cast(conv_transpose_attrs_.output_padding[1]); xnn_status status = xnn_status_invalid_state; + + auto reshape_fn = xnn_reshape_deconvolution2d_nhwc_f32; + if (conv_type_ == OpComputeType::op_compute_type_qs8) { + reshape_fn = xnn_reshape_deconvolution2d_nhwc_qs8; + } else if 
(conv_type_ == OpComputeType::op_compute_type_qu8) { + reshape_fn = xnn_reshape_deconvolution2d_nhwc_qu8; + } + + status = reshape_fn(op0_.get(), N, H, W, output_pad_0, output_pad_1, + /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, + threadpool); + + if (status != xnn_status_success) { + return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "xnn_reshape_deconvolution2d_nhwc_", + OpTypeToString(conv_type_), " returned ", status); + } + if (conv_type_ == OpComputeType::op_compute_type_fp32) { - status = xnn_setup_deconvolution2d_nhwc_f32( - op0_.get(), N, H, W, - output_pad_0, - output_pad_1, X.Data(), Y->MutableData(), - t_pool /*threadpool*/); + status = xnn_setup_deconvolution2d_nhwc_f32(op0_.get(), X.Data(), Y->MutableData()); } else if (conv_type_ == OpComputeType::op_compute_type_qs8) { - status = xnn_setup_deconvolution2d_nhwc_qs8( - op0_.get(), N, H, W, - output_pad_0, - output_pad_1, X.Data(), Y->MutableData(), - t_pool /*threadpool*/); + status = xnn_setup_deconvolution2d_nhwc_qs8(op0_.get(), X.Data(), Y->MutableData()); } else if (conv_type_ == OpComputeType::op_compute_type_qu8) { - status = xnn_setup_deconvolution2d_nhwc_qu8( - op0_.get(), N, H, W, - output_pad_0, - output_pad_1, X.Data(), Y->MutableData(), - t_pool /*threadpool*/); + status = xnn_setup_deconvolution2d_nhwc_qu8(op0_.get(), X.Data(), Y->MutableData()); } if (status != xnn_status_success) { @@ -111,7 +116,7 @@ Status ConvTranspose::Compute(OpKernelContext* context) const { OpTypeToString(conv_type_), " returned ", status); } - status = xnn_run_operator(op0_.get(), t_pool); + status = xnn_run_operator(op0_.get(), threadpool); if (status != xnn_status_success) { return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "xnn_run_operator returned ", status); } diff --git a/onnxruntime/core/providers/xnnpack/nn/max_pool.cc b/onnxruntime/core/providers/xnnpack/nn/max_pool.cc index de6dd68bba9c3..2ef9f97f77b14 100644 --- a/onnxruntime/core/providers/xnnpack/nn/max_pool.cc +++ b/onnxruntime/core/providers/xnnpack/nn/max_pool.cc @@ -41,6 +41,10 @@ bool MaxPool::IsOnnxNodeSupported(const NodeUnit& node_unit, const onnxruntime::Node& node = node_unit.GetNode(); // use do {} while(false) so it's easier to set a breakpoint on the return do { + if (node_unit.SinceVersion() < 8) { + break; + } + // MaxPool has 1 input. 
auto input_defs = node.InputDefs(); const auto& x_arg = *input_defs[0]; @@ -220,20 +224,29 @@ Status MaxPool::Compute(OpKernelContext* context) const { return Status::OK(); } - pthreadpool_t t_pool = GetThreadPool(); - xnn_status status = xnn_status_invalid_state; + pthreadpool_t threadpool = GetThreadPool(); + + auto reshape_fn = xnn_reshape_max_pooling2d_nhwc_f32; + if (maxpool_type_ == OpComputeType::op_compute_type_qu8) + reshape_fn = xnn_reshape_max_pooling2d_nhwc_u8; + else if (maxpool_type_ == OpComputeType::op_compute_type_qs8) { + reshape_fn = xnn_reshape_max_pooling2d_nhwc_s8; + } + + auto status = reshape_fn(op0_.get(), N, H, W, + /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, + threadpool); + if (status != xnn_status_success) { + return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "xnn_reshape_max_pooling2d_nhwc_", + OpTypeToString(maxpool_type_), " returned ", status); + } + if (maxpool_type_ == OpComputeType::op_compute_type_fp32) { - status = xnn_setup_max_pooling2d_nhwc_f32(op0_.get(), N, H, W, - X.Data(), Y->MutableData(), - t_pool /*threadpool */); + status = xnn_setup_max_pooling2d_nhwc_f32(op0_.get(), X.Data(), Y->MutableData()); } else if (maxpool_type_ == OpComputeType::op_compute_type_qu8) { - status = xnn_setup_max_pooling2d_nhwc_u8(op0_.get(), N, H, W, - X.Data(), Y->MutableData(), - t_pool /*threadpool */); + status = xnn_setup_max_pooling2d_nhwc_u8(op0_.get(), X.Data(), Y->MutableData()); } else { - status = xnn_setup_max_pooling2d_nhwc_s8(op0_.get(), N, H, W, - X.Data(), Y->MutableData(), - t_pool /*threadpool */); + status = xnn_setup_max_pooling2d_nhwc_s8(op0_.get(), X.Data(), Y->MutableData()); } if (status != xnn_status_success) { @@ -241,7 +254,7 @@ Status MaxPool::Compute(OpKernelContext* context) const { OpTypeToString(maxpool_type_), " returned ", status); } - status = xnn_run_operator(op0_.get(), t_pool); + status = xnn_run_operator(op0_.get(), threadpool); if (status != xnn_status_success) { return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "xnn_run_operator returned ", status); } @@ -249,12 +262,24 @@ Status MaxPool::Compute(OpKernelContext* context) const { return Status::OK(); } -ONNX_OPERATOR_VERSIONED_KERNEL_EX( - MaxPool, kMSInternalNHWCDomain, 11, 11, kXnnpackExecutionProvider, - KernelDefBuilder().TypeConstraint("T", {DataTypeImpl::GetTensorType(), - DataTypeImpl::GetTensorType(), - DataTypeImpl::GetTensorType()}), - MaxPool); +ONNX_OPERATOR_VERSIONED_KERNEL_EX(MaxPool, kMSInternalNHWCDomain, 8, 9, kXnnpackExecutionProvider, + KernelDefBuilder().TypeConstraint("T", {DataTypeImpl::GetTensorType(), + DataTypeImpl::GetTensorType(), + DataTypeImpl::GetTensorType()}), + MaxPool); + +ONNX_OPERATOR_VERSIONED_KERNEL_EX(MaxPool, kMSInternalNHWCDomain, 10, 10, kXnnpackExecutionProvider, + KernelDefBuilder().TypeConstraint("T", {DataTypeImpl::GetTensorType(), + DataTypeImpl::GetTensorType(), + DataTypeImpl::GetTensorType()}), + MaxPool); + +ONNX_OPERATOR_VERSIONED_KERNEL_EX(MaxPool, kMSInternalNHWCDomain, 11, 11, kXnnpackExecutionProvider, + KernelDefBuilder().TypeConstraint("T", {DataTypeImpl::GetTensorType(), + DataTypeImpl::GetTensorType(), + DataTypeImpl::GetTensorType()}), + MaxPool); + ONNX_OPERATOR_KERNEL_EX(MaxPool, kMSInternalNHWCDomain, 12, kXnnpackExecutionProvider, KernelDefBuilder() .TypeConstraint("T", {DataTypeImpl::GetTensorType(), diff --git a/onnxruntime/core/providers/xnnpack/nn/resize.cc b/onnxruntime/core/providers/xnnpack/tensor/resize.cc similarity index 67% rename from onnxruntime/core/providers/xnnpack/nn/resize.cc rename to 
onnxruntime/core/providers/xnnpack/tensor/resize.cc index 76c6b6acbfe32..0c9e2e9fc17a2 100644 --- a/onnxruntime/core/providers/xnnpack/nn/resize.cc +++ b/onnxruntime/core/providers/xnnpack/tensor/resize.cc @@ -1,7 +1,7 @@ // Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. -#include "core/providers/xnnpack/nn/resize.h" +#include "core/providers/xnnpack/tensor/resize.h" #include #include @@ -10,6 +10,7 @@ #include "core/common/inlined_containers_fwd.h" #include "core/framework/op_kernel.h" #include "core/optimizer/initializer.h" +#include "core/providers/xnnpack/xnnpack_init.h" namespace onnxruntime { namespace xnnpack { @@ -18,26 +19,67 @@ bool Resize::IsOnnxNodeSupported(const NodeUnit& node_unit, const GraphViewer& graph_viewer) { bool supported = false; do { + if (node_unit.SinceVersion() < 10) { + break; + } + // Resize has 1-4 input. const auto& inputs = node_unit.Inputs(); const auto& x_arg = inputs[0].node_arg; const auto* x_type = x_arg.TypeAsProto(); - if (x_type == nullptr || - (x_type->tensor_type().elem_type() != ONNX_NAMESPACE::TensorProto_DataType_FLOAT && - x_type->tensor_type().elem_type() != ONNX_NAMESPACE::TensorProto_DataType_UINT8 && - x_type->tensor_type().elem_type() != ONNX_NAMESPACE::TensorProto_DataType_INT8)) { + if (x_type == nullptr || (x_type->tensor_type().elem_type() != ONNX_NAMESPACE::TensorProto_DataType_FLOAT && + x_type->tensor_type().elem_type() != ONNX_NAMESPACE::TensorProto_DataType_UINT8 && + x_type->tensor_type().elem_type() != ONNX_NAMESPACE::TensorProto_DataType_INT8)) { break; } const auto* x_shape = x_arg.Shape(); - //'bilinear' == 2-D input or 4-D input with outermost 2 scales as 1 (NCHW) or - // 4-D input with outermost and innermost scales as 1 (NHWC) - // but we just support 4-d tensor for now, and the channel must be known. + + // 'bilinear' == 2-D input or 4-D input with outermost 2 scales as 1 (NCHW) can be supported. + // we only support 4-d tensor for now, and the channel must be known. + // we assume the input in NCHW for this test. if (!x_shape || x_shape->dim_size() != 4 || x_shape->dim(1).dim_value() <= 0) { break; } + // validate it is in fact NCHW + // + // opset 10 had `scales` as input 1 and no sizes. later opsets added roi as input 1 followed by scales and sizes. + auto opset_version = node_unit.SinceVersion(); + size_t scale_idx = opset_version == 10 ? 1 : 2; + size_t size_idx = 3; + + // onnx shape inferencing validates that one and not both of sizes and scales are provided + const auto* scale_tensor = inputs.size() >= scale_idx + 1 + ? graph_viewer.GetConstantInitializer(inputs[scale_idx].node_arg.Name(), true) + : nullptr; + const auto* size_tensor = opset_version > 10 && inputs.size() >= size_idx + 1 + ? graph_viewer.GetConstantInitializer(inputs[size_idx].node_arg.Name(), true) + : nullptr; + + // if both scales and sizes are nullptr the one that was provided was not a constant initializer + if (!scale_tensor && !size_tensor) { + break; + } + + // check the scale for the second dim is 1 or the size of the second dim matches the input shape. + // if not, it is not the C dim as a Resize will not change the number of channels. 
+ InlinedVector scale(4, 1.0F); + if (scale_tensor) { + const Initializer scale_val(*scale_tensor, node_unit.ModelPath()); + if (scale_val.DataAsSpan()[1] != 1.0F) { + break; + } + } + + if (size_tensor) { + const Initializer size_val(*size_tensor, node_unit.ModelPath()); + if (size_val.DataAsSpan()[1] != x_shape->dim(1).dim_value()) { + break; + } + } + const auto* output_shape = node_unit.Outputs()[0].node_arg.Shape(); bool length_resized_compatible_pytorch_half_pixel = true; // when length_resized > 1, there is no difference between pytorch_half_pixel and half_pixel @@ -48,18 +90,11 @@ bool Resize::IsOnnxNodeSupported(const NodeUnit& node_unit, // if coordinate_transformation_mode is "pytorch_half_pixel", // x_original = length_resized > 1 ? (x_resized + 0.5) / scale - 0.5 : 0 // - if (output_shape->dim(2).dim_value() <= 1 || output_shape->dim(1).dim_value() <= 1) { + if (output_shape->dim(2).dim_value() <= 1 || output_shape->dim(3).dim_value() <= 1) { + // we don't know the output H or W so we don't know if it will be compatible length_resized_compatible_pytorch_half_pixel = false; } - // Refer to onnxruntime/core/providers/cpu/tensor/upsamplebase.h, - size_t scale_idx = 2; - size_t size_idx = 3; - auto opset_version = node_unit.SinceVersion(); - if (opset_version == 10) { - scale_idx = 1; - } - ProtoHelperNodeContext nc(node_unit.GetNode()); OpNodeProtoHelper info(&nc); @@ -78,6 +113,7 @@ bool Resize::IsOnnxNodeSupported(const NodeUnit& node_unit, std::vector axes; if (info.GetAttrs("axes", axes).IsOK() && axes.size() > 0) { + // TODO: We should be able to handle this if required break; } @@ -95,9 +131,10 @@ bool Resize::IsOnnxNodeSupported(const NodeUnit& node_unit, // Coordinate transformation mode attr was introduced in version 11. // before that asymmetric mode was the only available transformation mode std::string coordinate_transform_mode_name = - opset_version > 10 - ? info.GetAttrOrDefault("coordinate_transformation_mode", "half_pixel") - : "asymmetric"; + opset_version > 10 ? info.GetAttrOrDefault("coordinate_transformation_mode", "half_pixel") + : "asymmetric"; + + // TODO: Opset 19 added half_pixel_symmetric. Need to see if that can be supported. if (coordinate_transform_mode_name != "asymmetric" && coordinate_transform_mode_name != "half_pixel" && @@ -106,59 +143,7 @@ bool Resize::IsOnnxNodeSupported(const NodeUnit& node_unit, break; } - auto exclude_outside = info.GetAttrOrDefault("exclude_outside", 0) == 0 ? false : true; - if (exclude_outside) { - break; - } - - // roi only takes effect when coordinate_transformation_mode is "tf_crop_and_resize" - - // size or scales shouldnt't be provided in the same time but should at least be provided one of them - const auto* scale_tensor = inputs.size() >= scale_idx + 1 - ? graph_viewer.GetConstantInitializer(inputs[scale_idx].node_arg.Name(), true) - : nullptr; - const auto* size_tensor = inputs.size() >= size_idx + 1 - ? 
graph_viewer.GetConstantInitializer(inputs[size_idx].node_arg.Name(), true) - : nullptr; - - bool has_size = false; - bool has_scale = false; - InlinedVector scale(4, 1.0F); - if (scale_tensor) { - const Initializer scale_val(*scale_tensor, node_unit.ModelPath()); - auto scale_span = scale_val.DataAsSpan(); - if (scale_span.size() == 4) { - has_scale = true; - std::copy(scale_span.begin(), scale_span.end(), scale.begin()); - } - } - - if (size_tensor) { - auto input_shape = utils::GetTensorShapeFromTensorShapeProto(*x_shape); - const Initializer size_val(*size_tensor, node_unit.ModelPath()); - - auto size_span = size_val.DataAsSpan(); - if (size_span.size() == 4) { - has_size = true; - scale = {size_span[0] / static_cast(input_shape[0]), - size_span[1] / static_cast(input_shape[1]), - size_span[2] / static_cast(input_shape[2]), - size_span[3] / static_cast(input_shape[3])}; - } - } - - if ((has_size && has_scale) || (!has_size && !has_scale)) { - break; - } - - if (scale[0] != 1.0F || (scale[1] != 1.0F && scale[3] != 1.0F)) { - break; - } - - // only support xnn_create_resize_bilinear2d_nchw_f32 - const bool is_NHWC = scale[3] == 1.0F; - if (!is_NHWC && (x_type->tensor_type().elem_type() == ONNX_NAMESPACE::TensorProto_DataType_UINT8 || - x_type->tensor_type().elem_type() == ONNX_NAMESPACE::TensorProto_DataType_INT8)) { + if (info.GetAttrOrDefault("exclude_outside", 0) != 0) { break; } @@ -210,8 +195,7 @@ Resize::Resize(const OpKernelInfo& info) : UpsampleBase(info), XnnpackKernel{inf } } - is_NHWC_ = scales_[3] == 1.0F; - int64_t channels = x_shape->dim(is_NHWC_ ? 3 : 1).dim_value(); + int64_t channels = x_shape->dim(3).dim_value(); uint32_t flags = 0; ORT_ENFORCE(mode_ == UpsampleMode::LINEAR, "only support bilinear resize"); @@ -225,18 +209,16 @@ Resize::Resize(const OpKernelInfo& info) : UpsampleBase(info), XnnpackKernel{inf xnn_status xstatus = xnn_status_invalid_state; struct xnn_operator* p = nullptr; if (op_type_ == OpComputeType::op_compute_type_fp32) { - auto create_func = is_NHWC_ ? xnn_create_resize_bilinear2d_nhwc_f32 : xnn_create_resize_bilinear2d_nchw_f32; - xstatus = create_func( - channels, channels, channels, flags, &p); + xstatus = xnn_create_resize_bilinear2d_nhwc_f32(channels, channels, channels, flags, &p); } else if (op_type_ == OpComputeType::op_compute_type_qu8) { - xstatus = xnn_create_resize_bilinear2d_nhwc_u8( - channels, channels, channels, flags, &p); + xstatus = xnn_create_resize_bilinear2d_nhwc_u8(channels, channels, channels, flags, &p); } else { - xstatus = xnn_create_resize_bilinear2d_nhwc_s8( - channels, channels, channels, flags, &p); + xstatus = xnn_create_resize_bilinear2d_nhwc_s8(channels, channels, channels, flags, &p); } - ORT_ENFORCE(xstatus == xnn_status_success, "xnn_create_resize_bilinear2d_nhwc_", - OpTypeToString(op_type_), " failed. Status:", xstatus); + + ORT_ENFORCE(xstatus == xnn_status_success, "xnn_create_resize_bilinear2d_nhwc_", OpTypeToString(op_type_), " failed. Status:", + xstatus); + op0_.reset(p); } @@ -245,48 +227,56 @@ Status Resize::ComputeInternal(OpKernelContext* ctx, const Tensor* input, const TensorShapeVector& output_dims) const { const auto& X_shape = input->Shape(); auto N = X_shape[0]; - auto H = is_NHWC_ ? X_shape[1] : X_shape[2]; - auto W = is_NHWC_ ? 
X_shape[2] : X_shape[3]; + auto H = X_shape[1]; + auto W = X_shape[2]; Tensor* output = ctx->Output(0, TensorShape(output_dims)); - pthreadpool_t t_pool = GetThreadPool(); - xnn_status status = xnn_status_invalid_state; + pthreadpool_t threadpool = GetThreadPool(); + + // setup allocator/automated dellocate for workspace + size_t workspace_size = 0; + size_t workspace_alignment = 0; + xnn_allocator* allocator = GetStoredAllocator().second; + auto deallocator = [allocator](void* ptr) { allocator->aligned_deallocate(allocator->context, ptr); }; + std::unique_ptr workspace(nullptr, deallocator); + + auto reshape_fn = xnn_reshape_resize_bilinear2d_nhwc_f32; + if (op_type_ == OpComputeType::op_compute_type_qu8) { + reshape_fn = xnn_reshape_resize_bilinear2d_nhwc_u8; + } else if (op_type_ == OpComputeType::op_compute_type_qs8) { + reshape_fn = xnn_reshape_resize_bilinear2d_nhwc_s8; + } + + auto status = reshape_fn(op0_.get(), N, H, W, output_dims[1], output_dims[2], + &workspace_size, &workspace_alignment, threadpool); + if (status != xnn_status_success) { + return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "xnn_reshape_resize_bilinear2d_nhwc_", OpTypeToString(op_type_), + " returned ", status); + } + + workspace.reset(allocator->aligned_allocate(allocator->context, XNN_ALLOCATION_ALIGNMENT, workspace_size)); + if (op_type_ == OpComputeType::op_compute_type_fp32) { - auto oH = is_NHWC_ ? output_dims[1] : output_dims[2]; - auto oW = is_NHWC_ ? output_dims[2] : output_dims[3]; - auto setup_func = is_NHWC_ ? xnn_setup_resize_bilinear2d_nhwc_f32 : xnn_setup_resize_bilinear2d_nchw_f32; - status = setup_func( - op0_.get(), - N, - H, W, oH, oW, - input->Data(), - output->MutableData(), - t_pool); + status = xnn_setup_resize_bilinear2d_nhwc_f32(op0_.get(), workspace.get(), input->Data(), + output->MutableData()); } else if (op_type_ == OpComputeType::op_compute_type_qu8) { - status = xnn_setup_resize_bilinear2d_nhwc_u8( - op0_.get(), - N, - H, W, output_dims[1], output_dims[2], - input->Data(), - output->MutableData(), - t_pool); + status = xnn_setup_resize_bilinear2d_nhwc_u8(op0_.get(), workspace.get(), input->Data(), + output->MutableData()); } else { - status = xnn_setup_resize_bilinear2d_nhwc_s8( - op0_.get(), - N, - H, W, output_dims[1], output_dims[2], - input->Data(), - output->MutableData(), - t_pool); + status = xnn_setup_resize_bilinear2d_nhwc_s8(op0_.get(), workspace.get(), input->Data(), + output->MutableData()); } + if (status != xnn_status_success) { return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "xnn_setup_resize_bilinear2d_nhwc_", OpTypeToString(op_type_), " returned ", status); } - status = xnn_run_operator(op0_.get(), t_pool); + + status = xnn_run_operator(op0_.get(), threadpool); if (status != xnn_status_success) { return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "xnn_run_operator returned ", status); } + return Status::OK(); } @@ -315,29 +305,29 @@ Status Resize::Compute(OpKernelContext* ctx) const { return ComputeInternal(ctx, X, output_shape); } -ONNX_OPERATOR_VERSIONED_KERNEL_EX(Resize, kOnnxDomain, 10, 10, kXnnpackExecutionProvider, +ONNX_OPERATOR_VERSIONED_KERNEL_EX(Resize, kMSInternalNHWCDomain, 10, 10, kXnnpackExecutionProvider, KernelDefBuilder().TypeConstraint("T", {DataTypeImpl::GetTensorType(), DataTypeImpl::GetTensorType(), DataTypeImpl::GetTensorType()}), Resize); -ONNX_OPERATOR_VERSIONED_KERNEL_EX(Resize, kOnnxDomain, 11, 12, kXnnpackExecutionProvider, +ONNX_OPERATOR_VERSIONED_KERNEL_EX(Resize, kMSInternalNHWCDomain, 11, 12, kXnnpackExecutionProvider, 
KernelDefBuilder().TypeConstraint("T1", {DataTypeImpl::GetTensorType(), DataTypeImpl::GetTensorType(), DataTypeImpl::GetTensorType()}), Resize); -ONNX_OPERATOR_VERSIONED_KERNEL_EX(Resize, kOnnxDomain, 13, 17, kXnnpackExecutionProvider, +ONNX_OPERATOR_VERSIONED_KERNEL_EX(Resize, kMSInternalNHWCDomain, 13, 17, kXnnpackExecutionProvider, KernelDefBuilder().TypeConstraint("T1", {DataTypeImpl::GetTensorType(), DataTypeImpl::GetTensorType(), DataTypeImpl::GetTensorType()}), Resize); -ONNX_OPERATOR_VERSIONED_KERNEL_EX(Resize, kOnnxDomain, 18, 18, kXnnpackExecutionProvider, +ONNX_OPERATOR_VERSIONED_KERNEL_EX(Resize, kMSInternalNHWCDomain, 18, 18, kXnnpackExecutionProvider, KernelDefBuilder().TypeConstraint("T1", {DataTypeImpl::GetTensorType(), DataTypeImpl::GetTensorType(), DataTypeImpl::GetTensorType()}), Resize); -ONNX_OPERATOR_KERNEL_EX(Resize, kOnnxDomain, 19, kXnnpackExecutionProvider, +ONNX_OPERATOR_KERNEL_EX(Resize, kMSInternalNHWCDomain, 19, kXnnpackExecutionProvider, KernelDefBuilder().TypeConstraint("T1", {DataTypeImpl::GetTensorType(), DataTypeImpl::GetTensorType(), DataTypeImpl::GetTensorType()}), diff --git a/onnxruntime/core/providers/xnnpack/nn/resize.h b/onnxruntime/core/providers/xnnpack/tensor/resize.h similarity index 98% rename from onnxruntime/core/providers/xnnpack/nn/resize.h rename to onnxruntime/core/providers/xnnpack/tensor/resize.h index 4975510ee7db4..06ff1bdb61f59 100644 --- a/onnxruntime/core/providers/xnnpack/nn/resize.h +++ b/onnxruntime/core/providers/xnnpack/tensor/resize.h @@ -31,7 +31,6 @@ class Resize : public UpsampleBase, public XnnpackKernel { const TensorShapeVector& output_dims) const; private: - bool is_NHWC_; XnnpackOperator op0_; TensorShapeVector output_dims_; OpComputeType op_type_ = OpComputeType::op_compute_type_invalid; diff --git a/onnxruntime/core/providers/xnnpack/xnnpack_execution_provider.cc b/onnxruntime/core/providers/xnnpack/xnnpack_execution_provider.cc index 494c718cde081..a2a776df439e4 100644 --- a/onnxruntime/core/providers/xnnpack/xnnpack_execution_provider.cc +++ b/onnxruntime/core/providers/xnnpack/xnnpack_execution_provider.cc @@ -27,88 +27,117 @@ KernelCreateInfo BuildKernelCreateInfo() { return info; } -#define KERNEL_CREATE_INFO_VERSIONED(Start, End, Op) \ - BuildKernelCreateInfo< \ - ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kXnnpackExecutionProvider, kMSInternalNHWCDomain, Start, End, Op)> +#define KERNEL_CREATE_INFO_VERSIONED(Start, End, Op, Domain) \ + BuildKernelCreateInfo< \ + ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kXnnpackExecutionProvider, Domain, Start, End, Op)> -#define KERNEL_CREATE_INFO(Start, Op) \ - BuildKernelCreateInfo< \ - ONNX_OPERATOR_KERNEL_CLASS_NAME(kXnnpackExecutionProvider, kMSInternalNHWCDomain, Start, Op)> +#define KERNEL_CREATE_INFO(Start, Op, Domain) \ + BuildKernelCreateInfo< \ + ONNX_OPERATOR_KERNEL_CLASS_NAME(kXnnpackExecutionProvider, Domain, Start, Op)> -#define KERNEL_CREATE_INFO_TYPED(Start, type, Op) \ - BuildKernelCreateInfo< \ - ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kXnnpackExecutionProvider, kMSInternalNHWCDomain, Start, type, Op)> +#define KERNEL_CREATE_INFO_TYPED(Start, Type, Op, Domain) \ + BuildKernelCreateInfo< \ + ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kXnnpackExecutionProvider, Domain, Start, Type, Op)> +// Layout sensitive operators in NHWC domain +class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kXnnpackExecutionProvider, kMSInternalNHWCDomain, 7, 9, AveragePool); +class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kXnnpackExecutionProvider, kMSInternalNHWCDomain, 10, 10, 
AveragePool); +class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kXnnpackExecutionProvider, kMSInternalNHWCDomain, 11, 18, AveragePool); +class ONNX_OPERATOR_KERNEL_CLASS_NAME(kXnnpackExecutionProvider, kMSInternalNHWCDomain, 19, AveragePool); + +class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kXnnpackExecutionProvider, kMSInternalNHWCDomain, 1, 10, Conv); class ONNX_OPERATOR_KERNEL_CLASS_NAME(kXnnpackExecutionProvider, kMSInternalNHWCDomain, 11, Conv); -class ONNX_OPERATOR_KERNEL_CLASS_NAME(kXnnpackExecutionProvider, kMSInternalNHWCDomain, 11, ConvTranspose); + class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kXnnpackExecutionProvider, kMSInternalNHWCDomain, 1, 10, ConvTranspose); -class ONNX_OPERATOR_KERNEL_CLASS_NAME(kXnnpackExecutionProvider, kMSInternalNHWCDomain, 1, QLinearConvTranspose); -class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kXnnpackExecutionProvider, kOnnxDomain, 10, 10, Resize); -class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kXnnpackExecutionProvider, kOnnxDomain, 11, 12, Resize); -class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kXnnpackExecutionProvider, kOnnxDomain, 13, 17, Resize); -class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kXnnpackExecutionProvider, kOnnxDomain, 18, 18, Resize); -class ONNX_OPERATOR_KERNEL_CLASS_NAME(kXnnpackExecutionProvider, kOnnxDomain, 19, Resize); -class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kXnnpackExecutionProvider, kMSInternalNHWCDomain, 11, 11, MaxPool); -class ONNX_OPERATOR_KERNEL_CLASS_NAME(kXnnpackExecutionProvider, kMSInternalNHWCDomain, 12, MaxPool); -class ONNX_OPERATOR_KERNEL_CLASS_NAME(kXnnpackExecutionProvider, kMSInternalNHWCDomain, 11, AveragePool); -class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kXnnpackExecutionProvider, kOnnxDomain, 1, 12, Softmax); -class ONNX_OPERATOR_KERNEL_CLASS_NAME(kXnnpackExecutionProvider, kOnnxDomain, 13, Softmax); +class ONNX_OPERATOR_KERNEL_CLASS_NAME(kXnnpackExecutionProvider, kMSInternalNHWCDomain, 11, ConvTranspose); class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kXnnpackExecutionProvider, kMSInternalNHWCDomain, 10, uint8_t, QLinearConv); class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kXnnpackExecutionProvider, kMSInternalNHWCDomain, 10, int8_t, QLinearConv); + +class ONNX_OPERATOR_KERNEL_CLASS_NAME(kXnnpackExecutionProvider, kMSInternalNHWCDomain, 1, QLinearConvTranspose); + class ONNX_OPERATOR_KERNEL_CLASS_NAME(kXnnpackExecutionProvider, kMSInternalNHWCDomain, 1, QLinearAveragePool); -class ONNX_OPERATOR_KERNEL_CLASS_NAME(kXnnpackExecutionProvider, - kDynamicDomainByCreate, 1, QLinearSoftmax); -class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kXnnpackExecutionProvider, kOnnxDomain, 7, 12, Gemm); +class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kXnnpackExecutionProvider, kMSInternalNHWCDomain, 10, 10, Resize); +class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kXnnpackExecutionProvider, kMSInternalNHWCDomain, 11, 12, Resize); +class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kXnnpackExecutionProvider, kMSInternalNHWCDomain, 13, 17, Resize); +class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kXnnpackExecutionProvider, kMSInternalNHWCDomain, 18, 18, Resize); +class ONNX_OPERATOR_KERNEL_CLASS_NAME(kXnnpackExecutionProvider, kMSInternalNHWCDomain, 19, Resize); + +class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kXnnpackExecutionProvider, kMSInternalNHWCDomain, 8, 9, MaxPool); +class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kXnnpackExecutionProvider, kMSInternalNHWCDomain, 10, 10, MaxPool); +class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kXnnpackExecutionProvider, kMSInternalNHWCDomain, 11, 11, 
MaxPool); +class ONNX_OPERATOR_KERNEL_CLASS_NAME(kXnnpackExecutionProvider, kMSInternalNHWCDomain, 12, MaxPool); + +// ONNX operators +class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kXnnpackExecutionProvider, kOnnxDomain, 7, 8, Gemm); +class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kXnnpackExecutionProvider, kOnnxDomain, 9, 10, Gemm); +class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kXnnpackExecutionProvider, kOnnxDomain, 11, 12, Gemm); class ONNX_OPERATOR_KERNEL_CLASS_NAME(kXnnpackExecutionProvider, kOnnxDomain, 13, Gemm); -class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kXnnpackExecutionProvider, kOnnxDomain, 1, 12, MatMul); +class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kXnnpackExecutionProvider, kOnnxDomain, 1, 8, MatMul); +class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kXnnpackExecutionProvider, kOnnxDomain, 9, 12, MatMul); class ONNX_OPERATOR_KERNEL_CLASS_NAME(kXnnpackExecutionProvider, kOnnxDomain, 13, MatMul); +class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kXnnpackExecutionProvider, kOnnxDomain, 1, 10, Softmax); +class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kXnnpackExecutionProvider, kOnnxDomain, 11, 12, Softmax); +class ONNX_OPERATOR_KERNEL_CLASS_NAME(kXnnpackExecutionProvider, kOnnxDomain, 13, Softmax); + +// Internal domain +class ONNX_OPERATOR_KERNEL_CLASS_NAME(kXnnpackExecutionProvider, kDynamicDomainByCreate, 1, QLinearSoftmax); + std::unique_ptr RegisterKernels() { auto kernel_registry = std::make_unique(); static const BuildKernelCreateInfoFn function_table[] = { BuildKernelCreateInfo, // default entry to avoid the list becoming empty after ops-reducing - KERNEL_CREATE_INFO(11, Conv), - KERNEL_CREATE_INFO(11, ConvTranspose), - KERNEL_CREATE_INFO_VERSIONED(1, 10, ConvTranspose), - KERNEL_CREATE_INFO(1, QLinearConvTranspose), - KERNEL_CREATE_INFO_VERSIONED(11, 11, MaxPool), - KERNEL_CREATE_INFO(12, MaxPool), - KERNEL_CREATE_INFO(11, AveragePool), + // layout sensitive. 
nodes will be moved to kMSInternalNHWCDomain by layout transformation + KERNEL_CREATE_INFO_VERSIONED(7, 9, AveragePool, kMSInternalNHWCDomain), + KERNEL_CREATE_INFO_VERSIONED(10, 10, AveragePool, kMSInternalNHWCDomain), + KERNEL_CREATE_INFO_VERSIONED(11, 18, AveragePool, kMSInternalNHWCDomain), + KERNEL_CREATE_INFO(19, AveragePool, kMSInternalNHWCDomain), + + KERNEL_CREATE_INFO_VERSIONED(1, 10, Conv, kMSInternalNHWCDomain), + KERNEL_CREATE_INFO(11, Conv, kMSInternalNHWCDomain), + + KERNEL_CREATE_INFO_VERSIONED(1, 10, ConvTranspose, kMSInternalNHWCDomain), + KERNEL_CREATE_INFO(11, ConvTranspose, kMSInternalNHWCDomain), + + KERNEL_CREATE_INFO_VERSIONED(8, 9, MaxPool, kMSInternalNHWCDomain), + KERNEL_CREATE_INFO_VERSIONED(10, 10, MaxPool, kMSInternalNHWCDomain), + KERNEL_CREATE_INFO_VERSIONED(11, 11, MaxPool, kMSInternalNHWCDomain), + KERNEL_CREATE_INFO(12, MaxPool, kMSInternalNHWCDomain), + + KERNEL_CREATE_INFO(1, QLinearConvTranspose, kMSInternalNHWCDomain), + + KERNEL_CREATE_INFO_VERSIONED(10, 10, Resize, kMSInternalNHWCDomain), + KERNEL_CREATE_INFO_VERSIONED(11, 12, Resize, kMSInternalNHWCDomain), + KERNEL_CREATE_INFO_VERSIONED(13, 17, Resize, kMSInternalNHWCDomain), + KERNEL_CREATE_INFO_VERSIONED(18, 18, Resize, kMSInternalNHWCDomain), + KERNEL_CREATE_INFO(19, Resize, kMSInternalNHWCDomain), + // layout insensitive, use ONNX-domain directly - BuildKernelCreateInfo< - ONNX_OPERATOR_KERNEL_CLASS_NAME(kXnnpackExecutionProvider, kOnnxDomain, 13, Softmax)>, - BuildKernelCreateInfo< - ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kXnnpackExecutionProvider, kOnnxDomain, 1, 12, Softmax)>, - BuildKernelCreateInfo< - ONNX_OPERATOR_KERNEL_CLASS_NAME(kXnnpackExecutionProvider, kOnnxDomain, 19, Resize)>, - BuildKernelCreateInfo< - ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kXnnpackExecutionProvider, kOnnxDomain, 18, 18, Resize)>, - BuildKernelCreateInfo< - ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kXnnpackExecutionProvider, kOnnxDomain, 13, 17, Resize)>, - BuildKernelCreateInfo< - ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kXnnpackExecutionProvider, kOnnxDomain, 11, 12, Resize)>, - BuildKernelCreateInfo< - ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kXnnpackExecutionProvider, kOnnxDomain, 10, 10, Resize)>, - BuildKernelCreateInfo< - ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kXnnpackExecutionProvider, kOnnxDomain, 7, 12, Gemm)>, - BuildKernelCreateInfo< - ONNX_OPERATOR_KERNEL_CLASS_NAME(kXnnpackExecutionProvider, kOnnxDomain, 13, Gemm)>, - BuildKernelCreateInfo< - ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kXnnpackExecutionProvider, kOnnxDomain, 1, 12, MatMul)>, - BuildKernelCreateInfo< - ONNX_OPERATOR_KERNEL_CLASS_NAME(kXnnpackExecutionProvider, kOnnxDomain, 13, MatMul)>, + KERNEL_CREATE_INFO_VERSIONED(1, 10, Softmax, kOnnxDomain), + KERNEL_CREATE_INFO_VERSIONED(11, 12, Softmax, kOnnxDomain), + KERNEL_CREATE_INFO(13, Softmax, kOnnxDomain), + + KERNEL_CREATE_INFO_VERSIONED(7, 8, Gemm, kOnnxDomain), + KERNEL_CREATE_INFO_VERSIONED(9, 10, Gemm, kOnnxDomain), + KERNEL_CREATE_INFO_VERSIONED(11, 12, Gemm, kOnnxDomain), + KERNEL_CREATE_INFO(13, Gemm, kOnnxDomain), + + KERNEL_CREATE_INFO_VERSIONED(1, 8, MatMul, kOnnxDomain), + KERNEL_CREATE_INFO_VERSIONED(9, 12, MatMul, kOnnxDomain), + KERNEL_CREATE_INFO(13, MatMul, kOnnxDomain), // quantization op - KERNEL_CREATE_INFO_TYPED(10, uint8_t, QLinearConv), - KERNEL_CREATE_INFO_TYPED(10, int8_t, QLinearConv), - KERNEL_CREATE_INFO(1, QLinearAveragePool), - BuildKernelCreateInfo< - ONNX_OPERATOR_KERNEL_CLASS_NAME(kXnnpackExecutionProvider, kDynamicDomainByCreate, 1, 
QLinearSoftmax)>, + KERNEL_CREATE_INFO(1, QLinearAveragePool, kMSInternalNHWCDomain), + + KERNEL_CREATE_INFO_TYPED(10, uint8_t, QLinearConv, kMSInternalNHWCDomain), + KERNEL_CREATE_INFO_TYPED(10, int8_t, QLinearConv, kMSInternalNHWCDomain), + + KERNEL_CREATE_INFO(1, QLinearSoftmax, kDynamicDomainByCreate), }; for (auto& function_table_entry : function_table) { diff --git a/onnxruntime/core/providers/xnnpack/xnnpack_init.cc b/onnxruntime/core/providers/xnnpack/xnnpack_init.cc index 27634a8b7090c..c3aa1d987c194 100644 --- a/onnxruntime/core/providers/xnnpack/xnnpack_init.cc +++ b/onnxruntime/core/providers/xnnpack/xnnpack_init.cc @@ -26,13 +26,15 @@ void xnn_deallocate(void* context, void* pointer) { } void* xnn_aligned_allocate(void* context, size_t alignment, size_t size) { + if (size == 0) + return nullptr; + #if defined(__wasm__) && !defined(__wasm_relaxed_simd__) && !defined(__wasm_simd128__) ORT_ENFORCE(alignment <= 2 * sizeof(void*)); return xnn_allocate(context, size); #else void* ptr = xnn_allocate(context, size); - ORT_ENFORCE((int64_t(ptr) & (alignment - 1)) == 0, - " xnnpack wants to allocate a space with ", alignment, "bytes aligned. But it's not satisfied"); + ORT_ENFORCE((int64_t(ptr) & (alignment - 1)) == 0, "xnnpack allocation was not aligned to ", alignment, " bytes."); // if ptr is not aligned, we have to find a way to return a aligned ptr and store the original ptr return ptr; #endif diff --git a/onnxruntime/core/providers/xnnpack/xnnpack_init.h b/onnxruntime/core/providers/xnnpack/xnnpack_init.h index d309edd0c3a4e..a1e64bf6046b2 100644 --- a/onnxruntime/core/providers/xnnpack/xnnpack_init.h +++ b/onnxruntime/core/providers/xnnpack/xnnpack_init.h @@ -5,6 +5,47 @@ struct xnn_allocator; namespace onnxruntime { namespace xnnpack { +// copy #define logic from XNNPACK src/xnnpack/common.h to determine workspace alignment +#if defined(__APPLE__) +#include +#endif + +#if defined(__i386__) || defined(__i486__) || defined(__i586__) || defined(__i686__) || defined(_M_IX86) +#define XNN_ARCH_X86 1 +#else +#define XNN_ARCH_X86 0 +#endif + +#if defined(__x86_64__) || defined(__x86_64) || defined(_M_X64) && !defined(_M_ARM64EC) +#define XNN_ARCH_X86_64 1 +#else +#define XNN_ARCH_X86_64 0 +#endif + +#if defined(__wasm__) && !defined(__wasm_relaxed_simd__) && !defined(__wasm_simd128__) +#define XNN_ARCH_WASM 1 +#else +#define XNN_ARCH_WASM 0 +#endif + +#if defined(__ANDROID__) || (defined(__APPLE__) && TARGET_OS_IPHONE) +#define XNN_PLATFORM_MOBILE 1 +#else +#define XNN_PLATFORM_MOBILE 0 +#endif + +#if XNN_ARCH_WASM +#define XNN_ALLOCATION_ALIGNMENT 4 +#elif XNN_ARCH_X86 || XNN_ARCH_X86_64 +#if XNN_PLATFORM_MOBILE +#define XNN_ALLOCATION_ALIGNMENT 32 +#else +#define XNN_ALLOCATION_ALIGNMENT 64 +#endif +#else +#define XNN_ALLOCATION_ALIGNMENT 16 +#endif + std::pair GetStoredAllocator(); } // namespace xnnpack diff --git a/onnxruntime/core/providers/xnnpack/xnnpack_kernel.h b/onnxruntime/core/providers/xnnpack/xnnpack_kernel.h index ada39c767f7c6..0978a88288114 100644 --- a/onnxruntime/core/providers/xnnpack/xnnpack_kernel.h +++ b/onnxruntime/core/providers/xnnpack/xnnpack_kernel.h @@ -4,6 +4,7 @@ #pragma once #include "core/framework/op_kernel.h" #include "core/providers/xnnpack/xnnpack_execution_provider.h" +#include "xnnpack.h" struct pthreadpool; @@ -12,18 +13,59 @@ namespace xnnpack { class XnnpackKernel : public OpKernel { public: - explicit XnnpackKernel(const OpKernelInfo& info) - : OpKernel(info), - xnnpack_threadpool_( - static_cast(info.GetExecutionProvider()) - 
->GetPrivateThreadPool()) { + explicit XnnpackKernel(const OpKernelInfo& info, bool enable_caches = false) + : OpKernel{info}, + xnnpack_threadpool_{ + static_cast(info.GetExecutionProvider())->GetPrivateThreadPool()}, + caches_{enable_caches} { } [[nodiscard]] pthreadpool* GetThreadPool() const { return xnnpack_threadpool_; } + // see comment below about enabling code cache + // xnn_code_cache_t GetCodeCache() { return caches_.auto_code_cache.get();} + xnn_code_cache_t GetCodeCache() { return nullptr; } + xnn_weights_cache_t GetWeightsCache() { return caches_.auto_weights_cache.get(); } + private: pthreadpool* xnnpack_threadpool_; + + // Helper class to wrap usage of the XNNPACK weights and code caches. + // NOTE: Currently creating/freeing the code cache is not exposed via the public xnnpack.h header so usage is + // commented out. If we need to use it, we'll need to add the 'src' directory of XNNPACK to the include path + // and #include "xnnpack/cache.h" + struct Caches { + Caches(bool enable) + : // auto_code_cache(nullptr, xnn_release_code_cache), + auto_weights_cache(nullptr, xnn_delete_weights_cache) { + if (enable) { +#ifdef XNN_CACHE_ENABLE + xnn_status status = xnn_status_success; +#if XNN_PLATFORM_JIT + // status = xnn_init_code_cache(&code_cache_); + // ORT_ENFORCE(status == xnn_status_success, "Failed to initialize XNNPACK code cache");) + // auto_code_cache.reset(&code_cache_); +#endif + // status = xnn_init_weights_cache(&weights_cache_); + xnn_weights_cache_t weights_cache = nullptr; + status = xnn_create_weights_cache(&weights_cache, 0); + ORT_ENFORCE(status == xnn_status_success, "Failed to create XNNPACK weights cache"); + auto_weights_cache.reset(weights_cache); +#endif + } + } + + // std::unique_ptr auto_code_cache; + std::unique_ptr auto_weights_cache; + + // private: + // #if defined(XNN_CACHE_ENABLE) && XNN_PLATFORM_JIT + // xnn_code_cache code_cache_; + // #endif + }; + + Caches caches_; }; } // namespace xnnpack } // namespace onnxruntime diff --git a/onnxruntime/python/onnxruntime_pybind_iobinding.cc b/onnxruntime/python/onnxruntime_pybind_iobinding.cc index 7638a12bb820c..59d5a77bfbea3 100644 --- a/onnxruntime/python/onnxruntime_pybind_iobinding.cc +++ b/onnxruntime/python/onnxruntime_pybind_iobinding.cc @@ -60,8 +60,6 @@ void addIoBindingMethods(pybind11::module& m) { }) // This binds input as a Tensor that wraps memory pointer along with the OrtMemoryInfo .def("bind_input", [](SessionIOBinding* io_binding, const std::string& name, const OrtDevice& device, py::object& element_type, const std::vector& shape, int64_t data_ptr) -> void { - ORT_ENFORCE(data_ptr != 0, "Pointer to data memory is not valid"); - PyArray_Descr* dtype; if (!PyArray_DescrConverter(element_type.ptr(), &dtype)) { throw std::runtime_error("Not a valid numpy type"); diff --git a/onnxruntime/python/onnxruntime_pybind_quant.cc b/onnxruntime/python/onnxruntime_pybind_quant.cc index 04dfa9b51e112..ff76887e917cd 100644 --- a/onnxruntime/python/onnxruntime_pybind_quant.cc +++ b/onnxruntime/python/onnxruntime_pybind_quant.cc @@ -5,7 +5,7 @@ #include #include -#include "contrib_ops/cpu/quantization/dequantize_blockwise.h" +#include "core/mlas/inc/mlas_q4.h" #include "contrib_ops/cpu/quantization/dequantize_blockwise_bnb4.h" #include "core/util/thread_utils.h" @@ -53,15 +53,16 @@ void QuantizeMatMul4BitsBlockwise( py::buffer_info scale_buf = scale.request(); py::buffer_info zp_buf = zero_points.request(); - contrib::QuantizeBlockwise( - static_cast(dst_buf.ptr), - static_cast(src_buf.ptr), - 
static_cast(scale_buf.ptr), - is_symmetric ? nullptr : static_cast(zp_buf.ptr), + MlasQuantizeBlockwise( + reinterpret_cast(dst_buf.ptr), + reinterpret_cast(scale_buf.ptr), + is_symmetric ? nullptr : reinterpret_cast(zp_buf.ptr), + reinterpret_cast(src_buf.ptr), block_size, - 4, - N, + true, K, + N, + N, tp.get()); } diff --git a/onnxruntime/python/tools/kernel_explorer/kernels/matmul_4bits.py b/onnxruntime/python/tools/kernel_explorer/kernels/matmul_4bits.py index 9cb937a13ff27..111e156cd6d01 100644 --- a/onnxruntime/python/tools/kernel_explorer/kernels/matmul_4bits.py +++ b/onnxruntime/python/tools/kernel_explorer/kernels/matmul_4bits.py @@ -56,7 +56,7 @@ def profile_matmul_fp_int4_func(m, n, k, dtype, func, is_symmetric): a = np.random.rand(m, k).astype(dtype) b = np.random.randint(low=0, high=127, size=(n, (k + 31) // 32, 16)).astype("uint8") scales = np.random.rand(n * ((k + 31) // 32)).astype(dtype) - zeropoints = np.random.rand((n * ((k + 31) // 32) + 1) // 2).astype(dtype) + zeropoints = np.random.rand(n * (((k + 31) // 32 + 1) // 2)).astype(dtype) output_d = ke.DeviceArray(output) a_d = ke.DeviceArray(a) diff --git a/onnxruntime/python/tools/quantization/matmul_4bits_quantizer.py b/onnxruntime/python/tools/quantization/matmul_4bits_quantizer.py index fea9e5e8cb739..1c3c212b54fa4 100644 --- a/onnxruntime/python/tools/quantization/matmul_4bits_quantizer.py +++ b/onnxruntime/python/tools/quantization/matmul_4bits_quantizer.py @@ -61,7 +61,7 @@ def int4_block_quant(self, fp32weight: npt.ArrayLike) -> np.ndarray: # block wise quantization, each block comes from a single column packed = np.zeros((cols, k_blocks, blob_size), dtype="uint8") scales = np.zeros((cols * k_blocks), dtype=fp32weight.dtype) - zero_point = np.zeros((cols * k_blocks + 1) // 2, dtype="uint8") + zero_point = np.zeros(cols * ((k_blocks + 1) // 2), dtype="uint8") quantize_matmul_4bits(packed, fp32weight, scales, zero_point, block_size, cols, rows, self.is_symmetric) return (packed, scales, zero_point) diff --git a/onnxruntime/test/contrib_ops/matmul_4bits_test.cc b/onnxruntime/test/contrib_ops/matmul_4bits_test.cc index dc8efbbaf3709..918ee0e6eb976 100644 --- a/onnxruntime/test/contrib_ops/matmul_4bits_test.cc +++ b/onnxruntime/test/contrib_ops/matmul_4bits_test.cc @@ -14,7 +14,6 @@ #include "test/providers/provider_test_utils.h" #include "test/util/include/default_providers.h" #include "core/util/qmath.h" -#include "contrib_ops/cpu/quantization/dequantize_blockwise.h" #include #include @@ -25,6 +24,8 @@ namespace onnxruntime { namespace test { +static constexpr int QBits = 4; + void QuantizeDequantize(std::vector& raw_vals, std::vector& quant_vals, std::vector& scales, @@ -35,27 +36,29 @@ void QuantizeDequantize(std::vector& raw_vals, OrtThreadPoolParams to; auto tp = concurrency::CreateThreadPool(&onnxruntime::Env::Default(), to, concurrency::ThreadPoolType::INTRA_OP); - contrib::QuantizeBlockwise( + + MlasQuantizeBlockwise( quant_vals.data(), - raw_vals.data(), scales.data(), zp != nullptr ? zp->data() : nullptr, + raw_vals.data(), block_size, - 4, - N, + true, K, + N, + N, tp.get()); // Note that input1_f_vals is NxK after dequant - contrib::DequantizeBlockwise( - raw_vals.data(), - quant_vals.data(), - scales.data(), - zp != nullptr ? zp->data() : nullptr, - block_size, - 4, - N, - K, + MlasDequantizeBlockwise( + raw_vals.data(), // dequantized output + quant_vals.data(), // quantized input + scales.data(), // quantization scales + zp != nullptr ? 
zp->data() : nullptr, // quantization zero points + block_size, // quantization block size + true, // columnwise quantization + K, // number of rows + N, // number of columns tp.get()); } @@ -69,13 +72,21 @@ void RunTest(int64_t M, int64_t N, int64_t K, int64_t block_size, bool has_zerop MlasTranspose(input1_f_vals.data(), input1_f_vals_trans.data(), K, N); #endif - int64_t block_per_k = (K + block_size - 1) / block_size; - int64_t number_of_block = block_per_k * N; - int64_t block_blob_size = block_size * 4 / 8; - int64_t buf_size = number_of_block * (block_size * 4 / 8); - std::vector input1_vals(buf_size); - std::vector scales(number_of_block); - std::vector zp((N * block_per_k + 1) / 2); + int meta_rows; + int meta_cols; + MlasBlockwiseQuantMetaShape((int)block_size, true, (int)K, (int)N, meta_rows, meta_cols); + + int q_rows; + int q_cols; + MlasBlockwiseQuantizedShape((int)block_size, true, (int)K, (int)N, q_rows, q_cols); + + std::vector input1_vals(q_rows * q_cols); + std::vector scales(meta_rows * meta_cols); + + // TODO!! THIS SHOULD BE PROVIDED BY MLAS + // sub 8b packing always happen on the column dimension + const int packed_meta_rows = (meta_rows * QBits + 7) / 8; + std::vector zp(packed_meta_rows * meta_cols); QuantizeDequantize(input1_f_vals, input1_vals, @@ -100,13 +111,13 @@ void RunTest(int64_t M, int64_t N, int64_t K, int64_t block_size, bool has_zerop test.AddAttribute("K", K); test.AddAttribute("N", N); test.AddAttribute("block_size", block_size); - test.AddAttribute("bits", 4); + test.AddAttribute("bits", QBits); if (use_float16) { test.AddInput("A", {M, K}, ToFloat16(input0_vals), false); - test.AddInput("B", {N, block_per_k, block_blob_size}, input1_vals, true); - test.AddInput("scales", {N * block_per_k}, ToFloat16(scales), true); + test.AddInput("B", {q_cols, q_rows}, input1_vals, true); + test.AddInput("scales", {meta_cols * meta_rows}, ToFloat16(scales), true); if (has_zeropoint) { - test.AddInput("zero_points", {(N * block_per_k + 1) / 2}, zp, true); + test.AddInput("zero_points", {meta_cols * packed_meta_rows}, zp, true); } test.AddOutput("Y", {M, N}, ToFloat16(expected_vals)); @@ -117,10 +128,10 @@ void RunTest(int64_t M, int64_t N, int64_t K, int64_t block_size, bool has_zerop test.Run(OpTester::ExpectResult::kExpectSuccess, "", {}, nullptr, &execution_providers); } else { test.AddInput("A", {M, K}, input0_vals, false); - test.AddInput("B", {N, block_per_k, block_blob_size}, input1_vals, true); - test.AddInput("scales", {N * block_per_k}, scales, true); + test.AddInput("B", {q_cols, q_rows}, input1_vals, true); + test.AddInput("scales", {meta_cols * meta_rows}, scales, true); if (has_zeropoint) { - test.AddInput("zero_points", {(N * block_per_k + 1) / 2}, zp, true); + test.AddInput("zero_points", {meta_cols * packed_meta_rows}, zp, true); } test.AddOutput("Y", {M, N}, expected_vals); diff --git a/onnxruntime/test/mlas/unittest/test_activation.cpp b/onnxruntime/test/mlas/unittest/test_activation.cpp index eb3e35d739bb3..2bb0bbcd35e26 100644 --- a/onnxruntime/test/mlas/unittest/test_activation.cpp +++ b/onnxruntime/test/mlas/unittest/test_activation.cpp @@ -256,9 +256,6 @@ class MlasActivationTest : public MlasTestBase { } }; -template <> -MlasActivationTest* MlasTestFixture::mlas_tester(nullptr); - static UNUSED_VARIABLE bool added_to_main = AddTestRegister([](bool is_short_execute) { return is_short_execute ? 
MlasDirectShortExecuteTests::RegisterShortExecute() : 0; }); diff --git a/onnxruntime/test/mlas/unittest/test_blkq8.cpp b/onnxruntime/test/mlas/unittest/test_blkq8.cpp index 15bbd1b4cb28d..5cff86d411ca9 100644 --- a/onnxruntime/test/mlas/unittest/test_blkq8.cpp +++ b/onnxruntime/test/mlas/unittest/test_blkq8.cpp @@ -150,12 +150,6 @@ class MlasBlkQ8ShortExeTest : public MlasTestFixture> { size_t M_, K_; }; -template <> -MlasBlkQ8Test* MlasTestFixture>::mlas_tester(nullptr); - -template <> -MlasBlkQ8Test* MlasTestFixture>::mlas_tester(nullptr); - static size_t BlkQ8ReisterShortTests() { size_t cnt = 0; cnt += MlasBlkQ8ShortExeTest::RegisterShortExecuteTests(); diff --git a/onnxruntime/test/mlas/unittest/test_blockq4.cpp b/onnxruntime/test/mlas/unittest/test_blockq4.cpp index 6f06e0f2eead8..f836da8277bb8 100644 --- a/onnxruntime/test/mlas/unittest/test_blockq4.cpp +++ b/onnxruntime/test/mlas/unittest/test_blockq4.cpp @@ -96,7 +96,8 @@ class MlasBlockwiseQdqTest : public MlasTestBase { } } - MlasDequantizeBlockwise(dequant_buf, elements, scales, zp, block_size, columnwise, rows, columns, threadpool_ptr); + MlasDequantizeBlockwise(dequant_buf, elements, scales, zp, block_size, + columnwise, rows, columns, threadpool_ptr); MlasTranspose(dequant_buf, transposed, columns, rows); @@ -104,7 +105,8 @@ class MlasBlockwiseQdqTest : public MlasTestBase { float* o_scales = OutputScales.GetBuffer(meta_rows * meta_cols); uint8_t* o_zp = symmetric ? nullptr : OutputOffsets.GetBuffer(((meta_rows + 1) / 2) * meta_cols, true); - MlasQuantizeBlockwise(o_elements, o_scales, o_zp, transposed, block_size, columnwise, rows, columns, columns, threadpool_ptr); + MlasQuantizeBlockwise(o_elements, o_scales, o_zp, transposed, block_size, + columnwise, rows, columns, columns, threadpool_ptr); for (int c = 0; c < columns; c++) { for (int r = 0; r < rows; r += 2) { @@ -194,9 +196,6 @@ class MlasBlockwiseQdqTest : public MlasTestBase { MlasBlockwiseQdqTest() = default; }; -template <> -MlasBlockwiseQdqTest* MlasTestFixture::mlas_tester(nullptr); - static UNUSED_VARIABLE bool added_to_main = AddTestRegister([](bool is_short_execute) { size_t count = 0; if (is_short_execute) { diff --git a/onnxruntime/test/mlas/unittest/test_conv2d.cpp b/onnxruntime/test/mlas/unittest/test_conv2d.cpp index 97560bbfc2e7e..1700cd8f1800f 100644 --- a/onnxruntime/test/mlas/unittest/test_conv2d.cpp +++ b/onnxruntime/test/mlas/unittest/test_conv2d.cpp @@ -4,11 +4,6 @@ #include "test_conv2d.h" #include "test_conv2d_fixture.h" -template <> -MlasConv2DTest* MlasTestFixture>::mlas_tester(nullptr); -template <> -MlasConv2DTest* MlasTestFixture>::mlas_tester(nullptr); - static size_t Conv2dRegistLongExecute() { size_t count = MlasLongExecuteTests>::RegisterLongExecute(); if (GetMlasThreadPool() != nullptr) { diff --git a/onnxruntime/test/mlas/unittest/test_conv2d_nchwc.cpp b/onnxruntime/test/mlas/unittest/test_conv2d_nchwc.cpp index 78a047e385b99..e5a536eb9e4f0 100644 --- a/onnxruntime/test/mlas/unittest/test_conv2d_nchwc.cpp +++ b/onnxruntime/test/mlas/unittest/test_conv2d_nchwc.cpp @@ -4,11 +4,6 @@ #include "test_conv2d_nchwc.h" #include "test_conv2d_fixture.h" -template <> -MlasNchwcConv2DTest* MlasTestFixture>::mlas_tester(nullptr); -template <> -MlasNchwcConv2DTest* MlasTestFixture>::mlas_tester(nullptr); - static size_t Conv2dNchwcRegistLongExecute() { size_t count = 0; diff --git a/onnxruntime/test/mlas/unittest/test_exp.cpp b/onnxruntime/test/mlas/unittest/test_exp.cpp index ce8c4e97748f8..f9cdffef1947d 100644 --- 
a/onnxruntime/test/mlas/unittest/test_exp.cpp +++ b/onnxruntime/test/mlas/unittest/test_exp.cpp @@ -50,9 +50,6 @@ class MlasComputeExpTest : public MlasTestBase { } }; -template <> -MlasComputeExpTest* MlasTestFixture::mlas_tester(nullptr); - static UNUSED_VARIABLE bool added_to_main = AddTestRegister([](bool is_short_execute) { // no long execute needed return is_short_execute ? MlasDirectShortExecuteTests::RegisterShortExecute() : 0; diff --git a/onnxruntime/test/mlas/unittest/test_fgemm.cpp b/onnxruntime/test/mlas/unittest/test_fgemm.cpp index 6b8d4529faadb..e3f50baf3633d 100644 --- a/onnxruntime/test/mlas/unittest/test_fgemm.cpp +++ b/onnxruntime/test/mlas/unittest/test_fgemm.cpp @@ -7,24 +7,6 @@ #include #include -template <> -MlasFgemmTest* MlasTestFixture>::mlas_tester(nullptr); -template <> -MlasFgemmTest* MlasTestFixture>::mlas_tester(nullptr); -template <> -MlasFgemmTest* MlasTestFixture>::mlas_tester(nullptr); -template <> -MlasFgemmTest* MlasTestFixture>::mlas_tester(nullptr); - -#ifdef MLAS_SUPPORTS_GEMM_DOUBLE - -template <> -MlasFgemmTest* MlasTestFixture>::mlas_tester(nullptr); -template <> -MlasFgemmTest* MlasTestFixture>::mlas_tester(nullptr); - -#endif - static size_t FGemmRegistLongExecute() { size_t count = 0; diff --git a/onnxruntime/test/mlas/unittest/test_fp16_activation.cpp b/onnxruntime/test/mlas/unittest/test_fp16_activation.cpp index a9e062e0b6534..484a9a22429d5 100644 --- a/onnxruntime/test/mlas/unittest/test_fp16_activation.cpp +++ b/onnxruntime/test/mlas/unittest/test_fp16_activation.cpp @@ -148,9 +148,6 @@ class MlasFp16ActivationTest : public MlasTestBase { } }; -template <> -MlasFp16ActivationTest* MlasTestFixture::mlas_tester(nullptr); - static UNUSED_VARIABLE bool added_to_main = AddTestRegister([](bool is_short_execute) { return is_short_execute ? 
MlasDirectShortExecuteTests::RegisterShortExecute() : 0; }); diff --git a/onnxruntime/test/mlas/unittest/test_halfgemm.cpp b/onnxruntime/test/mlas/unittest/test_halfgemm.cpp index 1a307d339b0f2..2a478675d09eb 100644 --- a/onnxruntime/test/mlas/unittest/test_halfgemm.cpp +++ b/onnxruntime/test/mlas/unittest/test_halfgemm.cpp @@ -89,42 +89,6 @@ class HalfGemmShortExecuteTest : public MlasTestFixture -MlasHalfGemmTest* MlasTestFixture>::mlas_tester(nullptr); -template <> -MlasHalfGemmTest* MlasTestFixture>::mlas_tester(nullptr); -template <> -MlasHalfGemmTest* MlasTestFixture>::mlas_tester(nullptr); -template <> -MlasHalfGemmTest* MlasTestFixture>::mlas_tester(nullptr); - -template <> -MlasHalfGemmTest* MlasTestFixture>::mlas_tester(nullptr); -template <> -MlasHalfGemmTest* MlasTestFixture>::mlas_tester(nullptr); -template <> -MlasHalfGemmTest* MlasTestFixture>::mlas_tester(nullptr); -template <> -MlasHalfGemmTest* MlasTestFixture>::mlas_tester(nullptr); - -template <> -MlasHalfGemmTest* MlasTestFixture>::mlas_tester(nullptr); -template <> -MlasHalfGemmTest* MlasTestFixture>::mlas_tester(nullptr); -template <> -MlasHalfGemmTest* MlasTestFixture>::mlas_tester(nullptr); -template <> -MlasHalfGemmTest* MlasTestFixture>::mlas_tester(nullptr); - -template <> -MlasHalfGemmTest* MlasTestFixture>::mlas_tester(nullptr); -template <> -MlasHalfGemmTest* MlasTestFixture>::mlas_tester(nullptr); -template <> -MlasHalfGemmTest* MlasTestFixture>::mlas_tester(nullptr); -template <> -MlasHalfGemmTest* MlasTestFixture>::mlas_tester(nullptr); - static size_t HalfGemmRegistLongExecute() { size_t count = 0; diff --git a/onnxruntime/test/mlas/unittest/test_minmax.cpp b/onnxruntime/test/mlas/unittest/test_minmax.cpp index f0df504720c0c..245879deccffd 100644 --- a/onnxruntime/test/mlas/unittest/test_minmax.cpp +++ b/onnxruntime/test/mlas/unittest/test_minmax.cpp @@ -46,9 +46,6 @@ class MlasFindMinMaxElementsTest : public MlasTestBase { } }; -template <> -MlasFindMinMaxElementsTest* MlasTestFixture::mlas_tester(nullptr); - #ifdef __GNUC__ #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wunused-parameter" diff --git a/onnxruntime/test/mlas/unittest/test_pool2d.cpp b/onnxruntime/test/mlas/unittest/test_pool2d.cpp index 012e7f25fddce..8cefb8332ec32 100644 --- a/onnxruntime/test/mlas/unittest/test_pool2d.cpp +++ b/onnxruntime/test/mlas/unittest/test_pool2d.cpp @@ -4,20 +4,6 @@ #include "test_pool2d.h" #include "test_pool2d_fixture.h" -template <> -MlasPool2DTest* MlasTestFixture>::mlas_tester(nullptr); -template <> -MlasPool2DTest* MlasTestFixture>::mlas_tester(nullptr); -template <> -MlasPool2DTest* MlasTestFixture>::mlas_tester(nullptr); - -template <> -MlasPool2DTest* MlasTestFixture>::mlas_tester(nullptr); -template <> -MlasPool2DTest* MlasTestFixture>::mlas_tester(nullptr); -template <> -MlasPool2DTest* MlasTestFixture>::mlas_tester(nullptr); - static size_t Pool2dRegistLongExecute() { size_t count = 0; count += MlasLongExecuteTests>::RegisterLongExecute(); diff --git a/onnxruntime/test/mlas/unittest/test_pool2d_nchwc.cpp b/onnxruntime/test/mlas/unittest/test_pool2d_nchwc.cpp index 190fbe7d5a6f1..bee690b10b737 100644 --- a/onnxruntime/test/mlas/unittest/test_pool2d_nchwc.cpp +++ b/onnxruntime/test/mlas/unittest/test_pool2d_nchwc.cpp @@ -4,20 +4,6 @@ #include "test_pool2d_nchwc.h" #include "test_pool2d_fixture.h" -template <> -MlasNchwcPool2DTest* MlasTestFixture>::mlas_tester(nullptr); -template <> -MlasNchwcPool2DTest* MlasTestFixture>::mlas_tester(nullptr); -template <> -MlasNchwcPool2DTest* 
MlasTestFixture>::mlas_tester(nullptr); - -template <> -MlasNchwcPool2DTest* MlasTestFixture>::mlas_tester(nullptr); -template <> -MlasNchwcPool2DTest* MlasTestFixture>::mlas_tester(nullptr); -template <> -MlasNchwcPool2DTest* MlasTestFixture>::mlas_tester(nullptr); - static size_t Pool2dNchwcRegistLongExecute() { size_t count = 0; if (MlasNchwcGetBlockSize() > 1) { diff --git a/onnxruntime/test/mlas/unittest/test_pool3d.cpp b/onnxruntime/test/mlas/unittest/test_pool3d.cpp index a93698234f7da..e0ce4c240be80 100644 --- a/onnxruntime/test/mlas/unittest/test_pool3d.cpp +++ b/onnxruntime/test/mlas/unittest/test_pool3d.cpp @@ -4,20 +4,6 @@ #include "test_pool3d.h" #include "test_pool3d_fixture.h" -template <> -MlasPool3DTest* MlasTestFixture>::mlas_tester(nullptr); -template <> -MlasPool3DTest* MlasTestFixture>::mlas_tester(nullptr); -template <> -MlasPool3DTest* MlasTestFixture>::mlas_tester(nullptr); - -template <> -MlasPool3DTest* MlasTestFixture>::mlas_tester(nullptr); -template <> -MlasPool3DTest* MlasTestFixture>::mlas_tester(nullptr); -template <> -MlasPool3DTest* MlasTestFixture>::mlas_tester(nullptr); - static size_t Pool3dRegistLongExecute() { size_t count = 0; count += MlasLongExecuteTests>::RegisterLongExecute(); diff --git a/onnxruntime/test/mlas/unittest/test_q4gemm.cpp b/onnxruntime/test/mlas/unittest/test_q4gemm.cpp index 2c3bf23a9330b..dccd7d00b6d3f 100644 --- a/onnxruntime/test/mlas/unittest/test_q4gemm.cpp +++ b/onnxruntime/test/mlas/unittest/test_q4gemm.cpp @@ -83,19 +83,6 @@ class Q4GemmShortExecuteTest : public MlasTestFixture -MlasQ4GemmTest* MlasTestFixture>::mlas_tester(nullptr); -template <> -MlasQ4GemmTest* MlasTestFixture>::mlas_tester(nullptr); -template <> -MlasQ4GemmTest* MlasTestFixture>::mlas_tester(nullptr); -template <> -MlasQ4GemmTest* MlasTestFixture>::mlas_tester(nullptr); -template <> -MlasQ4GemmTest* MlasTestFixture>::mlas_tester(nullptr); -template <> -MlasQ4GemmTest* MlasTestFixture>::mlas_tester(nullptr); - static size_t Q4GemmRegistShortExecute() { size_t count = 0; diff --git a/onnxruntime/test/mlas/unittest/test_q4qdq.cpp b/onnxruntime/test/mlas/unittest/test_q4qdq.cpp index 8215c63a2cc56..955c3b1201989 100644 --- a/onnxruntime/test/mlas/unittest/test_q4qdq.cpp +++ b/onnxruntime/test/mlas/unittest/test_q4qdq.cpp @@ -141,9 +141,6 @@ class MlasQ4dqTest : public MlasTestBase { MlasQ4dqTest() = default; }; -template <> -MlasQ4dqTest* MlasTestFixture::mlas_tester(nullptr); - static UNUSED_VARIABLE bool added_to_main = AddTestRegister([](bool is_short_execute) { if (MlasQ4GemmPackBSize(BlkQ4Sym, 32, 32) == 0) { return (size_t)0; diff --git a/onnxruntime/test/mlas/unittest/test_q8q4gemm.cpp b/onnxruntime/test/mlas/unittest/test_q8q4gemm.cpp index bac16b0103a6e..a78a3261d1f2a 100644 --- a/onnxruntime/test/mlas/unittest/test_q8q4gemm.cpp +++ b/onnxruntime/test/mlas/unittest/test_q8q4gemm.cpp @@ -271,19 +271,6 @@ class Q8Q4GemmShortExecuteTest : public MlasTestFixture -MlasQ8Q4GemmTest* MlasTestFixture>::mlas_tester(nullptr); -template <> -MlasQ8Q4GemmTest* MlasTestFixture>::mlas_tester(nullptr); -template <> -MlasQ8Q4GemmTest* MlasTestFixture>::mlas_tester(nullptr); -template <> -MlasQ8Q4GemmTest* MlasTestFixture>::mlas_tester(nullptr); -template <> -MlasQ8Q4GemmTest* MlasTestFixture>::mlas_tester(nullptr); -template <> -MlasQ8Q4GemmTest* MlasTestFixture>::mlas_tester(nullptr); - static size_t Q8Q4GemmRegistShortExecute() { size_t count = 0; diff --git a/onnxruntime/test/mlas/unittest/test_qgemm.cpp b/onnxruntime/test/mlas/unittest/test_qgemm.cpp index 
a55331f1377fa..6bb93d35357f8 100644 --- a/onnxruntime/test/mlas/unittest/test_qgemm.cpp +++ b/onnxruntime/test/mlas/unittest/test_qgemm.cpp @@ -1,60 +1,6 @@ #include "test_qgemm.h" #include "test_qgemm_fixture.h" -template <> -MlasQgemmTest* MlasTestFixture>::mlas_tester(nullptr); -template <> -MlasQgemmTest* MlasTestFixture>::mlas_tester(nullptr); -template <> -MlasQgemmTest* MlasTestFixture>::mlas_tester(nullptr); -template <> -MlasQgemmTest* MlasTestFixture>::mlas_tester(nullptr); - -template <> -MlasQgemmTest* MlasTestFixture>::mlas_tester(nullptr); -template <> -MlasQgemmTest* MlasTestFixture>::mlas_tester(nullptr); -template <> -MlasQgemmTest* MlasTestFixture>::mlas_tester(nullptr); -template <> -MlasQgemmTest* MlasTestFixture>::mlas_tester(nullptr); - -template <> -MlasQgemmTest* MlasTestFixture>::mlas_tester(nullptr); -template <> -MlasQgemmTest* MlasTestFixture>::mlas_tester(nullptr); -template <> -MlasQgemmTest* MlasTestFixture>::mlas_tester(nullptr); -template <> -MlasQgemmTest* MlasTestFixture>::mlas_tester(nullptr); - -template <> -MlasQgemmTest* MlasTestFixture>::mlas_tester(nullptr); -template <> -MlasQgemmTest* MlasTestFixture>::mlas_tester(nullptr); -template <> -MlasQgemmTest* MlasTestFixture>::mlas_tester(nullptr); -template <> -MlasQgemmTest* MlasTestFixture>::mlas_tester(nullptr); - -template <> -MlasQgemmTest* MlasTestFixture>::mlas_tester(nullptr); -template <> -MlasQgemmTest* MlasTestFixture>::mlas_tester(nullptr); -template <> -MlasQgemmTest* MlasTestFixture>::mlas_tester(nullptr); -template <> -MlasQgemmTest* MlasTestFixture>::mlas_tester(nullptr); - -template <> -MlasQgemmTest* MlasTestFixture>::mlas_tester(nullptr); -template <> -MlasQgemmTest* MlasTestFixture>::mlas_tester(nullptr); -template <> -MlasQgemmTest* MlasTestFixture>::mlas_tester(nullptr); -template <> -MlasQgemmTest* MlasTestFixture>::mlas_tester(nullptr); - static size_t QGemmRegistLongExecute() { size_t count = 0; diff --git a/onnxruntime/test/mlas/unittest/test_qlinear_binaryop.cpp b/onnxruntime/test/mlas/unittest/test_qlinear_binaryop.cpp index 93dda4bee183b..5876f186eaa0d 100644 --- a/onnxruntime/test/mlas/unittest/test_qlinear_binaryop.cpp +++ b/onnxruntime/test/mlas/unittest/test_qlinear_binaryop.cpp @@ -163,11 +163,6 @@ class MlasQLinearMulTest : public MlasQLinearBinaryOpTest { } }; -template <> -MlasQLinearAddTest* MlasTestFixture::mlas_tester(nullptr); -template <> -MlasQLinearMulTest* MlasTestFixture::mlas_tester(nullptr); - static bool UNUSED_VARIABLE added_to_main = AddTestRegister([](bool is_short_execute) { size_t count = 0; if (is_short_execute) { diff --git a/onnxruntime/test/mlas/unittest/test_qlinear_gavgpool.cpp b/onnxruntime/test/mlas/unittest/test_qlinear_gavgpool.cpp index aeb13af5b941a..e6c230df57fbc 100644 --- a/onnxruntime/test/mlas/unittest/test_qlinear_gavgpool.cpp +++ b/onnxruntime/test/mlas/unittest/test_qlinear_gavgpool.cpp @@ -162,11 +162,6 @@ class MlasQLinearGlobalAveragePoolTest : public MlasTestBase { } }; -template <> -MlasQLinearGlobalAveragePoolTest* MlasTestFixture>::mlas_tester(nullptr); -template <> -MlasQLinearGlobalAveragePoolTest* MlasTestFixture>::mlas_tester(nullptr); - template <> const std::vector MlasQLinearGlobalAveragePoolTest::ZeroPoints = {-128, -110, 1, 103, 127}; diff --git a/onnxruntime/test/mlas/unittest/test_quantizelinear.cpp b/onnxruntime/test/mlas/unittest/test_quantizelinear.cpp index 2832598fef1a9..986d158d2b1b9 100644 --- a/onnxruntime/test/mlas/unittest/test_quantizelinear.cpp +++ 
b/onnxruntime/test/mlas/unittest/test_quantizelinear.cpp @@ -71,15 +71,6 @@ class MlasQuantizeLinearTest : public MlasTestBase { } }; -template <> -MlasQuantizeLinearTest* MlasTestFixture>::mlas_tester(nullptr); -template <> -MlasQuantizeLinearTest* MlasTestFixture>::mlas_tester(nullptr); -template <> -MlasQuantizeLinearTest* MlasTestFixture>::mlas_tester(nullptr); -template <> -MlasQuantizeLinearTest* MlasTestFixture>::mlas_tester(nullptr); - static UNUSED_VARIABLE bool added_to_main = AddTestRegister([](bool is_short_execute) { size_t count = 0; if (is_short_execute) { diff --git a/onnxruntime/test/mlas/unittest/test_reorder_output.cpp b/onnxruntime/test/mlas/unittest/test_reorder_output.cpp index 21373fe9f66e7..e39abd8578da4 100644 --- a/onnxruntime/test/mlas/unittest/test_reorder_output.cpp +++ b/onnxruntime/test/mlas/unittest/test_reorder_output.cpp @@ -88,9 +88,6 @@ class MlasReorderOutputTest : public MlasTestBase { } }; -template <> -MlasReorderOutputTest* MlasTestFixture::mlas_tester(nullptr); - static UNUSED_VARIABLE bool added_to_main = AddTestRegister([](bool is_short_execute) { return (MlasNchwcGetBlockSize() > 1 && is_short_execute) ? MlasDirectShortExecuteTests::RegisterShortExecute() diff --git a/onnxruntime/test/mlas/unittest/test_scaleoutput.cpp b/onnxruntime/test/mlas/unittest/test_scaleoutput.cpp index 7732b1fa8c72e..34f17843b0726 100644 --- a/onnxruntime/test/mlas/unittest/test_scaleoutput.cpp +++ b/onnxruntime/test/mlas/unittest/test_scaleoutput.cpp @@ -77,9 +77,6 @@ class MlasScaleOutputTest : public MlasTestBase { } }; -template <> -MlasScaleOutputTest* MlasTestFixture::mlas_tester(nullptr); - static UNUSED_VARIABLE bool added_to_main = AddTestRegister([](bool is_short_execute) { return is_short_execute ? MlasDirectShortExecuteTests::RegisterShortExecute() : 0; }); diff --git a/onnxruntime/test/mlas/unittest/test_softmax.cpp b/onnxruntime/test/mlas/unittest/test_softmax.cpp index 3df2b88f9652a..4c5e11bbe9566 100644 --- a/onnxruntime/test/mlas/unittest/test_softmax.cpp +++ b/onnxruntime/test/mlas/unittest/test_softmax.cpp @@ -97,11 +97,6 @@ class MlasSoftmaxTest : public MlasTestBase { } }; -template <> -MlasSoftmaxTest* MlasTestFixture>::mlas_tester(nullptr); -template <> -MlasSoftmaxTest* MlasTestFixture>::mlas_tester(nullptr); - static UNUSED_VARIABLE bool added_to_main = AddTestRegister([](bool is_short_execute) { size_t count = 0; if (is_short_execute) { diff --git a/onnxruntime/test/mlas/unittest/test_symm_qgemm.cpp b/onnxruntime/test/mlas/unittest/test_symm_qgemm.cpp index adfe5564ebbbf..bb3aea02cc011 100644 --- a/onnxruntime/test/mlas/unittest/test_symm_qgemm.cpp +++ b/onnxruntime/test/mlas/unittest/test_symm_qgemm.cpp @@ -1,10 +1,5 @@ #include "test_symm_qgemm_fixture.h" -template <> -MlasSymmQgemmTest* MlasTestFixture>::mlas_tester(nullptr); -template <> -MlasSymmQgemmTest* MlasTestFixture>::mlas_tester(nullptr); - static size_t SymmQgemmRegistLongExecute() { if (MlasSymmQgemmPackBSize(16, 16, true) == 0) { return 0; diff --git a/onnxruntime/test/mlas/unittest/test_transpose.cpp b/onnxruntime/test/mlas/unittest/test_transpose.cpp index 74ce5868f411d..8fa98411a21ab 100644 --- a/onnxruntime/test/mlas/unittest/test_transpose.cpp +++ b/onnxruntime/test/mlas/unittest/test_transpose.cpp @@ -45,13 +45,6 @@ class MlasTransposeTest : public MlasTestBase { } }; -template <> -MlasTransposeTest* MlasTestFixture>::mlas_tester(nullptr); -template <> -MlasTransposeTest* MlasTestFixture>::mlas_tester(nullptr); -template <> -MlasTransposeTest* 
MlasTestFixture>::mlas_tester(nullptr); - static UNUSED_VARIABLE bool added_to_main = AddTestRegister([](bool is_short_execute) { size_t count = 0; if (is_short_execute) { diff --git a/onnxruntime/test/mlas/unittest/test_util.h b/onnxruntime/test/mlas/unittest/test_util.h index c5ee8b4b6115a..db528ef7291cc 100644 --- a/onnxruntime/test/mlas/unittest/test_util.h +++ b/onnxruntime/test/mlas/unittest/test_util.h @@ -188,8 +188,7 @@ class MlasTestFixture : public testing::Test { mlas_tester = nullptr; }; - // Do not forgot to define this static member element when upon usage. - static TMlasTester* mlas_tester; + static inline TMlasTester* mlas_tester = nullptr; }; // Long Execute test. It is too heavy to register each single test, treat long execute big groups. diff --git a/onnxruntime/test/optimizer/graph_transform_test.cc b/onnxruntime/test/optimizer/graph_transform_test.cc index a6aa4b946f397..e0f63ea58e772 100755 --- a/onnxruntime/test/optimizer/graph_transform_test.cc +++ b/onnxruntime/test/optimizer/graph_transform_test.cc @@ -32,6 +32,7 @@ #include "core/optimizer/conv_add_fusion.h" #include "core/optimizer/conv_bn_fusion.h" #include "core/optimizer/matmul_bn_fusion.h" +#include "core/optimizer/pad_fusion.h" #include "core/optimizer/conv_mul_fusion.h" #include "core/optimizer/div_mul_fusion.h" #include "core/optimizer/dropout_elimination.h" @@ -1080,6 +1081,163 @@ TEST_F(GraphTransformationTests, FuseConvBNNoBias) { } } +TEST_F(GraphTransformationTests, FusePadWithConv) { + constexpr const ORTCHAR_T* model_uri = MODEL_FOLDER "fusion/fuse-pad-conv.onnx"; + + std::shared_ptr p_model; + ASSERT_STATUS_OK(Model::Load(model_uri, p_model, nullptr, *logger_)); + Graph& graph = p_model->MainGraph(); + + std::vector expected_pads; + GraphViewer graphViewer(graph); + for (auto& node_index : graphViewer.GetNodesInTopologicalOrder()) { + auto& node = *graph.GetNode(node_index); + if (node.OpType() == "Pad") { + const auto* pads_proto = graph_utils::GetConstantInitializer(graph, node.InputDefs()[1]->Name()); + Initializer pads{*pads_proto, graph.ModelPath()}; + gsl::span pads_values = pads.DataAsSpan(); + expected_pads.resize(pads_values.size() - 4); + + for (uint32_t pads_index = 2, index = 0; pads_index < pads_values.size() / 2; pads_index++, index++) { + expected_pads[index] = pads_values[pads_index]; + expected_pads[index + (expected_pads.size() / 2)] = pads_values[pads_index + (pads_values.size() / 2)]; + } + } else if (node.OpType() == "Conv") { + auto child_pads = node.GetMutableAttributes()["pads"].mutable_ints(); + for (uint32_t index = 0; index < expected_pads.size(); index++) { + expected_pads[index] += child_pads->Get(index); + } + } + } + + onnxruntime::GraphTransformerManager graph_transformation_mgr{5}; + auto rule_transformer_L1 = std::make_unique("RuleTransformerL1"); + ASSERT_STATUS_OK(rule_transformer_L1->Register(std::make_unique())); + ASSERT_STATUS_OK(graph_transformation_mgr.Register(std::move(rule_transformer_L1), TransformerLevel::Level1)); + + ASSERT_STATUS_OK(graph_transformation_mgr.ApplyTransformers(graph, TransformerLevel::Level1, *logger_)); + + std::map op_to_count = CountOpsInGraph(graph); + ASSERT_EQ(op_to_count["Pad"], 0); + ASSERT_EQ(op_to_count["Conv"], 1); + + for (auto& node : graph.Nodes()) { + if (node.OpType() == "Conv") { + auto child_pads = node.GetMutableAttributes()["pads"].mutable_ints(); + ASSERT_EQ(child_pads->size(), static_cast(expected_pads.size())) + << "fusion should produce the same size of pads integer as the Conv node"; + for (uint32_t index = 
0; index < expected_pads.size(); index++) { + ASSERT_EQ(expected_pads[index], child_pads->Get(index)) + << "fusion does not produce correct padding value"; + } + } + } +} + +TEST_F(GraphTransformationTests, FusePadWithMaxPool) { + constexpr const ORTCHAR_T* model_uri = MODEL_FOLDER "fusion/fuse-pad-maxpool.onnx"; + + std::shared_ptr p_model; + ASSERT_STATUS_OK(Model::Load(model_uri, p_model, nullptr, *logger_)); + Graph& graph = p_model->MainGraph(); + + std::vector expected_pads; + GraphViewer graphViewer(graph); + for (auto& node_index : graphViewer.GetNodesInTopologicalOrder()) { + auto& node = *graph.GetNode(node_index); + if (node.OpType() == "Pad") { + const auto* pads_proto = graph_utils::GetConstantInitializer(graph, node.InputDefs()[1]->Name()); + Initializer pads{*pads_proto, graph.ModelPath()}; + gsl::span pads_values = pads.DataAsSpan(); + expected_pads.resize(pads_values.size() - 4); + + for (uint32_t pads_index = 2, index = 0; pads_index < pads_values.size() / 2; pads_index++, index++) { + expected_pads[index] = pads_values[pads_index]; + expected_pads[index + (expected_pads.size() / 2)] = pads_values[pads_index + (pads_values.size() / 2)]; + } + } else if (node.OpType() == "MaxPool") { + auto child_pads = node.GetMutableAttributes()["pads"].mutable_ints(); + for (uint32_t index = 0; index < expected_pads.size(); index++) { + expected_pads[index] += child_pads->Get(index); + } + } + } + + onnxruntime::GraphTransformerManager graph_transformation_mgr{5}; + auto rule_transformer_L1 = std::make_unique("RuleTransformerL1"); + ASSERT_STATUS_OK(rule_transformer_L1->Register(std::make_unique())); + ASSERT_STATUS_OK(graph_transformation_mgr.Register(std::move(rule_transformer_L1), TransformerLevel::Level1)); + + ASSERT_STATUS_OK(graph_transformation_mgr.ApplyTransformers(graph, TransformerLevel::Level1, *logger_)); + + std::map op_to_count = CountOpsInGraph(graph); + ASSERT_EQ(op_to_count["Pad"], 0); + ASSERT_EQ(op_to_count["MaxPool"], 1); + + for (auto& node : graph.Nodes()) { + if (node.OpType() == "MaxPool") { + auto child_pads = node.GetMutableAttributes()["pads"].mutable_ints(); + ASSERT_EQ(child_pads->size(), static_cast(expected_pads.size())) + << "fusion should produce the same size of pads integer as the MaxPool node"; + for (uint32_t index = 0; index < expected_pads.size(); index++) { + ASSERT_EQ(expected_pads[index], child_pads->Get(index)) + << "fusion does not produce correct padding value"; + } + } + } +} + +TEST_F(GraphTransformationTests, FusePadWithMaxPoolOpsetLessThan11) { + constexpr const ORTCHAR_T* model_uri = MODEL_FOLDER "fusion/fuse-pad-maxpool-opset8.onnx"; + + std::shared_ptr p_model; + ASSERT_STATUS_OK(Model::Load(model_uri, p_model, nullptr, *logger_)); + Graph& graph = p_model->MainGraph(); + + std::vector expected_pads; + GraphViewer graphViewer(graph); + for (auto& node_index : graphViewer.GetNodesInTopologicalOrder()) { + auto& node = *graph.GetNode(node_index); + if (node.OpType() == "Pad") { + gsl::span pads_values = node.GetAttributes().at("pads").ints(); + expected_pads.resize(pads_values.size() - 4); + + for (uint32_t pads_index = 2, index = 0; pads_index < pads_values.size() / 2; pads_index++, index++) { + expected_pads[index] = pads_values[pads_index]; + expected_pads[index + (expected_pads.size() / 2)] = pads_values[pads_index + (pads_values.size() / 2)]; + } + } else if (node.OpType() == "MaxPool") { + auto child_pads = node.GetMutableAttributes()["pads"].mutable_ints(); + for (uint32_t index = 0; index < expected_pads.size(); index++) { + 
expected_pads[index] += child_pads->Get(index); + } + } + } + + onnxruntime::GraphTransformerManager graph_transformation_mgr{5}; + auto rule_transformer_L1 = std::make_unique("RuleTransformerL1"); + ASSERT_STATUS_OK(rule_transformer_L1->Register(std::make_unique())); + ASSERT_STATUS_OK(graph_transformation_mgr.Register(std::move(rule_transformer_L1), TransformerLevel::Level1)); + + ASSERT_STATUS_OK(graph_transformation_mgr.ApplyTransformers(graph, TransformerLevel::Level1, *logger_)); + + std::map op_to_count = CountOpsInGraph(graph); + ASSERT_EQ(op_to_count["Pad"], 0); + ASSERT_EQ(op_to_count["MaxPool"], 1); + + for (auto& node : graph.Nodes()) { + if (node.OpType() == "MaxPool") { + auto child_pads = node.GetMutableAttributes()["pads"].mutable_ints(); + ASSERT_EQ(child_pads->size(), static_cast(expected_pads.size())) + << "fusion should produce the same size of pads integer as the MaxPool node"; + for (uint32_t index = 0; index < expected_pads.size(); index++) { + ASSERT_EQ(expected_pads[index], child_pads->Get(index)) + << "fusion does not produce correct padding value"; + } + } + } +} + TEST_F(GraphTransformationTests, FuseMatmulBNWithInBetweenNodes) { constexpr const ORTCHAR_T* model_uri = MODEL_FOLDER "fusion/fuse-matmul-bn-with-reshape.onnx"; diff --git a/onnxruntime/test/providers/base_tester.cc b/onnxruntime/test/providers/base_tester.cc index 459a8c71ad611..16cce85f7cb0a 100644 --- a/onnxruntime/test/providers/base_tester.cc +++ b/onnxruntime/test/providers/base_tester.cc @@ -399,6 +399,8 @@ bool SetEpsForAllNodes(Graph& graph, const std::vector>& execution_providers, const std::vector>* custom_registries) { const OpSchemaKernelTypeStrResolver kernel_type_str_resolver{}; + const KernelRegistry::TypeConstraintMap type_constraint_map{}; + for (auto& node : graph.Nodes()) { if (node.OpType() == kConstant) continue; @@ -426,13 +428,28 @@ bool SetEpsForAllNodes(Graph& graph, break; } + // check the internal NHWC domain if EP requests NHWC as it may only have a kernel registered in that domain + if (ep->GetPreferredLayout() == DataLayout::NHWC) { + const KernelCreateInfo* kci = nullptr; + auto status = ep->GetKernelRegistry()->TryFindKernel(ep->Type(), + std::string_view(node.OpType()), + std::string_view(kMSInternalNHWCDomain), + node.SinceVersion(), + type_constraint_map, + &kci); + if (status.IsOK() && kci != nullptr) { + found = true; + break; + } + } + // Check the EP has an impl for the node from custom_registries if (custom_registries != nullptr && std::any_of(custom_registries->cbegin(), custom_registries->cend(), - [&](auto reg) { return KernelRegistry::HasImplementationOf( - *reg->GetKernelRegistry(), - node, ep->Type(), - kernel_type_str_resolver); })) { + [&](auto reg) { + return KernelRegistry::HasImplementationOf(*reg->GetKernelRegistry(), node, ep->Type(), + kernel_type_str_resolver); + })) { found = true; break; } @@ -760,7 +777,7 @@ void BaseTester::ExecuteModelForEps( for (const auto& ep : execution_providers) { providers.append(ep->Type() + " "); } - LOGS_DEFAULT(WARNING) << "registered execution providers " << providers << "were unable to run the model."; + LOGS_DEFAULT(WARNING) << "registered execution providers " << providers << " were unable to run the model."; return; } diff --git a/onnxruntime/test/providers/cuda/nhwc/conv_test.cc b/onnxruntime/test/providers/cuda/nhwc/conv_test.cc index be0082f95feb8..13d4546d669e3 100644 --- a/onnxruntime/test/providers/cuda/nhwc/conv_test.cc +++ b/onnxruntime/test/providers/cuda/nhwc/conv_test.cc @@ -21,7 +21,7 @@ struct 
ConvOp { std::unique_ptr get_test() { RandomValueGenerator random{}; - auto test = std::make_unique("Conv", 7); + auto test = std::make_unique("Conv", 11); // internal NHWC domain starts at opset 11 std::vector input_data = random.Uniform(input_dims, 0.0f, 1.0f); std::vector weight_dims{channels, input_dims[1] / group, kernel_shape[0], kernel_shape[1]}; diff --git a/onnxruntime/test/providers/internal_testing/internal_testing_tests.cc b/onnxruntime/test/providers/internal_testing/internal_testing_tests.cc index f7499fd7ad812..8955a83e66c01 100644 --- a/onnxruntime/test/providers/internal_testing/internal_testing_tests.cc +++ b/onnxruntime/test/providers/internal_testing/internal_testing_tests.cc @@ -203,7 +203,8 @@ TEST(InternalTestingEP, TestMixOfStaticAndCompiledKernels) { } TEST(InternalTestingEP, TestNhwcConversionOfStaticKernels) { - const ORTCHAR_T* ort_model_path = ORT_MODEL_FOLDER "squeezenet/model.onnx"; + // the internal NHWC domain supports opset 11 and later + const ORTCHAR_T* ort_model_path = ORT_MODEL_FOLDER "squeezenet/model_opset11.onnx"; SessionOptions so; // set this if you want to manually inspect the optimized model diff --git a/onnxruntime/test/providers/qnn/pad_op_test.cpp b/onnxruntime/test/providers/qnn/pad_op_test.cpp index e92f0ae770a88..792dbeadfa758 100644 --- a/onnxruntime/test/providers/qnn/pad_op_test.cpp +++ b/onnxruntime/test/providers/qnn/pad_op_test.cpp @@ -167,7 +167,7 @@ TEST_F(QnnCPUBackendTests, Pad2dPadsNotIni) { TEST_F(QnnCPUBackendTests, DISABLED_PadModeReflect) { bool has_constant_value = false; RunPadOpTest(TestInputDef({3, 2}, false, {1.0f, 1.2f, 2.3f, 3.4f, 4.5f, 5.6f}), - TestInputDef({4}, true, {0, 2, 0, 0}), + TestInputDef({4}, true, {0, 1, 0, 0}), TestInputDef({1}, true, {0.0f}), {utils::MakeAttribute("mode", "reflect")}, ExpectedEPNodeAssignment::All, @@ -266,13 +266,37 @@ TEST_F(QnnHTPBackendTests, PadHasConstantValueQuantized) { constant_value_quantized); } -// QNN graph execute error. 
Error code: 6031 -TEST_F(QnnHTPBackendTests, DISABLED_PadReflectMode) { +TEST_F(QnnHTPBackendTests, PadReflectMode) { + bool has_constant_value_input = false; + RunQDQPadOpTest(TestInputDef({3, 2}, false, {1.0f, 1.2f, 2.3f, 3.4f, 4.5f, 5.6f}), + TestInputDef({4}, true, {0, 1, 0, 0}), + TestInputDef({1}, true, {0.0f}), + {utils::MakeAttribute("mode", "reflect")}, + ExpectedEPNodeAssignment::All, + has_constant_value_input); +} + +// Pad amount should not be greater than shape(input[0])[i] - 1 +TEST_F(QnnHTPBackendTests, PadReflectModeOutOfRangePadAmount) { bool has_constant_value_input = false; RunQDQPadOpTest(TestInputDef({3, 2}, false, {1.0f, 1.2f, 2.3f, 3.4f, 4.5f, 5.6f}), TestInputDef({4}, true, {0, 2, 0, 0}), TestInputDef({1}, true, {0.0f}), {utils::MakeAttribute("mode", "reflect")}, + ExpectedEPNodeAssignment::None, + has_constant_value_input); +} + +TEST_F(QnnHTPBackendTests, Pad4dReflectMode) { + bool has_constant_value_input = false; + RunQDQPadOpTest(TestInputDef({1, 2, 2, 2}, false, + {1.0f, 2.0f, + 3.0f, 4.0f, + 5.0f, 6.0f, + 7.0f, 8.0f}), + TestInputDef({8}, true, {0, 1, 1, 1, 0, 1, 1, 1}), + TestInputDef({1}, true, {0.0f}), + {utils::MakeAttribute("mode", "reflect")}, ExpectedEPNodeAssignment::All, has_constant_value_input); } diff --git a/onnxruntime/test/python/quantization/test_op_gemm.py b/onnxruntime/test/python/quantization/test_op_gemm.py index 54ef1cc1d5446..bac0f6d48e9fc 100644 --- a/onnxruntime/test/python/quantization/test_op_gemm.py +++ b/onnxruntime/test/python/quantization/test_op_gemm.py @@ -192,24 +192,9 @@ def static_quant_test( check_qtype_by_node_type(self, model_int8_path, qnode_io_qtypes) data_reader.rewind() if activation_type_str == "f8e4m3fn" and weight_type_str == "f8e4m3fn": - # QGemm is not implemented for CPU. - try: - check_model_correctness( - self, - model_fp32_path, - model_int8_path, - data_reader.get_next(), - providers=["CUDAExecutionProvider", "CPUExecutionProvider"], - is_gemm=True, - ) - except Exception as e: - if ( - "Type 'tensor(float8e4m3fn)' of input parameter (input_quantized) of operator (QGemm) in node () is invalid." - in str(e) - ): - warnings.warn("Fix this test when QGemm is implemented.") - return - raise e + # QGemm for float 8 is not implemented. The test should be updated when it is. 
+ warnings.warn("Fix this test when QGemm is implemented for float 8 types.") + return else: check_model_correctness(self, model_fp32_path, model_int8_path, data_reader.get_next(), is_gemm=True) diff --git a/onnxruntime/test/python/quantization/test_quantizeblockwise_4bits.py b/onnxruntime/test/python/quantization/test_quantizeblockwise_4bits.py index e03a0167d070a..765825d4b86e3 100644 --- a/onnxruntime/test/python/quantization/test_quantizeblockwise_4bits.py +++ b/onnxruntime/test/python/quantization/test_quantizeblockwise_4bits.py @@ -38,8 +38,8 @@ def quantize_blockwise_4bits_ref(matrix_float: npt.ArrayLike, block_size: int, i matrix_float_padded = np.pad(matrix_float, ((0, pad_len), (0, 0)), "constant") packed = np.zeros((cols, k_blocks, blob_size), dtype="uint8") - scales = np.zeros((cols * k_blocks), dtype=matrix_float_padded.dtype) - zero_point = np.full((cols * k_blocks + 1) // 2, 136, dtype="uint8") + scales = np.zeros((cols, k_blocks), dtype=matrix_float_padded.dtype) + zero_point = np.full((cols, (k_blocks + 1) // 2), 136, dtype="uint8") matrix_float_padded = np.transpose(matrix_float_padded) for n in range(cols): @@ -61,10 +61,12 @@ def quantize_blockwise_4bits_ref(matrix_float: npt.ArrayLike, block_size: int, i zp = min(15, max(0, round(zero_point_fp))) reciprocal_scale = 1.0 / scale if scale != 0 else 0.0 - block_idx = n * k_blocks + k_id // block_size - scales[block_idx] = scale - zp_pair = zero_point[block_idx // 2] - zero_point[block_idx // 2] = ((zp_pair & 0x0F) | (zp << 4)) if (block_idx & 1) else ((zp_pair & 0xF0) | zp) + block_idx = k_id // block_size + scales[n, block_idx] = scale + zp_pair = zero_point[n, block_idx // 2] + zero_point[n, block_idx // 2] = ( + ((zp_pair & 0x0F) | (zp << 4)) if (block_idx & 1) else ((zp_pair & 0xF0) | zp) + ) blk_int0 = np.clip( np.round(np.float32(matrix_float_padded[n, k_id : k_id + block_size : 2] * reciprocal_scale + zp)), @@ -76,7 +78,7 @@ def quantize_blockwise_4bits_ref(matrix_float: npt.ArrayLike, block_size: int, i 0, 15, ).astype("uint8") - packed[n, k_id // block_size] = np.bitwise_or(blk_int0, np.left_shift(blk_int1, 4)) + packed[n, block_idx] = np.bitwise_or(blk_int0, np.left_shift(blk_int1, 4)) return (packed, scales, zero_point) @@ -88,8 +90,8 @@ def quantize_blockwise_4bits_target(matrix_float: npt.ArrayLike, block_size: int k_blocks = (rows + block_size - 1) // block_size packed = np.zeros((cols, k_blocks, block_size // 2), dtype="uint8") - scales = np.zeros((cols * k_blocks), dtype=matrix_float.dtype) - zero_point = np.full((cols * k_blocks + 1) // 2, 136, dtype="uint8") + scales = np.zeros((cols, k_blocks), dtype=matrix_float.dtype) + zero_point = np.full((cols, (k_blocks + 1) // 2), 136, dtype="uint8") from onnxruntime.capi._pybind_state import quantize_matmul_4bits quantize_matmul_4bits(packed, matrix_float, scales, zero_point, block_size, cols, rows, is_symmetric) @@ -116,24 +118,22 @@ def test_quantize_blockwise_4bits(self): assert np.allclose(zero_point_ref, zero_point) for c in range(quant_value_ref.shape[0]): for k in range(quant_value_ref.shape[1]): - block_idx = c * quant_value_ref.shape[1] + k - zp_idx = block_idx // 2 assert np.allclose( dequantize_blockwise_4bits( - quant_value_ref[c][k], - scales_ref[block_idx], - (zero_point_ref[zp_idx] >> 4) - if (block_idx & 1) - else (zero_point_ref[zp_idx] & 0x0F), + quant_value_ref[c, k], + scales_ref[c, k], + (zero_point_ref[c, k // 2] >> 4) + if (k & 1) + else (zero_point_ref[c, k // 2] & 0x0F), min(block_size, rows - k * block_size), ), 
dequantize_blockwise_4bits( - quant_value[c][k], - scales[block_idx], - (zero_point[zp_idx] >> 4) if (block_idx & 1) else (zero_point[zp_idx] & 0x0F), + quant_value[c, k], + scales[c, k], + (zero_point[c, k // 2] >> 4) if (k & 1) else (zero_point[c, k // 2] & 0x0F), min(block_size, rows - k * block_size), ), - atol=1.2 * abs(scales[block_idx]), + atol=1.2 * abs(scales[c, k]), ) diff --git a/onnxruntime/test/shared_lib/test_inference.cc b/onnxruntime/test/shared_lib/test_inference.cc index 33d50f90333cf..7dee0bc41a6f3 100644 --- a/onnxruntime/test/shared_lib/test_inference.cc +++ b/onnxruntime/test/shared_lib/test_inference.cc @@ -2832,6 +2832,58 @@ TEST(CApiTest, ConfigureCudaArenaAndDemonstrateMemoryArenaShrinkage) { #endif #ifdef USE_TENSORRT +TEST(TensorrtExecutionProviderTest, ShapeTensorTest) { + const auto& api = Ort::GetApi(); + + // Test input tensor which is shape tensor with explicit trt profile shapes + Ort::SessionOptions session_options; + OrtTensorRTProviderOptionsV2* trt_options; + ASSERT_TRUE(api.CreateTensorRTProviderOptions(&trt_options) == nullptr); + std::unique_ptr + rel_trt_options(trt_options, api.ReleaseTensorRTProviderOptions); + + const char* trt_profile_min_shapes = "data:2x2,shape:4x1"; + const char* trt_profile_max_shapes = "data:2x2,shape:4x1"; + const char* trt_profile_opt_shapes = "data:2x2,shape:4x1"; + std::vector keys{"trt_profile_min_shapes", "trt_profile_max_shapes", "trt_profile_opt_shapes"}; + std::vector values{trt_profile_min_shapes, trt_profile_max_shapes, trt_profile_opt_shapes}; + ASSERT_TRUE(api.UpdateTensorRTProviderOptions(rel_trt_options.get(), keys.data(), values.data(), keys.size()) == nullptr); + ASSERT_TRUE(api.SessionOptionsAppendExecutionProvider_TensorRT_V2( + static_cast(session_options), + rel_trt_options.get()) == nullptr); + + auto model_path = ORT_TSTR("testdata/trt_reshape.onnx"); + + std::vector input_value_0{1.1f, 1.2f, 1.3f, 1.4f}; + std::vector input_shape_0{2, 2}; + std::vector input_value_1{4, 1}; + std::vector input_shape_1{2}; + + std::vector input_names{"data", "shape"}; + Ort::MemoryInfo info("Cpu", OrtDeviceAllocator, 0, OrtMemTypeDefault); + + std::vector ort_inputs; + ort_inputs.emplace_back(Ort::Value::CreateTensor(info, input_value_0.data(), input_value_0.size(), input_shape_0.data(), input_shape_0.size())); + ort_inputs.emplace_back(Ort::Value::CreateTensor(info, input_value_1.data(), input_value_1.size(), input_shape_1.data(), input_shape_1.size())); + + const char* output_names[] = {"reshaped"}; + + Ort::Session session(*ort_env, model_path, session_options); + session.Run(Ort::RunOptions{}, input_names.data(), ort_inputs.data(), ort_inputs.size(), output_names, countof(output_names)); + + // Test input tensor which is shape tensor with implicit trt profile shapes + Ort::SessionOptions session_options_2; + OrtTensorRTProviderOptionsV2* trt_options_2; + ASSERT_TRUE(api.CreateTensorRTProviderOptions(&trt_options_2) == nullptr); + std::unique_ptr + rel_trt_options_2(trt_options_2, api.ReleaseTensorRTProviderOptions); + ASSERT_TRUE(api.SessionOptionsAppendExecutionProvider_TensorRT_V2( + static_cast(session_options_2), + rel_trt_options_2.get()) == nullptr); + Ort::Session session_2(*ort_env, model_path, session_options_2); + session_2.Run(Ort::RunOptions{}, input_names.data(), ort_inputs.data(), ort_inputs.size(), output_names, countof(output_names)); +} + TEST(CApiTest, TestExternalCUDAStreamWithIOBinding) { const auto& api = Ort::GetApi(); Ort::SessionOptions session_options; diff --git 
a/onnxruntime/test/testdata/squeezenet/model_opset11.onnx b/onnxruntime/test/testdata/squeezenet/model_opset11.onnx new file mode 100644 index 0000000000000..dcf322a58c042 Binary files /dev/null and b/onnxruntime/test/testdata/squeezenet/model_opset11.onnx differ diff --git a/onnxruntime/test/testdata/transform/fusion/fuse-pad-conv.onnx b/onnxruntime/test/testdata/transform/fusion/fuse-pad-conv.onnx new file mode 100644 index 0000000000000..ced1950005985 Binary files /dev/null and b/onnxruntime/test/testdata/transform/fusion/fuse-pad-conv.onnx differ diff --git a/onnxruntime/test/testdata/transform/fusion/fuse-pad-maxpool-opset8.onnx b/onnxruntime/test/testdata/transform/fusion/fuse-pad-maxpool-opset8.onnx new file mode 100644 index 0000000000000..feb1f024ceed7 Binary files /dev/null and b/onnxruntime/test/testdata/transform/fusion/fuse-pad-maxpool-opset8.onnx differ diff --git a/onnxruntime/test/testdata/transform/fusion/fuse-pad-maxpool.onnx b/onnxruntime/test/testdata/transform/fusion/fuse-pad-maxpool.onnx new file mode 100644 index 0000000000000..32e959262f6b5 Binary files /dev/null and b/onnxruntime/test/testdata/transform/fusion/fuse-pad-maxpool.onnx differ diff --git a/onnxruntime/test/testdata/trt_reshape.onnx b/onnxruntime/test/testdata/trt_reshape.onnx new file mode 100644 index 0000000000000..7d195af2ae204 --- /dev/null +++ b/onnxruntime/test/testdata/trt_reshape.onnx @@ -0,0 +1,16 @@ + :‰ +) +data +shapereshapedReshape"Reshapetrt_engine_wrapperZ +data +  +N +Z +shape + + +b +reshaped +  + +B \ No newline at end of file diff --git a/onnxruntime/test/testdata/trt_reshape_test.py b/onnxruntime/test/testdata/trt_reshape_test.py new file mode 100644 index 0000000000000..42777bd3d50c7 --- /dev/null +++ b/onnxruntime/test/testdata/trt_reshape_test.py @@ -0,0 +1,42 @@ +#!/usr/bin/env python3 +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. 
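For context, a minimal sketch of how the trt_reshape.onnx model produced by the generator script below can be driven at run time. This is not part of the patch: it assumes onnxruntime is installed, that trt_reshape.onnx is in the working directory, and it uses the CPU execution provider for simplicity (the C++ ShapeTensorTest above exercises the same model through the TensorRT EP with explicit profile shapes such as "data:2x2,shape:4x1").

    import numpy as np
    import onnxruntime as ort

    # Load the generated Reshape model; "shape" is a shape tensor input.
    sess = ort.InferenceSession("trt_reshape.onnx", providers=["CPUExecutionProvider"])

    data = np.array([[1.1, 1.2], [1.3, 1.4]], dtype=np.float32)  # input shape (2, 2)
    shape = np.array([4, 1], dtype=np.int64)                     # requested output shape

    (reshaped,) = sess.run(["reshaped"], {"data": data, "shape": shape})
    print(reshaped.shape)  # (4, 1)
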
+ +import onnx +from onnx import TensorProto, helper + + +def generate_model(model_name): + nodes = [ + helper.make_node( + "Reshape", + ["data", "shape"], + ["reshaped"], + "Reshape", + ), + ] + + graph = helper.make_graph( + nodes, + "trt_engine_wrapper", + [ # input + helper.make_tensor_value_info("data", TensorProto.FLOAT, ["N", 2]), + helper.make_tensor_value_info( + "shape", + TensorProto.INT64, + [ + 2, + ], + ), + ], + [ # output + helper.make_tensor_value_info("reshaped", TensorProto.FLOAT, [4, 1]), + ], + ) + + model = helper.make_model(graph) + onnx.save(model, model_name) + + +if __name__ == "__main__": + generate_model("trt_reshape.onnx") diff --git a/tools/ci_build/build.py b/tools/ci_build/build.py index a992da8ff993e..25d69ef3a1eb5 100644 --- a/tools/ci_build/build.py +++ b/tools/ci_build/build.py @@ -796,6 +796,7 @@ def run_subprocess( my_env.update(env) + log.info(" ".join(args)) return run(*args, cwd=cwd, capture_stdout=capture_stdout, shell=shell, env=my_env) @@ -2024,13 +2025,6 @@ def build_python_wheel( run_subprocess(args, cwd=cwd) -def derive_linux_build_property(): - if is_windows(): - return '/p:IsLinuxBuild="false"' - else: - return '/p:IsLinuxBuild="true"' - - def build_nuget_package( cmake_path, source_dir, @@ -2043,7 +2037,6 @@ def build_nuget_package( use_dnnl, use_tvm, use_winml, - use_snpe, use_qnn, enable_training_apis, msbuild_extra_options, @@ -2054,83 +2047,93 @@ def build_nuget_package( ) csharp_build_dir = os.path.join(source_dir, "csharp") - is_linux_build = derive_linux_build_property() # in most cases we don't want/need to include the Xamarin mobile targets, as doing so means the Xamarin # mobile workloads must be installed on the machine. # they are only included in the Microsoft.ML.OnnxRuntime nuget package sln = "OnnxRuntime.DesktopOnly.CSharp.sln" + have_exclude_mobile_targets_option = "IncludeMobileTargets=false" in msbuild_extra_options # derive package name and execution provider based on the build args target_name = "/t:CreatePackage" - execution_provider = '/p:ExecutionProvider="None"' - package_name = '/p:OrtPackageId="Microsoft.ML.OnnxRuntime"' - enable_training_tests = '/p:TrainingEnabledNativeBuild="false"' + execution_provider = "/p:ExecutionProvider=None" + package_name = "/p:OrtPackageId=Microsoft.ML.OnnxRuntime" + enable_training_tests = "/p:TrainingEnabledNativeBuild=false" + if enable_training_apis: - enable_training_tests = '/p:TrainingEnabledNativeBuild="true"' + enable_training_tests = "/p:TrainingEnabledNativeBuild=true" if use_cuda: - package_name = '/p:OrtPackageId="Microsoft.ML.OnnxRuntime.Training.Gpu"' + package_name = "/p:OrtPackageId=Microsoft.ML.OnnxRuntime.Training.Gpu" else: - package_name = '/p:OrtPackageId="Microsoft.ML.OnnxRuntime.Training"' + package_name = "/p:OrtPackageId=Microsoft.ML.OnnxRuntime.Training" elif use_winml: - package_name = '/p:OrtPackageId="Microsoft.AI.MachineLearning"' + package_name = "/p:OrtPackageId=Microsoft.AI.MachineLearning" target_name = "/t:CreateWindowsAIPackage" elif use_openvino: - execution_provider = '/p:ExecutionProvider="openvino"' - package_name = '/p:OrtPackageId="Microsoft.ML.OnnxRuntime.OpenVino"' + execution_provider = "/p:ExecutionProvider=openvino" + package_name = "/p:OrtPackageId=Microsoft.ML.OnnxRuntime.OpenVino" elif use_tensorrt: - execution_provider = '/p:ExecutionProvider="tensorrt"' - package_name = '/p:OrtPackageId="Microsoft.ML.OnnxRuntime.TensorRT"' + execution_provider = "/p:ExecutionProvider=tensorrt" + package_name = 
"/p:OrtPackageId=Microsoft.ML.OnnxRuntime.TensorRT" elif use_dnnl: - execution_provider = '/p:ExecutionProvider="dnnl"' - package_name = '/p:OrtPackageId="Microsoft.ML.OnnxRuntime.DNNL"' + execution_provider = "/p:ExecutionProvider=dnnl" + package_name = "/p:OrtPackageId=Microsoft.ML.OnnxRuntime.DNNL" elif use_cuda: - package_name = '/p:OrtPackageId="Microsoft.ML.OnnxRuntime.Gpu"' + package_name = "/p:OrtPackageId=Microsoft.ML.OnnxRuntime.Gpu" elif use_rocm: - package_name = '/p:OrtPackageId="Microsoft.ML.OnnxRuntime.ROCm"' + package_name = "/p:OrtPackageId=Microsoft.ML.OnnxRuntime.ROCm" elif use_tvm: - execution_provider = '/p:ExecutionProvider="tvm"' - package_name = '/p:OrtPackageId="Microsoft.ML.OnnxRuntime.Tvm"' - elif use_snpe: - execution_provider = '/p:ExecutionProvider="snpe"' - package_name = '/p:OrtPackageId="Microsoft.ML.OnnxRuntime.Snpe"' + execution_provider = "/p:ExecutionProvider=tvm" + package_name = "/p:OrtPackageId=Microsoft.ML.OnnxRuntime.Tvm" elif use_qnn: - execution_provider = '/p:ExecutionProvider="qnn"' - package_name = '/p:OrtPackageId="Microsoft.ML.OnnxRuntime.QNN"' + execution_provider = "/p:ExecutionProvider=qnn" + package_name = "/p:OrtPackageId=Microsoft.ML.OnnxRuntime.QNN" elif any(map(lambda x: "OrtPackageId=" in x, msbuild_extra_options)): pass else: - # use the solution file that includes Xamarin mobile targets - sln = "OnnxRuntime.CSharp.sln" + # we currently only allow building with mobile targets on Windows. + # it should be possible to allow building with android targets on Linux but that requires updating the + # csproj to separate the inclusion of ios and android targets. + if is_windows() and have_exclude_mobile_targets_option is False: + # use the sln that include the mobile targets + sln = "OnnxRuntime.CSharp.sln" + + # explicitly exclude mobile targets in this case + if sln != "OnnxRuntime.CSharp.sln" and have_exclude_mobile_targets_option is False: + msbuild_extra_options.append("IncludeMobileTargets=false") + + # expand extra_options to add prefix + extra_options = ["/p:" + option for option in msbuild_extra_options] + + # we have to use msbuild directly if including Xamarin targets as dotnet only supports MAUI (.net6) + use_dotnet = sln != "OnnxRuntime.CSharp.sln" + + if use_dotnet: + cmd_args = ["dotnet", "restore", sln, "--configfile", "NuGet.CSharp.config", *extra_options] + else: + cmd_args = ["msbuild", sln, "/t:restore", "/p:RestoreConfigFile=NuGet.CSharp.config", *extra_options] # set build directory based on build_dir arg native_dir = os.path.normpath(os.path.join(source_dir, build_dir)) - ort_build_dir = '/p:OnnxRuntimeBuildDirectory="' + native_dir + '"' + ort_build_dir = "/p:OnnxRuntimeBuildDirectory=" + native_dir - # dotnet restore - cmd_args = ["dotnet", "restore", sln, "--configfile", "NuGet.CSharp.config"] run_subprocess(cmd_args, cwd=csharp_build_dir) # build csharp bindings and create nuget package for each config for config in configs: - if is_linux(): - native_build_dir = os.path.join(native_dir, config) - cmd_args = [cmake_path, "-DCMAKE_INSTALL_PREFIX=./nuget-staging/usr/local", "-Pcmake_install.cmake"] - run_subprocess(cmd_args, cwd=native_build_dir) - - configuration = '/p:Configuration="' + config + '"' - + configuration = "/p:Configuration=" + config if not use_winml: - cmd_args = [ - "dotnet", + cmd_args = ["dotnet"] if use_dotnet else [] + cmd_args += [ "msbuild", sln, configuration, package_name, - is_linux_build, ort_build_dir, enable_training_tests, + *extra_options, ] + run_subprocess(cmd_args, 
cwd=csharp_build_dir) else: winml_interop_dir = os.path.join(source_dir, "csharp", "src", "Microsoft.AI.MachineLearning.Interop") @@ -2141,7 +2144,7 @@ def build_nuget_package( "msbuild", winml_interop_project, configuration, - '/p:Platform="Any CPU"', + "/p:Platform=Any CPU", ort_build_dir, "-restore", ] @@ -2155,26 +2158,28 @@ def build_nuget_package( # this path is setup by cmake/nuget_helpers.cmake for MSVC on Windows nuget_exe = os.path.normpath(os.path.join(native_dir, config, "nuget_exe", "src", "nuget.exe")) else: - # user needs to make sure nuget is installed and can be found - nuget_exe = "nuget" + # `dotnet pack` is used on Linux + nuget_exe = "NugetExe_not_set" nuget_exe_arg = '/p:NugetExe="' + nuget_exe + '"' - cmd_args = [ - "dotnet", + cmd_args = ["dotnet"] if use_dotnet else [] + cmd_args += [ "msbuild", "OnnxRuntime.CSharp.proj", target_name, package_name, configuration, execution_provider, - is_linux_build, ort_build_dir, nuget_exe_arg, + *extra_options, ] - cmd_args.extend(msbuild_extra_options) + run_subprocess(cmd_args, cwd=csharp_build_dir) + log.info(f"nuget package was created in the {config} build output directory.") + def run_csharp_tests(source_dir, build_dir, use_cuda, use_openvino, use_tensorrt, use_dnnl, enable_training_apis): # Currently only running tests on windows. @@ -2637,6 +2642,7 @@ def main(): enable_training_apis=args.enable_training_apis, enable_rocm_profiling=args.enable_rocm_profiling, ) + if args.build_nuget: build_nuget_package( cmake_path, @@ -2650,7 +2656,6 @@ def main(): args.use_dnnl, args.use_tvm, args.use_winml, - args.use_snpe, args.use_qnn, args.enable_training_apis, normalize_arg_list(args.msbuild_extra_options), diff --git a/tools/ci_build/github/azure-pipelines/c-api-noopenmp-packaging-pipelines.yml b/tools/ci_build/github/azure-pipelines/c-api-noopenmp-packaging-pipelines.yml index 14a9bbedf09a0..ac07d8c525372 100644 --- a/tools/ci_build/github/azure-pipelines/c-api-noopenmp-packaging-pipelines.yml +++ b/tools/ci_build/github/azure-pipelines/c-api-noopenmp-packaging-pipelines.yml @@ -716,44 +716,29 @@ stages: versionSpec: 6.2.1 - task: PowerShell@2 - displayName: Install .NET 6 workloads + displayName: Install mobile workloads inputs: targetType: 'inline' script: | - dotnet workload install android ios macos - workingDirectory: '$(Build.SourcesDirectory)\csharp' - - - task: PowerShell@2 - displayName: Build .NET 6 targets using dotnet - inputs: - targetType: 'inline' - # we don't specify 'Any CPU' as the platform here because if we do it gets added to the output path - # e.g. csharp\src\Microsoft.ML.OnnxRuntime\bin\Any CPU\RelWithDebInfo\net6.0-ios\ - # which is inconsistent with the msbuild output path for the pre-.net6 targets - # e.g. csharp\src\Microsoft.ML.OnnxRuntime\bin\RelWithDebInfo\monoandroid11.0 - # and makes it harder to do the packing - # - # 'Any CPU' is the default (first 'mixed' platform specified in the csproj) so this should be fine. 
- script: | - dotnet build .\src\Microsoft.ML.OnnxRuntime\Microsoft.ML.OnnxRuntime.csproj -p:SelectedTargets=Net6 -p:Configuration=RelWithDebInfo -p:OnnxRuntimeBuildDirectory="$(Build.BinariesDirectory)" -p:OrtPackageId="Microsoft.ML.OnnxRuntime.Gpu" -p:IsReleaseBuild=${{ parameters.IsReleaseBuild }} -p:ReleaseVersionSuffix=$(ReleaseVersionSuffix) + dotnet workload install android ios workingDirectory: '$(Build.SourcesDirectory)\csharp' - task: MSBuild@1 - displayName: 'Restore NuGet Packages and create project.assets.json for pre-.net6 targets' + displayName: 'Restore NuGet Packages and create project.assets.json' inputs: solution: '$(Build.SourcesDirectory)\csharp\OnnxRuntime.CSharp.sln' platform: 'Any CPU' configuration: RelWithDebInfo - msbuildArguments: '-t:restore -p:SelectedTargets=PreNet6 -p:OrtPackageId="Microsoft.ML.OnnxRuntime.Gpu"' + msbuildArguments: '-t:restore -p:OrtPackageId="Microsoft.ML.OnnxRuntime.Gpu"' workingDirectory: '$(Build.SourcesDirectory)\csharp' - task: MSBuild@1 - displayName: 'Build C# for pre-.net6 targets' + displayName: 'Build C# bindings' inputs: solution: '$(Build.SourcesDirectory)\csharp\OnnxRuntime.CSharp.sln' configuration: RelWithDebInfo platform: 'Any CPU' - msbuildArguments: '-p:SelectedTargets=PreNet6 -p:OnnxRuntimeBuildDirectory="$(Build.BinariesDirectory)" -p:OrtPackageId="Microsoft.ML.OnnxRuntime.Gpu" -p:IsReleaseBuild=${{ parameters.IsReleaseBuild }} -p:ReleaseVersionSuffix=$(ReleaseVersionSuffix)' + msbuildArguments: '-p:OnnxRuntimeBuildDirectory="$(Build.BinariesDirectory)" -p:OrtPackageId="Microsoft.ML.OnnxRuntime.Gpu" -p:IsReleaseBuild=${{ parameters.IsReleaseBuild }} -p:ReleaseVersionSuffix=$(ReleaseVersionSuffix)' workingDirectory: '$(Build.SourcesDirectory)\csharp' - template: templates/win-esrp-dll.yml @@ -762,15 +747,6 @@ stages: DisplayName: 'ESRP - Sign C# dlls' DoEsrp: ${{ parameters.DoEsrp }} - - task: MSBuild@1 - displayName: Update projects.assets.json with combined list of all target frameworks - inputs: - solution: '$(Build.SourcesDirectory)\csharp\src\Microsoft.ML.OnnxRuntime\Microsoft.ML.OnnxRuntime.csproj' - platform: 'Any CPU' - configuration: RelWithDebInfo - msbuildArguments: '-t:restore -p:SelectedTargets=All -p:OrtPackageId=Microsoft.ML.OnnxRuntime.Gpu' - workingDirectory: '$(Build.SourcesDirectory)\csharp' - - task: MSBuild@1 displayName: 'Build Nuget Packages' inputs: diff --git a/tools/ci_build/github/azure-pipelines/nuget/templates/dml-vs-2022.yml b/tools/ci_build/github/azure-pipelines/nuget/templates/dml-vs-2022.yml index 81e8d67b79021..2d92108efb46d 100644 --- a/tools/ci_build/github/azure-pipelines/nuget/templates/dml-vs-2022.yml +++ b/tools/ci_build/github/azure-pipelines/nuget/templates/dml-vs-2022.yml @@ -137,7 +137,7 @@ stages: - task: MSBuild@1 displayName: 'Restore NuGet Packages' inputs: - solution: '$(Build.SourcesDirectory)\csharp\OnnxRuntime.CSharp.sln' + solution: '$(Build.SourcesDirectory)\csharp\OnnxRuntime.DesktopOnly.CSharp.sln' platform: 'Any CPU' configuration: '$(BuildConfig)' msbuildArguments: '-t:restore -p:OrtPackageId=${{ parameters.OrtPackageId }}' @@ -146,7 +146,7 @@ stages: - task: MSBuild@1 displayName: 'Build C#' inputs: - solution: '$(Build.SourcesDirectory)\csharp\OnnxRuntime.CSharp.sln' + solution: '$(Build.SourcesDirectory)\csharp\OnnxRuntime.DesktopOnly.CSharp.sln' configuration: '$(BuildConfig)' platform: 'Any CPU' msbuildArguments: '-p:OnnxRuntimeBuildDirectory="$(Build.BinariesDirectory)" -p:OrtPackageId=${{ parameters.OrtPackageId }} -p:IsReleaseBuild=${{ 
parameters.IsReleaseBuild }}' diff --git a/tools/ci_build/github/azure-pipelines/templates/c-api-cpu.yml b/tools/ci_build/github/azure-pipelines/templates/c-api-cpu.yml index af245c99700ec..4ce39ecc35bfb 100644 --- a/tools/ci_build/github/azure-pipelines/templates/c-api-cpu.yml +++ b/tools/ci_build/github/azure-pipelines/templates/c-api-cpu.yml @@ -398,44 +398,29 @@ stages: versionSpec: 6.2.1 - task: PowerShell@2 - displayName: Install .NET 6 workloads + displayName: Install mobile workloads inputs: targetType: 'inline' script: | - dotnet workload install android ios macos - workingDirectory: '$(Build.SourcesDirectory)\csharp' - - - task: PowerShell@2 - displayName: Build Microsoft.ML.OnnxRuntime .NET 6 targets using dotnet - inputs: - targetType: 'inline' - # we don't specify 'Any CPU' as the platform here because if we do it gets added to the output path - # e.g. csharp\src\Microsoft.ML.OnnxRuntime\bin\Any CPU\RelWithDebInfo\net6.0-ios\ - # which is inconsistent with the msbuild output path for the pre-.net6 targets - # e.g. csharp\src\Microsoft.ML.OnnxRuntime\bin\RelWithDebInfo\monoandroid11.0 - # and makes it harder to do the packing - # - # 'Any CPU' is the default (first 'mixed' platform specified in the csproj) so this should be fine. - script: | - dotnet build .\src\Microsoft.ML.OnnxRuntime\Microsoft.ML.OnnxRuntime.csproj -p:SelectedTargets=Net6 -p:Configuration=RelWithDebInfo -p:OnnxRuntimeBuildDirectory="$(Build.BinariesDirectory)" -p:OrtPackageId=$(OrtPackageId) -p:IsReleaseBuild=${{ parameters.IsReleaseBuild }} -p:ReleaseVersionSuffix=$(ReleaseVersionSuffix) + dotnet workload install android ios workingDirectory: '$(Build.SourcesDirectory)\csharp' - task: MSBuild@1 - displayName: 'Restore NuGet Packages and create project.assets.json for pre-.net6 targets' + displayName: 'Restore NuGet Packages and create project.assets.json' inputs: solution: '$(Build.SourcesDirectory)\csharp\OnnxRuntime.CSharp.sln' platform: 'Any CPU' configuration: RelWithDebInfo - msbuildArguments: '-t:restore -p:SelectedTargets=PreNet6 -p:OrtPackageId=$(OrtPackageId)' + msbuildArguments: '-t:restore -p:OrtPackageId=$(OrtPackageId)' workingDirectory: '$(Build.SourcesDirectory)\csharp' - task: MSBuild@1 - displayName: 'Build C# for pre-.net6 targets' + displayName: 'Build C# bindings' inputs: solution: '$(Build.SourcesDirectory)\csharp\OnnxRuntime.CSharp.sln' platform: 'Any CPU' configuration: RelWithDebInfo - msbuildArguments: '-p:SelectedTargets=PreNet6 -p:OnnxRuntimeBuildDirectory="$(Build.BinariesDirectory)" -p:OrtPackageId=$(OrtPackageId) -p:IsReleaseBuild=${{ parameters.IsReleaseBuild }} -p:ReleaseVersionSuffix=$(ReleaseVersionSuffix)' + msbuildArguments: '-p:OnnxRuntimeBuildDirectory="$(Build.BinariesDirectory)" -p:OrtPackageId=$(OrtPackageId) -p:IsReleaseBuild=${{ parameters.IsReleaseBuild }} -p:ReleaseVersionSuffix=$(ReleaseVersionSuffix)' workingDirectory: '$(Build.SourcesDirectory)\csharp' - ${{ if eq(parameters.DoEsrp, true) }}: @@ -445,15 +430,6 @@ stages: DisplayName: 'ESRP - Sign C# dlls' DoEsrp: ${{ parameters.DoEsrp }} - - task: MSBuild@1 - displayName: Update projects.assets.json with combined list of all target frameworks - inputs: - solution: '$(Build.SourcesDirectory)\csharp\src\Microsoft.ML.OnnxRuntime\Microsoft.ML.OnnxRuntime.csproj' - platform: 'Any CPU' - configuration: RelWithDebInfo - msbuildArguments: '-t:restore -p:SelectedTargets=All -p:OrtPackageId=$(OrtPackageId)' - workingDirectory: '$(Build.SourcesDirectory)\csharp' - - task: MSBuild@1 displayName: 'Build Nuget Packages' 
inputs: diff --git a/tools/ci_build/github/azure-pipelines/templates/download-deps.yml b/tools/ci_build/github/azure-pipelines/templates/download-deps.yml index 0f6310724e9a1..dc41a2d398893 100644 --- a/tools/ci_build/github/azure-pipelines/templates/download-deps.yml +++ b/tools/ci_build/github/azure-pipelines/templates/download-deps.yml @@ -11,7 +11,7 @@ steps: packageType: upack feed: '/7424c8e4-5c62-490e-95c4-79446f31017c' definition: '517c4f6f-5437-4392-a70d-4f15ec5be2f0' - version: 1.0.104 + version: 1.0.107 downloadPath: $(Build.BinariesDirectory)/deps # The private ADO project @@ -22,7 +22,7 @@ steps: packageType: upack feed: '/4c7631f5-24c0-4307-8822-1aa8f180c325' definition: 'fd9dd5ad-b73e-4678-890e-edcf680dbc1a' - version: 1.0.104 + version: 1.0.107 downloadPath: $(Build.BinariesDirectory)/deps # You can add more ADO accounts at here. diff --git a/tools/ci_build/github/azure-pipelines/templates/jobs/win-ci-vs-2022-job.yml b/tools/ci_build/github/azure-pipelines/templates/jobs/win-ci-vs-2022-job.yml index 3b1fde6cb6e4f..404699f705344 100644 --- a/tools/ci_build/github/azure-pipelines/templates/jobs/win-ci-vs-2022-job.yml +++ b/tools/ci_build/github/azure-pipelines/templates/jobs/win-ci-vs-2022-job.yml @@ -169,7 +169,7 @@ jobs: - task: MSBuild@1 displayName: 'Restore NuGet Packages' inputs: - solution: '$(Build.SourcesDirectory)\csharp\OnnxRuntime.CSharp.sln' + solution: '$(Build.SourcesDirectory)\csharp\OnnxRuntime.DesktopOnly.CSharp.sln' platform: 'Any CPU' configuration: '${{ parameters.BuildConfig }}' msbuildArguments: '-t:restore -p:OrtPackageId=$(OrtPackageId)' @@ -178,7 +178,7 @@ jobs: - task: MSBuild@1 displayName: 'Build C#' inputs: - solution: '$(Build.SourcesDirectory)\csharp\OnnxRuntime.CSharp.sln' + solution: '$(Build.SourcesDirectory)\csharp\OnnxRuntime.DesktopOnly.CSharp.sln' configuration: '${{ parameters.BuildConfig }}' platform: 'Any CPU' msbuildArguments: '-p:OnnxRuntimeBuildDirectory="$(Build.BinariesDirectory)" -p:OrtPackageId=$(OrtPackageId)' @@ -197,7 +197,7 @@ jobs: command: test projects: '$(Build.SourcesDirectory)\csharp\test\Microsoft.ML.OnnxRuntime.Tests.NetCoreApp\Microsoft.ML.OnnxRuntime.Tests.NetCoreApp.csproj' configuration: '${{ parameters.BuildConfig }}' - arguments: '--configuration ${{ parameters.BuildConfig }} -p:Platform="Any CPU" -p:OnnxRuntimeBuildDirectory="$(Build.BinariesDirectory)" -p:OrtPackageId=$(OrtPackageId) --blame' + arguments: '--configuration ${{ parameters.BuildConfig }} -p:Platform="Any CPU" -p:OnnxRuntimeBuildDirectory="$(Build.BinariesDirectory)" -p:OrtPackageId=$(OrtPackageId) -p:IncludeMobileTargets=false --blame' workingDirectory: '$(Build.SourcesDirectory)\csharp' - ${{ if eq(parameters.EnablePython, true) }}: diff --git a/tools/ci_build/github/azure-pipelines/templates/ondevice-training-cpu-packaging-pipeline.yml b/tools/ci_build/github/azure-pipelines/templates/ondevice-training-cpu-packaging-pipeline.yml index 792e828c9a880..24e46066a1f10 100644 --- a/tools/ci_build/github/azure-pipelines/templates/ondevice-training-cpu-packaging-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/templates/ondevice-training-cpu-packaging-pipeline.yml @@ -222,44 +222,29 @@ stages: versionSpec: 6.2.1 - task: PowerShell@2 - displayName: Install .NET 6 workloads + displayName: Install mobile workloads inputs: targetType: 'inline' script: | dotnet workload install android workingDirectory: '$(Build.SourcesDirectory)\csharp' - - task: PowerShell@2 - displayName: Build Microsoft.ML.OnnxRuntime .NET 6 targets using dotnet - inputs: - 
targetType: 'inline' - # we don't specify 'Any CPU' as the platform here because if we do it gets added to the output path - # e.g. csharp\src\Microsoft.ML.OnnxRuntime\bin\Any CPU\RelWithDebInfo\net6.0-ios\ - # which is inconsistent with the msbuild output path for the pre-.net6 targets - # e.g. csharp\src\Microsoft.ML.OnnxRuntime\bin\RelWithDebInfo\monoandroid11.0 - # and makes it harder to do the packing - # - # 'Any CPU' is the default (first 'mixed' platform specified in the csproj) so this should be fine. - script: | - dotnet build .\src\Microsoft.ML.OnnxRuntime\Microsoft.ML.OnnxRuntime.csproj -p:SelectedTargets=Net6 -p:Configuration=RelWithDebInfo -p:OnnxRuntimeBuildDirectory="$(Build.BinariesDirectory)" -p:OrtPackageId=$(OrtPackageId) -p:IsReleaseBuild=${{ parameters.IsReleaseBuild }} -p:ReleaseVersionSuffix=$(ReleaseVersionSuffix) - workingDirectory: '$(Build.SourcesDirectory)\csharp' - - task: MSBuild@1 - displayName: 'Restore NuGet Packages and create project.assets.json for pre-.net6 targets' + displayName: 'Restore NuGet Packages and create project.assets.json' inputs: solution: '$(Build.SourcesDirectory)\csharp\OnnxRuntime.CSharp.sln' platform: 'Any CPU' configuration: RelWithDebInfo - msbuildArguments: '-t:restore -p:SelectedTargets=PreNet6 -p:OrtPackageId=$(OrtPackageId)' + msbuildArguments: '-t:restore -p:OrtPackageId=$(OrtPackageId)' workingDirectory: '$(Build.SourcesDirectory)\csharp' - task: MSBuild@1 - displayName: 'Build C# for pre-.net6 targets' + displayName: 'Build C# bindings' inputs: solution: '$(Build.SourcesDirectory)\csharp\OnnxRuntime.CSharp.sln' platform: 'Any CPU' configuration: RelWithDebInfo - msbuildArguments: '-p:SelectedTargets=PreNet6 -p:OnnxRuntimeBuildDirectory="$(Build.BinariesDirectory)" -p:OrtPackageId=$(OrtPackageId) -p:IsReleaseBuild=${{ parameters.IsReleaseBuild }}' + msbuildArguments: '-p:OnnxRuntimeBuildDirectory="$(Build.BinariesDirectory)" -p:OrtPackageId=$(OrtPackageId) -p:IsReleaseBuild=${{ parameters.IsReleaseBuild }}' workingDirectory: '$(Build.SourcesDirectory)\csharp' - ${{ if eq(parameters.DoEsrp, true) }}: @@ -269,15 +254,6 @@ stages: DisplayName: 'ESRP - Sign C# dlls' DoEsrp: ${{ parameters.DoEsrp }} - - task: MSBuild@1 - displayName: Update projects.assets.json with combined list of all target frameworks - inputs: - solution: '$(Build.SourcesDirectory)\csharp\src\Microsoft.ML.OnnxRuntime\Microsoft.ML.OnnxRuntime.csproj' - platform: 'Any CPU' - configuration: RelWithDebInfo - msbuildArguments: '-t:restore -p:SelectedTargets=All -p:OrtPackageId=$(OrtPackageId)' - workingDirectory: '$(Build.SourcesDirectory)\csharp' - - task: MSBuild@1 displayName: 'Build Nuget Packages' inputs: diff --git a/tools/ci_build/github/azure-pipelines/templates/win-web-ci.yml b/tools/ci_build/github/azure-pipelines/templates/win-web-ci.yml index 187c7656602f5..8c926619c797d 100644 --- a/tools/ci_build/github/azure-pipelines/templates/win-web-ci.yml +++ b/tools/ci_build/github/azure-pipelines/templates/win-web-ci.yml @@ -161,11 +161,13 @@ jobs: workingDirectory: '$(Build.SourcesDirectory)\js\web' displayName: 'Run ort-web tests (wasm,webgl,xnnpack backend)' condition: eq('${{ parameters.RunWebGpuTests }}', 'false') + retryCountOnTaskFailure: 3 - script: | npm test -- -e=edge -b=webgl,wasm,xnnpack,webgpu $(webgpuCommandlineExtraFlags) workingDirectory: '$(Build.SourcesDirectory)\js\web' displayName: 'Run ort-web tests (ALL backends)' condition: eq('${{ parameters.RunWebGpuTests }}', 'true') + retryCountOnTaskFailure: 3 - script: | npm test -- suite1 -e=edge 
-b=webgpu --io-binding=gpu-tensor $(webgpuCommandlineExtraFlags) workingDirectory: '$(Build.SourcesDirectory)\js\web' @@ -180,10 +182,12 @@ jobs: workingDirectory: '$(Build.SourcesDirectory)\js\web' displayName: 'Run ort-web tests (Suite1, webgpu, IO-binding=gpu-location)' condition: eq('${{ parameters.RunWebGpuTests }}', 'true') + retryCountOnTaskFailure: 3 - script: | npm test -- --webgl-texture-pack-mode -b=webgl -e=edge workingDirectory: '$(Build.SourcesDirectory)\js\web' displayName: 'Run ort-web tests - WebGL: packed mode' + retryCountOnTaskFailure: 3 - script: | npm test -- --wasm-enable-proxy -b=wasm -e=edge workingDirectory: '$(Build.SourcesDirectory)\js\web' diff --git a/tools/nuget/generate_nuspec_for_native_nuget.py b/tools/nuget/generate_nuspec_for_native_nuget.py index f7b68551b9c50..df74e7e5599a8 100644 --- a/tools/nuget/generate_nuspec_for_native_nuget.py +++ b/tools/nuget/generate_nuspec_for_native_nuget.py @@ -557,7 +557,7 @@ def generate_files(line_list, args): files_list.append( "' @@ -793,8 +793,10 @@ def generate_files(line_list, args): "" ) - # Some tools to be packaged in nightly build only, should not be released + # Some tools to be packaged in nightly debug build only, should not be released # These are copied to the runtimes folder for convenience of loading with the dlls + # NOTE: nuget gives a spurious error on linux if these aren't in a separate directory to the library so + # we add them to a tools folder for that reason. if ( args.is_release_build.lower() != "true" and args.target_architecture == "x64" @@ -804,7 +806,10 @@ def generate_files(line_list, args): "" ) @@ -817,7 +822,10 @@ def generate_files(line_list, args): "" ) @@ -871,7 +879,6 @@ def generate_files(line_list, args): os.system(copy_command + " " + source_props + " " + target_props) files_list.append("') if not is_snpe_package and not is_qnn_package: - files_list.append("') files_list.append("') # Process targets file @@ -890,7 +897,6 @@ def generate_files(line_list, args): os.system(copy_command + " " + source_targets + " " + target_targets) files_list.append("') if not is_snpe_package and not is_qnn_package: - files_list.append("') files_list.append("') # Process xamarin targets files
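Editor's note: the new generator script added at the top of this change produces a minimal Reshape model (inputs "data" of shape ["N", 2] and "shape" of shape [2], output "reshaped" of shape [4, 1]) for the TensorRT EP tests. The sketch below is not part of the patch; it is only a hedged, local sanity check of the generated model, assuming onnxruntime and numpy are installed and falling back to the CPU EP when the TensorRT EP is not built in.

    # sanity_check_trt_reshape.py (illustration only)
    import numpy as np
    import onnxruntime as ort

    # Prefer the TensorRT EP if this build has it; otherwise use the CPU EP.
    requested = ["TensorrtExecutionProvider", "CPUExecutionProvider"]
    providers = [p for p in requested if p in ort.get_available_providers()]

    sess = ort.InferenceSession("trt_reshape.onnx", providers=providers)

    data = np.arange(4, dtype=np.float32).reshape(2, 2)  # "data": ["N", 2] with N=2
    shape = np.array([4, 1], dtype=np.int64)             # "shape": [2]
    (reshaped,) = sess.run(None, {"data": data, "shape": shape})
    print(reshaped.shape)  # expected: (4, 1)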
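Editor's note: the reworked build_nuget_package in tools/ci_build/build.py now builds one command list, forwards msbuild_extra_options as "/p:" properties, and prefixes the command with "dotnet" unless the Xamarin-inclusive OnnxRuntime.CSharp.sln is used (that solution must be driven by msbuild directly). The following is a standalone sketch of that dispatch logic, not the actual build.py code; the helper name is hypothetical.

    # make_nuget_build_command.py (illustration only)
    from typing import List

    def make_nuget_build_command(sln: str, config: str, msbuild_extra_options: List[str]) -> List[str]:
        # extra options arrive as "Name=Value" pairs and are forwarded as MSBuild properties
        extra_options = ["/p:" + option for option in msbuild_extra_options]

        # the Xamarin-inclusive solution has to go through msbuild directly;
        # everything else can use `dotnet msbuild`
        use_dotnet = sln != "OnnxRuntime.CSharp.sln"

        cmd = ["dotnet"] if use_dotnet else []
        cmd += ["msbuild", sln, "/p:Configuration=" + config, *extra_options]
        return cmd

    # Example:
    # make_nuget_build_command("OnnxRuntime.DesktopOnly.CSharp.sln", "RelWithDebInfo",
    #                          ["IncludeMobileTargets=false"])
    # -> ["dotnet", "msbuild", "OnnxRuntime.DesktopOnly.CSharp.sln",
    #     "/p:Configuration=RelWithDebInfo", "/p:IncludeMobileTargets=false"]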