From a7a49189e8840c77d2333a07cfdf5e04f9faa733 Mon Sep 17 00:00:00 2001
From: Edward Chen <18449977+edgchen1@users.noreply.github.com>
Date: Mon, 3 Jun 2024 11:25:56 -0500
Subject: [PATCH 01/26] Suppress Eigen warning in onnxruntime/test/onnx/microbenchmark/eigen.cc. (#20892)

Fix ARM64 GCC build with `--build_micro_benchmarks`.
---
 onnxruntime/test/onnx/microbenchmark/eigen.cc | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/onnxruntime/test/onnx/microbenchmark/eigen.cc b/onnxruntime/test/onnx/microbenchmark/eigen.cc
index 29894316edd01..230a57740d448 100644
--- a/onnxruntime/test/onnx/microbenchmark/eigen.cc
+++ b/onnxruntime/test/onnx/microbenchmark/eigen.cc
@@ -1,3 +1,5 @@
+#include "onnxruntime_config.h"
+
 #if defined(__GNUC__) && !defined(__clang__)
 #pragma GCC diagnostic push
 #if __GNUC__ >= 6
@@ -6,6 +8,15 @@
 #pragma GCC diagnostic ignored "-Wunused-parameter"
 #pragma GCC diagnostic ignored "-Wunused-result"
 #pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
+
+// _deps/eigen-src/unsupported/Eigen/CXX11/../../../Eigen/src/Core/arch/NEON/PacketMath.h:1671:9:
+// error: ‘void* memcpy(void*, const void*, size_t)’ copying an object of non-trivial type ‘Eigen::internal::Packet4c’
+// {aka ‘struct Eigen::internal::eigen_packet_wrapper’} from an array of ‘const int8_t’
+// {aka ‘const signed char’} [-Werror=class-memaccess]
+#ifdef HAS_CLASS_MEMACCESS
+#pragma GCC diagnostic ignored "-Wclass-memaccess"
+#endif
+
 #elif defined(_MSC_VER)
 // build\windows\debug\external\eigen3\unsupported\eigen\cxx11\src/Tensor/Tensor.h(76):
 // warning C4554: '&': check operator precedence for possible error; use parentheses to clarify precedence

From d13cabf7f9350c296373bf2f1d59d2ebd84f71de Mon Sep 17 00:00:00 2001
From: Changming Sun
Date: Mon, 3 Jun 2024 10:14:08 -0700
Subject: [PATCH 02/26] Upgrade GCC and remove the dependency on GCC8's experimental std::filesystem implementation (#20893)

### Description
This PR upgrades the CUDA 11 build pipelines' GCC version from 8 to 11.

### Motivation and Context
GCC 8 has an experimental std::filesystem implementation that is not ABI compatible with the formal one in later GCC releases. It hasn't caused trouble for us, but the ONNX community has run into this issue often. For example, https://github.com/onnx/onnx/issues/6047 . So this PR increases the minimum supported GCC version from 8 to 9 and removes the references to GCC's "stdc++fs" library. Please note we compile our code on RHEL8, and RHEL8's libstdc++ doesn't include the fs library, which means the binaries in ONNX Runtime's official packages always statically link to the fs library. It is just a matter of which version of the library is used, an experimental one or a more mature one, and it is an implementation detail that is not visible from outside. In any case, a newer GCC is better; it will give us the chance to use many C++20 features.

#### Why were we using GCC 8?
Because all our Linux packages were built on RHEL8 or its equivalents. The default GCC version in RHEL8 is 8. RHEL also provides additional GCC versions through RH devtoolsets. UBI8 is the abbreviation of Red Hat Universal Base Image 8, which is the containerized RHEL8. UBI8 is free, which means it doesn't require a subscription (while RHEL does). The only devtoolset that UBI8 provides is GCC 12, which is too new to use with CUDA 11.8, and our CUDA 11.8 build environment is a docker image from Nvidia that is based on UBI8.

#### How the problem is solved
Almalinux is an alternative to RHEL. Almalinux 8 provides GCC 11.
And the CUDA 11.8 docker image from Nvidia is open source, which means we can rebuild the image based on Almalinux 8 to get GCC 11. I've done this, but I cannot republish the new image due to various complicated license restrictions. Therefore I put them at an internal location in onnxruntimebuildcache.azurecr.io. --- cmake/CMakeLists.txt | 20 +----- cmake/onnxruntime_providers_migraphx.cmake | 2 +- cmake/onnxruntime_providers_tensorrt.cmake | 2 +- .../tensorrt_execution_provider_utils.h | 3 +- .../azure-pipelines/bigmodels-ci-pipeline.yml | 2 +- .../azure-pipelines/linux-gpu-ci-pipeline.yml | 4 +- .../linux-gpu-tensorrt-ci-pipeline.yml | 4 +- .../py-package-test-pipeline.yml | 2 +- .../stages/java-cuda-packaging-stage.yml | 4 +- .../nuget-linux-cuda-packaging-stage.yml | 29 ++------ .../stages/py-cuda-packaging-stage.yml | 4 +- .../templates/py-packaging-stage.yml | 2 +- .../github/linux/docker/Dockerfile.aten_cpu | 2 +- .../linux/docker/Dockerfile.manylinux2_28_cpu | 2 +- ...Dockerfile.manylinux2_28_training_cuda11_8 | 2 +- ...Dockerfile.manylinux2_28_training_cuda12_2 | 2 +- .../inference/aarch64/python/cpu/Dockerfile | 2 +- .../x86_64/default/{gpu => cuda11}/Dockerfile | 12 ++-- .../{gpu => cuda11}/scripts/install_deps.sh | 4 +- .../x86_64/default/cuda12/Dockerfile | 48 +++++++++++++ .../default/cuda12/scripts/install_deps.sh | 68 +++++++++++++++++++ .../default/gpu/scripts/install_centos.sh | 9 --- .../inference/x86_64/python/cpu/Dockerfile | 2 +- 23 files changed, 152 insertions(+), 79 deletions(-) rename tools/ci_build/github/linux/docker/inference/x86_64/default/{gpu => cuda11}/Dockerfile (77%) rename tools/ci_build/github/linux/docker/inference/x86_64/default/{gpu => cuda11}/scripts/install_deps.sh (89%) create mode 100644 tools/ci_build/github/linux/docker/inference/x86_64/default/cuda12/Dockerfile create mode 100755 tools/ci_build/github/linux/docker/inference/x86_64/default/cuda12/scripts/install_deps.sh delete mode 100755 tools/ci_build/github/linux/docker/inference/x86_64/default/gpu/scripts/install_centos.sh diff --git a/cmake/CMakeLists.txt b/cmake/CMakeLists.txt index 969ed0b294a81..5200b447d553f 100644 --- a/cmake/CMakeLists.txt +++ b/cmake/CMakeLists.txt @@ -59,8 +59,8 @@ if (NOT CMAKE_BUILD_TYPE) set(CMAKE_BUILD_TYPE "RelWithDebInfo" CACHE STRING "Choose build type: Debug Release RelWithDebInfo MinSizeRel." 
FORCE) endif() -if("${CMAKE_C_COMPILER_ID}" STREQUAL "GNU" AND CMAKE_C_COMPILER_VERSION VERSION_LESS 8) - message(FATAL_ERROR "GCC version must be greater than or equal to 8") +if("${CMAKE_C_COMPILER_ID}" STREQUAL "GNU" AND CMAKE_C_COMPILER_VERSION VERSION_LESS 9) + message(FATAL_ERROR "GCC version must be greater than or equal to 9") endif() # Options @@ -1300,12 +1300,6 @@ if (onnxruntime_USE_TVM) list(APPEND onnxruntime_EXTERNAL_DEPENDENCIES tvm) endif() -# needs to link with stdc++fs in Linux -if (UNIX AND "${CMAKE_C_COMPILER_ID}" STREQUAL "GNU" AND CMAKE_C_COMPILER_VERSION VERSION_LESS 9) - set(FS_STDLIB stdc++fs) -endif() -list(APPEND onnxruntime_EXTERNAL_LIBRARIES ${FS_STDLIB}) - # onnxruntime-extensions if (onnxruntime_USE_EXTENSIONS) include(extensions) @@ -1474,16 +1468,6 @@ if (onnxruntime_USE_CUDA) endif() endif() -if (onnxruntime_USE_TENSORRT) - # needs to link with stdc++fs in Linux - if (UNIX) - if (NOT APPLE) - set(FS_STDLIB stdc++fs) - endif() - endif() - list(APPEND onnxruntime_EXTERNAL_LIBRARIES ${FS_STDLIB}) -endif() - if (onnxruntime_USE_MIGRAPHX) if (WIN32) message(FATAL_ERROR "MIGraphX does not support build in Windows!") diff --git a/cmake/onnxruntime_providers_migraphx.cmake b/cmake/onnxruntime_providers_migraphx.cmake index 91ac66a40721d..01c4f8b2c8719 100644 --- a/cmake/onnxruntime_providers_migraphx.cmake +++ b/cmake/onnxruntime_providers_migraphx.cmake @@ -49,7 +49,7 @@ target_compile_options(onnxruntime_providers_migraphx PRIVATE -Wno-error=sign-compare) set_property(TARGET onnxruntime_providers_migraphx APPEND_STRING PROPERTY COMPILE_FLAGS "-Wno-deprecated-declarations") set_property(TARGET onnxruntime_providers_migraphx APPEND_STRING PROPERTY LINK_FLAGS "-Xlinker --version-script=${ONNXRUNTIME_ROOT}/core/providers/migraphx/version_script.lds -Xlinker --gc-sections") - target_link_libraries(onnxruntime_providers_migraphx PRIVATE nsync::nsync_cpp stdc++fs) + target_link_libraries(onnxruntime_providers_migraphx PRIVATE nsync::nsync_cpp) include(CheckLibraryExists) check_library_exists(migraphx::c "migraphx_program_run_async" "/opt/rocm/migraphx/lib" HAS_STREAM_SYNC) diff --git a/cmake/onnxruntime_providers_tensorrt.cmake b/cmake/onnxruntime_providers_tensorrt.cmake index 1e8f388548faf..e56de0c7124dc 100644 --- a/cmake/onnxruntime_providers_tensorrt.cmake +++ b/cmake/onnxruntime_providers_tensorrt.cmake @@ -206,7 +206,7 @@ elseif(UNIX) set_property(TARGET onnxruntime_providers_tensorrt APPEND_STRING PROPERTY COMPILE_FLAGS "-Wno-deprecated-declarations") set_property(TARGET onnxruntime_providers_tensorrt APPEND_STRING PROPERTY LINK_FLAGS "-Xlinker --version-script=${ONNXRUNTIME_ROOT}/core/providers/tensorrt/version_script.lds -Xlinker --gc-sections") - target_link_libraries(onnxruntime_providers_tensorrt PRIVATE nsync::nsync_cpp stdc++fs) + target_link_libraries(onnxruntime_providers_tensorrt PRIVATE nsync::nsync_cpp) elseif(WIN32) set_property(TARGET onnxruntime_providers_tensorrt APPEND_STRING PROPERTY LINK_FLAGS "-DEF:${ONNXRUNTIME_ROOT}/core/providers/tensorrt/symbols.def") else() diff --git a/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider_utils.h b/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider_utils.h index a54b728c17c44..df12d90338782 100644 --- a/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider_utils.h +++ b/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider_utils.h @@ -8,7 +8,6 @@ #include #include #include -#include #include "flatbuffers/idl.h" #include "ort_trt_int8_cal_table.fbs.h" #include 
@@ -16,7 +15,7 @@ #include "core/common/path_string.h" #include "core/framework/murmurhash3.h" -namespace fs = std::experimental::filesystem; +namespace fs = std::filesystem; namespace onnxruntime { diff --git a/tools/ci_build/github/azure-pipelines/bigmodels-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/bigmodels-ci-pipeline.yml index 67b56095962ab..2132fb6039872 100644 --- a/tools/ci_build/github/azure-pipelines/bigmodels-ci-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/bigmodels-ci-pipeline.yml @@ -49,7 +49,7 @@ resources: variables: - template: templates/common-variables.yml - name: docker_base_image - value: onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda11_x64_almalinux8_gcc11:20240530.3 + value: onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda11_x64_almalinux8_gcc11:20240531.1 - name: linux_trt_version value: 10.0.1.6-1.cuda11.8 - name: Repository diff --git a/tools/ci_build/github/azure-pipelines/linux-gpu-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/linux-gpu-ci-pipeline.yml index 6c512666803ba..48a0b7d6c23b7 100644 --- a/tools/ci_build/github/azure-pipelines/linux-gpu-ci-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/linux-gpu-ci-pipeline.yml @@ -48,9 +48,9 @@ parameters: variables: - name: docker_base_image ${{ if eq(parameters.CudaVersion, '11.8') }}: - value: onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda11_x64_almalinux8_gcc11:20240530.3 + value: onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda11_x64_almalinux8_gcc11:20240531.1 ${{ if eq(parameters.CudaVersion, '12.2') }}: - value: onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda12_2_x64_ubi8_gcc12:20240530.3 + value: onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda12_2_x64_ubi8_gcc12:20240531.1 - name: Repository ${{ if eq(parameters.CudaVersion, '11.8') }}: diff --git a/tools/ci_build/github/azure-pipelines/linux-gpu-tensorrt-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/linux-gpu-tensorrt-ci-pipeline.yml index c7f6c41c8dcc0..133af76357543 100644 --- a/tools/ci_build/github/azure-pipelines/linux-gpu-tensorrt-ci-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/linux-gpu-tensorrt-ci-pipeline.yml @@ -38,9 +38,9 @@ parameters: variables: - name: docker_base_image ${{ if eq(parameters.CudaVersion, '11.8') }}: - value: onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda11_x64_almalinux8_gcc11:20240530.3 + value: onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda11_x64_almalinux8_gcc11:20240531.1 ${{ if eq(parameters.CudaVersion, '12.2') }}: - value: onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda12_2_x64_ubi8_gcc12:20240530.3 + value: onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda12_2_x64_ubi8_gcc12:20240531.1 - name: linux_trt_version ${{ if eq(parameters.CudaVersion, '11.8') }}: value: 10.0.1.6-1.cuda11.8 diff --git a/tools/ci_build/github/azure-pipelines/py-package-test-pipeline.yml b/tools/ci_build/github/azure-pipelines/py-package-test-pipeline.yml index 3459ba6e48b23..63e70fa8e6488 100644 --- a/tools/ci_build/github/azure-pipelines/py-package-test-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/py-package-test-pipeline.yml @@ -54,7 +54,7 @@ stages: machine_pool: 'Onnxruntime-Linux-GPU' python_wheel_suffix: '_gpu' timeout: 480 - docker_base_image: 
onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda11_x64_almalinux8_gcc11:20240530.3 + docker_base_image: onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda11_x64_almalinux8_gcc11:20240531.1 trt_version: '10.0.1.6-1.cuda11.8' cuda_version: '11.8' diff --git a/tools/ci_build/github/azure-pipelines/stages/java-cuda-packaging-stage.yml b/tools/ci_build/github/azure-pipelines/stages/java-cuda-packaging-stage.yml index e4483b736c3e5..5f355478f2da0 100644 --- a/tools/ci_build/github/azure-pipelines/stages/java-cuda-packaging-stage.yml +++ b/tools/ci_build/github/azure-pipelines/stages/java-cuda-packaging-stage.yml @@ -141,9 +141,9 @@ stages: value: false - name: docker_base_image ${{ if eq(parameters.CudaVersion, '11.8') }}: - value: onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda11_x64_almalinux8_gcc11:20240530.3 + value: onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda11_x64_almalinux8_gcc11:20240531.1 ${{ if eq(parameters.CudaVersion, '12.2') }}: - value: onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda12_2_x64_ubi8_gcc12:20240530.3 + value: onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda12_2_x64_ubi8_gcc12:20240531.1 timeoutInMinutes: 60 steps: diff --git a/tools/ci_build/github/azure-pipelines/stages/nuget-linux-cuda-packaging-stage.yml b/tools/ci_build/github/azure-pipelines/stages/nuget-linux-cuda-packaging-stage.yml index 7007c7636da6a..cca53e36ebab9 100644 --- a/tools/ci_build/github/azure-pipelines/stages/nuget-linux-cuda-packaging-stage.yml +++ b/tools/ci_build/github/azure-pipelines/stages/nuget-linux-cuda-packaging-stage.yml @@ -26,20 +26,14 @@ stages: value: '12' - name: CUDA_VERSION value: ${{ parameters.CudaVersion }} - - name: docker_base_image - ${{ if eq(parameters.CudaVersion, '11.8') }}: - value: nvidia/cuda:11.8.0-cudnn8-devel-ubi8 - ${{ if eq(parameters.CudaVersion, '12.2') }}: - value: nvidia/cuda:12.2.2-cudnn8-devel-ubi8 steps: - template: ../templates/set-version-number-variables-step.yml - template: ../templates/get-docker-image-steps.yml parameters: - Dockerfile: tools/ci_build/github/linux/docker/inference/x86_64/default/gpu/Dockerfile - Context: tools/ci_build/github/linux/docker/inference/x86_64/default/gpu + Dockerfile: tools/ci_build/github/linux/docker/inference/x86_64/default/cuda${{ variables.CUDA_VERSION_MAJOR }}/Dockerfile + Context: tools/ci_build/github/linux/docker/inference/x86_64/default/cuda${{ variables.CUDA_VERSION_MAJOR }} DockerBuildArgs: " --build-arg BUILD_UID=$( id -u ) - --build-arg BASEIMAGE=${{ parameters.docker_base_image }} " Repository: onnxruntimecuda${{ variables.CUDA_VERSION_MAJOR }}build @@ -89,21 +83,15 @@ stages: value: 10.0.1.6-1.cuda11.8 ${{ if eq(parameters.CudaVersion, '12.2') }}: value: 10.0.1.6-1.cuda12.4 - - name: docker_base_image - ${{ if eq(parameters.CudaVersion, '11.8') }}: - value: nvidia/cuda:11.8.0-cudnn8-devel-ubi8 - ${{ if eq(parameters.CudaVersion, '12.2') }}: - value: nvidia/cuda:12.2.2-cudnn8-devel-ubi8 steps: - checkout: self clean: true submodules: recursive - template: ../templates/get-docker-image-steps.yml parameters: - Dockerfile: tools/ci_build/github/linux/docker/inference/x86_64/default/gpu/Dockerfile - Context: tools/ci_build/github/linux/docker/inference/x86_64/default/gpu + Dockerfile: tools/ci_build/github/linux/docker/inference/x86_64/default/cuda${{ variables.CUDA_VERSION_MAJOR }}/Dockerfile + Context: 
tools/ci_build/github/linux/docker/inference/x86_64/default/cuda${{ variables.CUDA_VERSION_MAJOR }} DockerBuildArgs: " - --build-arg BASEIMAGE=${{ variables.docker_base_image }} --build-arg TRT_VERSION=${{ variables.linux_trt_version }} --build-arg BUILD_UID=$( id -u ) " @@ -164,11 +152,6 @@ stages: value: 10.0.1.6-1.cuda11.8 ${{ if eq(parameters.CudaVersion, '12.2') }}: value: 10.0.1.6-1.cuda12.4 - - name: docker_base_image - ${{ if eq(parameters.CudaVersion, '11.8') }}: - value: nvidia/cuda:11.8.0-cudnn8-devel-ubi8 - ${{ if eq(parameters.CudaVersion, '12.2') }}: - value: nvidia/cuda:12.2.2-cudnn8-devel-ubi8 steps: - checkout: self # due to checkout multiple repos, the root directory is $(Build.SourcesDirectory)/onnxruntime submodules: false @@ -182,8 +165,8 @@ stages: - template: ../templates/get-docker-image-steps.yml parameters: ScriptName: $(Build.SourcesDirectory)/onnxruntime/tools/ci_build/get_docker_image.py - Dockerfile: $(Build.SourcesDirectory)/onnxruntime/tools/ci_build/github/linux/docker/inference/x86_64/default/gpu/Dockerfile - Context: $(Build.SourcesDirectory)/onnxruntime/tools/ci_build/github/linux/docker/inference/x86_64/default/gpu + Dockerfile: $(Build.SourcesDirectory)/onnxruntime/tools/ci_build/github/linux/docker/inference/x86_64/default/cuda${{ variables.CUDA_VERSION_MAJOR }}/Dockerfile + Context: $(Build.SourcesDirectory)/onnxruntime/tools/ci_build/github/linux/docker/inference/x86_64/default/cuda${{ variables.CUDA_VERSION_MAJOR }} DockerBuildArgs: "--build-arg BASEIMAGE=${{ variables.docker_base_image }} --build-arg TRT_VERSION=${{ variables.linux_trt_version }} --build-arg BUILD_UID=$( id -u )" Repository: onnxruntimecuda${{ variables.CUDA_VERSION_MAJOR }}xtrt86build UpdateDepsTxt: false diff --git a/tools/ci_build/github/azure-pipelines/stages/py-cuda-packaging-stage.yml b/tools/ci_build/github/azure-pipelines/stages/py-cuda-packaging-stage.yml index 45b124b60ab23..9c5282af47c5a 100644 --- a/tools/ci_build/github/azure-pipelines/stages/py-cuda-packaging-stage.yml +++ b/tools/ci_build/github/azure-pipelines/stages/py-cuda-packaging-stage.yml @@ -78,8 +78,8 @@ stages: cmake_build_type: ${{ parameters.cmake_build_type }} cuda_version: ${{ parameters.cuda_version }} ${{ if eq(parameters.cuda_version, '11.8') }}: - docker_base_image: onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda11_x64_almalinux8_gcc11:20240530.3 + docker_base_image: onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda11_x64_almalinux8_gcc11:20240531.1 trt_version: 10.0.1.6-1.cuda11.8 ${{ if eq(parameters.cuda_version, '12.2') }}: - docker_base_image: onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda12_2_x64_ubi8_gcc12:20240530.3 + docker_base_image: onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda12_2_x64_ubi8_gcc12:20240531.1 trt_version: 10.0.1.6-1.cuda12.4 diff --git a/tools/ci_build/github/azure-pipelines/templates/py-packaging-stage.yml b/tools/ci_build/github/azure-pipelines/templates/py-packaging-stage.yml index 85f05eed27ae1..8ec1cff19e423 100644 --- a/tools/ci_build/github/azure-pipelines/templates/py-packaging-stage.yml +++ b/tools/ci_build/github/azure-pipelines/templates/py-packaging-stage.yml @@ -472,7 +472,7 @@ stages: parameters: arch: 'x86_64' machine_pool: 'onnxruntime-Ubuntu2204-AMD-CPU' - docker_base_image: onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda11_x64_almalinux8_gcc11:20240530.3 + docker_base_image: 
onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda11_x64_almalinux8_gcc11:20240531.1 extra_build_arg: ${{ parameters.build_py_parameters }} cmake_build_type: ${{ parameters.cmake_build_type }} trt_version: '10.0.1.6-1.cuda11.8' diff --git a/tools/ci_build/github/linux/docker/Dockerfile.aten_cpu b/tools/ci_build/github/linux/docker/Dockerfile.aten_cpu index 89a0a3c70eaa1..ad1db6a0305ec 100644 --- a/tools/ci_build/github/linux/docker/Dockerfile.aten_cpu +++ b/tools/ci_build/github/linux/docker/Dockerfile.aten_cpu @@ -1,4 +1,4 @@ -FROM onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cpu_x64_ubi8_gcc12:20240530.3 +FROM onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cpu_x64_ubi8_gcc12:20240531.1 ADD scripts /tmp/scripts RUN cd /tmp/scripts && /tmp/scripts/manylinux/install_centos.sh && /tmp/scripts/manylinux/install_deps_aten.sh && rm -rf /tmp/scripts diff --git a/tools/ci_build/github/linux/docker/Dockerfile.manylinux2_28_cpu b/tools/ci_build/github/linux/docker/Dockerfile.manylinux2_28_cpu index cb42db3021f80..9bdc62ace4793 100644 --- a/tools/ci_build/github/linux/docker/Dockerfile.manylinux2_28_cpu +++ b/tools/ci_build/github/linux/docker/Dockerfile.manylinux2_28_cpu @@ -1,4 +1,4 @@ -FROM onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cpu_x64_ubi8_gcc12:20240530.3 +FROM onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cpu_x64_ubi8_gcc12:20240531.1 ENV JAVA_HOME=/usr/lib/jvm/msopenjdk-11 diff --git a/tools/ci_build/github/linux/docker/Dockerfile.manylinux2_28_training_cuda11_8 b/tools/ci_build/github/linux/docker/Dockerfile.manylinux2_28_training_cuda11_8 index 3eb6b506bebe3..ed920ea057393 100644 --- a/tools/ci_build/github/linux/docker/Dockerfile.manylinux2_28_training_cuda11_8 +++ b/tools/ci_build/github/linux/docker/Dockerfile.manylinux2_28_training_cuda11_8 @@ -1,4 +1,4 @@ -FROM onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda11_x64_almalinux8_gcc11:20240530.3 +FROM onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda11_x64_almalinux8_gcc11:20240531.1 ARG PYTHON_VERSION=3.9 ARG TORCH_VERSION=2.0.0 ARG OPSET_VERSION=17 diff --git a/tools/ci_build/github/linux/docker/Dockerfile.manylinux2_28_training_cuda12_2 b/tools/ci_build/github/linux/docker/Dockerfile.manylinux2_28_training_cuda12_2 index ee1fbb2d4f042..ba5cb245eb3e4 100644 --- a/tools/ci_build/github/linux/docker/Dockerfile.manylinux2_28_training_cuda12_2 +++ b/tools/ci_build/github/linux/docker/Dockerfile.manylinux2_28_training_cuda12_2 @@ -1,4 +1,4 @@ -FROM onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda12_2_x64_ubi8_gcc12:20240530.3 +FROM onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda12_2_x64_ubi8_gcc12:20240531.1 ARG PYTHON_VERSION=3.9 ARG TORCH_VERSION=2.1.0 diff --git a/tools/ci_build/github/linux/docker/inference/aarch64/python/cpu/Dockerfile b/tools/ci_build/github/linux/docker/inference/aarch64/python/cpu/Dockerfile index e3addf6e2e3a2..9a74788300ec9 100644 --- a/tools/ci_build/github/linux/docker/inference/aarch64/python/cpu/Dockerfile +++ b/tools/ci_build/github/linux/docker/inference/aarch64/python/cpu/Dockerfile @@ -1,4 +1,4 @@ -FROM onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cpu_aarch64_ubi8_gcc12:20240530.3 +FROM onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cpu_aarch64_ubi8_gcc12:20240531.1 ADD scripts /tmp/scripts RUN cd /tmp/scripts && /tmp/scripts/install_centos.sh && 
/tmp/scripts/install_deps.sh && rm -rf /tmp/scripts diff --git a/tools/ci_build/github/linux/docker/inference/x86_64/default/gpu/Dockerfile b/tools/ci_build/github/linux/docker/inference/x86_64/default/cuda11/Dockerfile similarity index 77% rename from tools/ci_build/github/linux/docker/inference/x86_64/default/gpu/Dockerfile rename to tools/ci_build/github/linux/docker/inference/x86_64/default/cuda11/Dockerfile index 0ca9dbe27af9d..051f9cc6a267f 100644 --- a/tools/ci_build/github/linux/docker/inference/x86_64/default/gpu/Dockerfile +++ b/tools/ci_build/github/linux/docker/inference/x86_64/default/cuda11/Dockerfile @@ -2,10 +2,10 @@ # Licensed under the MIT License. # This file is used by Zip-Nuget Packaging NoContribOps Pipeline,Zip-Nuget-Java Packaging Pipeline -ARG BASEIMAGE=nvidia/cuda:11.8.0-cudnn8-devel-ubi8 -FROM $BASEIMAGE -ARG TRT_VERSION +FROM onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda11_x64_almalinux8_gcc11_dotnet:20240531.1 +ARG TRT_VERSION +RUN rpm -Uvh https://packages.microsoft.com/config/centos/8/packages-microsoft-prod.rpm && dnf install -y msopenjdk-11 #Install TensorRT only if TRT_VERSION is not empty RUN if [ -n "$TRT_VERSION" ]; then \ echo "TRT_VERSION is $TRT_VERSION" && \ @@ -31,13 +31,13 @@ else \ echo "TRT_VERSION is none skipping Tensor RT Installation" ; \ fi -ENV PATH /usr/lib/jvm/msopenjdk-11/bin:/usr/local/nvidia/bin:/usr/local/cuda/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin +ENV PATH /usr/lib/jvm/msopenjdk-11/bin:$PATH ENV LANG=en_US.UTF-8 ENV LC_ALL=en_US.UTF-8 ENV JAVA_HOME=/usr/lib/jvm/msopenjdk-11 - +ENV CUDAHOSTCXX /opt/rh/gcc-toolset-11/root/usr/bin/g++ ADD scripts /tmp/scripts -RUN cd /tmp/scripts && /tmp/scripts/install_centos.sh && /tmp/scripts/install_deps.sh && rm -rf /tmp/scripts +RUN cd /tmp/scripts && /tmp/scripts/install_deps.sh && rm -rf /tmp/scripts ARG BUILD_UID=1001 ARG BUILD_USER=onnxruntimedev diff --git a/tools/ci_build/github/linux/docker/inference/x86_64/default/gpu/scripts/install_deps.sh b/tools/ci_build/github/linux/docker/inference/x86_64/default/cuda11/scripts/install_deps.sh similarity index 89% rename from tools/ci_build/github/linux/docker/inference/x86_64/default/gpu/scripts/install_deps.sh rename to tools/ci_build/github/linux/docker/inference/x86_64/default/cuda11/scripts/install_deps.sh index eb6d3315b97ef..3c88c516bee4e 100755 --- a/tools/ci_build/github/linux/docker/inference/x86_64/default/gpu/scripts/install_deps.sh +++ b/tools/ci_build/github/linux/docker/inference/x86_64/default/cuda11/scripts/install_deps.sh @@ -39,8 +39,8 @@ mkdir -p /tmp/src cd /tmp/src echo "Installing cmake" -GetFile https://github.com/Kitware/CMake/releases/download/v3.26.3/cmake-3.26.3-linux-`uname -m`.tar.gz /tmp/src/cmake-3.26.3-linux-`uname -m`.tar.gz -tar -zxf /tmp/src/cmake-3.26.3-linux-`uname -m`.tar.gz --strip=1 -C /usr +GetFile https://github.com/Kitware/CMake/releases/download/v3.29.3/cmake-3.29.3-linux-`uname -m`.tar.gz /tmp/src/cmake-3.29.3-linux-`uname -m`.tar.gz +tar -zxf /tmp/src/cmake-3.29.3-linux-`uname -m`.tar.gz --strip=1 -C /usr echo "Installing Ninja" GetFile https://github.com/ninja-build/ninja/archive/v1.10.0.tar.gz /tmp/src/ninja-linux.tar.gz diff --git a/tools/ci_build/github/linux/docker/inference/x86_64/default/cuda12/Dockerfile b/tools/ci_build/github/linux/docker/inference/x86_64/default/cuda12/Dockerfile new file mode 100644 index 0000000000000..a86b96b7adf42 --- /dev/null +++ b/tools/ci_build/github/linux/docker/inference/x86_64/default/cuda12/Dockerfile @@ -0,0 
+1,48 @@ +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. + +# This file is used by Zip-Nuget Packaging NoContribOps Pipeline,Zip-Nuget-Java Packaging Pipeline +FROM onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda12_2_x64_ubi8_gcc12_dotnet:20240531.1 +ARG TRT_VERSION + +#Install TensorRT only if TRT_VERSION is not empty +RUN if [ -n "$TRT_VERSION" ]; then \ + echo "TRT_VERSION is $TRT_VERSION" && \ + dnf -y install \ + libnvinfer10-${TRT_VERSION} \ + libnvinfer-headers-devel-${TRT_VERSION} \ + libnvinfer-devel-${TRT_VERSION} \ + libnvinfer-lean10-${TRT_VERSION} \ + libnvonnxparsers10-${TRT_VERSION} \ + libnvonnxparsers-devel-${TRT_VERSION} \ + libnvinfer-dispatch10-${TRT_VERSION} \ + libnvinfer-plugin10-${TRT_VERSION} \ + libnvinfer-vc-plugin10-${TRT_VERSION} \ + libnvinfer-bin-${TRT_VERSION} \ + libnvinfer-plugin10-${TRT_VERSION} \ + libnvinfer-plugin-devel-${TRT_VERSION} \ + libnvinfer-vc-plugin-devel-${TRT_VERSION} \ + libnvinfer-lean-devel-${TRT_VERSION} \ + libnvinfer-dispatch-devel-${TRT_VERSION} \ + libnvinfer-headers-plugin-devel-${TRT_VERSION} && \ + dnf clean dbcache ; \ +else \ + echo "TRT_VERSION is none skipping Tensor RT Installation" ; \ +fi + + + +ENV LANG=en_US.UTF-8 +ENV LC_ALL=en_US.UTF-8 + +ENV CUDAHOSTCXX /opt/rh/gcc-toolset-12/root/usr/bin/g++ +ADD scripts /tmp/scripts +RUN sed -i 's/enabled\s*=\s*1/enabled = 1\nexclude=dotnet* aspnet* netstandard*/g' /etc/yum.repos.d/ubi.repo && \ + rpm -Uvh https://packages.microsoft.com/config/centos/8/packages-microsoft-prod.rpm && dnf install -y msopenjdk-11 && cd /tmp/scripts && /tmp/scripts/install_deps.sh && rm -rf /tmp/scripts +ENV PATH /usr/lib/jvm/msopenjdk-11/bin:$PATH +ENV JAVA_HOME=/usr/lib/jvm/msopenjdk-11 +ARG BUILD_UID=1001 +ARG BUILD_USER=onnxruntimedev +RUN adduser --uid $BUILD_UID $BUILD_USER +WORKDIR /home/$BUILD_USER +USER $BUILD_USER diff --git a/tools/ci_build/github/linux/docker/inference/x86_64/default/cuda12/scripts/install_deps.sh b/tools/ci_build/github/linux/docker/inference/x86_64/default/cuda12/scripts/install_deps.sh new file mode 100755 index 0000000000000..3c88c516bee4e --- /dev/null +++ b/tools/ci_build/github/linux/docker/inference/x86_64/default/cuda12/scripts/install_deps.sh @@ -0,0 +1,68 @@ +#!/bin/bash +set -e -x + +# Download a file from internet +function GetFile { + local uri=$1 + local path=$2 + local force=${3:-false} + local download_retries=${4:-5} + local retry_wait_time_seconds=${5:-30} + + if [[ -f $path ]]; then + if [[ $force = false ]]; then + echo "File '$path' already exists. Skipping download" + return 0 + else + rm -rf $path + fi + fi + + if [[ -f $uri ]]; then + echo "'$uri' is a file path, copying file to '$path'" + cp $uri $path + return $? + fi + + echo "Downloading $uri" + # Use aria2c if available, otherwise use curl + if command -v aria2c > /dev/null; then + aria2c -q -d $(dirname $path) -o $(basename $path) "$uri" + else + curl "$uri" -sSL --retry $download_retries --retry-delay $retry_wait_time_seconds --create-dirs -o "$path" --fail + fi + + return $? 
+}
+mkdir -p /tmp/src
+
+cd /tmp/src
+
+echo "Installing cmake"
+GetFile https://github.com/Kitware/CMake/releases/download/v3.29.3/cmake-3.29.3-linux-`uname -m`.tar.gz /tmp/src/cmake-3.29.3-linux-`uname -m`.tar.gz
+tar -zxf /tmp/src/cmake-3.29.3-linux-`uname -m`.tar.gz --strip=1 -C /usr
+
+echo "Installing Ninja"
+GetFile https://github.com/ninja-build/ninja/archive/v1.10.0.tar.gz /tmp/src/ninja-linux.tar.gz
+tar -zxf ninja-linux.tar.gz
+pushd ninja-1.10.0
+cmake -Bbuild-cmake -H.
+cmake --build build-cmake
+mv ./build-cmake/ninja /usr/bin
+popd
+
+echo "Installing Node.js"
+CPU_ARCH=`uname -m`
+if [[ "$CPU_ARCH" = "x86_64" ]]; then
+  NODEJS_ARCH=x64
+elif [[ "$CPU_ARCH" = "aarch64" ]]; then
+  NODEJS_ARCH=arm64
+else
+  NODEJS_ARCH=$CPU_ARCH
+fi
+# The EOL for nodejs v18.17.1 LTS is April 2025
+GetFile https://nodejs.org/dist/v18.17.1/node-v18.17.1-linux-${NODEJS_ARCH}.tar.gz /tmp/src/node-v18.17.1-linux-${NODEJS_ARCH}.tar.gz
+tar --strip 1 -xf /tmp/src/node-v18.17.1-linux-${NODEJS_ARCH}.tar.gz -C /usr
+
+cd /
+rm -rf /tmp/src
diff --git a/tools/ci_build/github/linux/docker/inference/x86_64/default/gpu/scripts/install_centos.sh b/tools/ci_build/github/linux/docker/inference/x86_64/default/gpu/scripts/install_centos.sh
deleted file mode 100755
index 9647280da1aea..0000000000000
--- a/tools/ci_build/github/linux/docker/inference/x86_64/default/gpu/scripts/install_centos.sh
+++ /dev/null
@@ -1,9 +0,0 @@
-#!/bin/bash
-set -e -x
-if [ ! -f /etc/yum.repos.d/microsoft-prod.repo ]; then
-  os_major_version=$(tr -dc '0-9.' < /etc/redhat-release |cut -d \. -f1)
-  echo "installing for CentOS version : $os_major_version"
-  rpm -Uvh https://packages.microsoft.com/config/centos/$os_major_version/packages-microsoft-prod.rpm
-fi
-dnf install -y python39-devel glibc-langpack-\* glibc-locale-source which redhat-lsb-core expat-devel tar unzip zlib-devel make bzip2 bzip2-devel msopenjdk-11
-locale
diff --git a/tools/ci_build/github/linux/docker/inference/x86_64/python/cpu/Dockerfile b/tools/ci_build/github/linux/docker/inference/x86_64/python/cpu/Dockerfile
index 3cec4ed6e9dce..2f568a78a13dc 100644
--- a/tools/ci_build/github/linux/docker/inference/x86_64/python/cpu/Dockerfile
+++ b/tools/ci_build/github/linux/docker/inference/x86_64/python/cpu/Dockerfile
@@ -1,4 +1,4 @@
-FROM onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cpu_x64_ubi8_gcc12:20240530.3
+FROM onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cpu_x64_ubi8_gcc12:20240531.1
 
 ADD scripts /tmp/scripts
 RUN cd /tmp/scripts && /tmp/scripts/install_centos.sh && /tmp/scripts/install_deps.sh && rm -rf /tmp/scripts

From ab9f15374659626a4fe5e38e53f80632a10dc82d Mon Sep 17 00:00:00 2001
From: Yulong Wang <7679871+fs-eire@users.noreply.github.com>
Date: Mon, 3 Jun 2024 12:33:37 -0700
Subject: [PATCH 03/26] [js/web] allow build target for non dynamic import (#20898)

### Description
This PR allows building ORT web as `ort{.all|.webgpu}.bundle.min.mjs`, which does not contain any dynamic import. This makes it possible to use ORT web via static import in a service worker.
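As a rough illustration of the intended use case (not part of this patch; the model path, feed name, and file layout below are placeholders), a module service worker can now import the bundle statically instead of relying on dynamic `import()`:

```js
// sw.mjs - registered from the page with:
//   navigator.serviceWorker.register('sw.mjs', { type: 'module' })
// A static import works because ort.bundle.min.mjs contains no dynamic import() calls.
import * as ort from './ort.bundle.min.mjs';

self.addEventListener('message', (event) => {
  event.waitUntil((async () => {
    // Hypothetical model and input name, for illustration only.
    const session = await ort.InferenceSession.create('./model.onnx');
    const feeds = { input: new ort.Tensor('float32', new Float32Array(4), [1, 4]) };
    await session.run(feeds);
  })());
});
```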
Fixes #20876 --- js/package-lock.json | 197 +++++++++++++++++++++++++++ js/package.json | 1 + js/web/lib/build-def.d.ts | 4 + js/web/lib/wasm/wasm-utils-import.ts | 60 +++++--- js/web/script/build.ts | 140 ++++++++++++++++++- js/web/test/e2e/run-data.js | 6 + 6 files changed, 384 insertions(+), 24 deletions(-) diff --git a/js/package-lock.json b/js/package-lock.json index 1f8a6a09039d3..548706ee286b7 100644 --- a/js/package-lock.json +++ b/js/package-lock.json @@ -27,6 +27,7 @@ "mocha": "^10.2.0", "npmlog": "^7.0.1", "prettier": "^3.0.3", + "terser": "^5.31.0", "typescript": "^5.2.2" } }, @@ -600,6 +601,64 @@ "integrity": "sha512-6EwiSjwWYP7pTckG6I5eyFANjPhmPjUX9JRLUSfNPC7FX7zK9gyZAfUEaECL6ALTpGX5AjnBq3C9XmVWPitNpw==", "dev": true }, + "node_modules/@jridgewell/gen-mapping": { + "version": "0.3.5", + "resolved": "https://registry.npmjs.org/@jridgewell/gen-mapping/-/gen-mapping-0.3.5.tgz", + "integrity": "sha512-IzL8ZoEDIBRWEzlCcRhOaCupYyN5gdIK+Q6fbFdPDg6HqX6jpkItn7DFIpW9LQzXG6Df9sA7+OKnq0qlz/GaQg==", + "dev": true, + "dependencies": { + "@jridgewell/set-array": "^1.2.1", + "@jridgewell/sourcemap-codec": "^1.4.10", + "@jridgewell/trace-mapping": "^0.3.24" + }, + "engines": { + "node": ">=6.0.0" + } + }, + "node_modules/@jridgewell/resolve-uri": { + "version": "3.1.2", + "resolved": "https://registry.npmjs.org/@jridgewell/resolve-uri/-/resolve-uri-3.1.2.tgz", + "integrity": "sha512-bRISgCIjP20/tbWSPWMEi54QVPRZExkuD9lJL+UIxUKtwVJA8wW1Trb1jMs1RFXo1CBTNZ/5hpC9QvmKWdopKw==", + "dev": true, + "engines": { + "node": ">=6.0.0" + } + }, + "node_modules/@jridgewell/set-array": { + "version": "1.2.1", + "resolved": "https://registry.npmjs.org/@jridgewell/set-array/-/set-array-1.2.1.tgz", + "integrity": "sha512-R8gLRTZeyp03ymzP/6Lil/28tGeGEzhx1q2k703KGWRAI1VdvPIXdG70VJc2pAMw3NA6JKL5hhFu1sJX0Mnn/A==", + "dev": true, + "engines": { + "node": ">=6.0.0" + } + }, + "node_modules/@jridgewell/source-map": { + "version": "0.3.6", + "resolved": "https://registry.npmjs.org/@jridgewell/source-map/-/source-map-0.3.6.tgz", + "integrity": "sha512-1ZJTZebgqllO79ue2bm3rIGud/bOe0pP5BjSRCRxxYkEZS8STV7zN84UBbiYu7jy+eCKSnVIUgoWWE/tt+shMQ==", + "dev": true, + "dependencies": { + "@jridgewell/gen-mapping": "^0.3.5", + "@jridgewell/trace-mapping": "^0.3.25" + } + }, + "node_modules/@jridgewell/sourcemap-codec": { + "version": "1.4.15", + "resolved": "https://registry.npmjs.org/@jridgewell/sourcemap-codec/-/sourcemap-codec-1.4.15.tgz", + "integrity": "sha512-eF2rxCRulEKXHTRiDrDy6erMYWqNw4LPdQ8UQA4huuxaQsVeRPFl2oM8oDGxMFhJUWZf9McpLtJasDDZb/Bpeg==", + "dev": true + }, + "node_modules/@jridgewell/trace-mapping": { + "version": "0.3.25", + "resolved": "https://registry.npmjs.org/@jridgewell/trace-mapping/-/trace-mapping-0.3.25.tgz", + "integrity": "sha512-vNk6aEwybGtawWmy/PzwnGDOjCkLWSD2wqvjGGAgOAwCGWySYXfYoxt00IJkTF+8Lb57DwOb3Aa0o9CApepiYQ==", + "dev": true, + "dependencies": { + "@jridgewell/resolve-uri": "^3.1.0", + "@jridgewell/sourcemap-codec": "^1.4.14" + } + }, "node_modules/@jspm/core": { "version": "2.0.1", "resolved": "https://registry.npmjs.org/@jspm/core/-/core-2.0.1.tgz", @@ -1288,6 +1347,12 @@ "ieee754": "^1.2.1" } }, + "node_modules/buffer-from": { + "version": "1.1.2", + "resolved": "https://registry.npmjs.org/buffer-from/-/buffer-from-1.1.2.tgz", + "integrity": "sha512-E+XQCRwSbaaiChtv6k6Dwgc+bx+Bs6vuKJHHl5kox/BaKbhiXzqQOwK4cO22yElGp2OCmjwVhT3HmxgyPGnJfQ==", + "dev": true + }, "node_modules/builtin-modules": { "version": "3.3.0", "resolved": 
"https://registry.npmjs.org/builtin-modules/-/builtin-modules-3.3.0.tgz", @@ -1479,6 +1544,12 @@ "color-support": "bin.js" } }, + "node_modules/commander": { + "version": "2.20.3", + "resolved": "https://registry.npmjs.org/commander/-/commander-2.20.3.tgz", + "integrity": "sha512-GpVkmM8vF2vQUkj2LvZmD35JxeJOLCwJ9cUkugyk2nuhbv3+mJvpLYYt+0+USMxE+oj+ey/lJEnhZw75x/OMcQ==", + "dev": true + }, "node_modules/comment-parser": { "version": "1.4.0", "resolved": "https://registry.npmjs.org/comment-parser/-/comment-parser-1.4.0.tgz", @@ -4172,6 +4243,25 @@ "node": ">=8" } }, + "node_modules/source-map": { + "version": "0.6.1", + "resolved": "https://registry.npmjs.org/source-map/-/source-map-0.6.1.tgz", + "integrity": "sha512-UjgapumWlbMhkBgzT7Ykc5YXUT46F0iKu8SGXq0bcwP5dz/h0Plj6enJqjz1Zbq2l5WaqYnrVbwWOWMyF3F47g==", + "dev": true, + "engines": { + "node": ">=0.10.0" + } + }, + "node_modules/source-map-support": { + "version": "0.5.21", + "resolved": "https://registry.npmjs.org/source-map-support/-/source-map-support-0.5.21.tgz", + "integrity": "sha512-uBHU3L3czsIyYXKX88fdrGovxdSCoTGDRZ6SYXtSRxLZUzHg5P/66Ht6uoUlHu9EZod+inXhKo3qQgwXUT/y1w==", + "dev": true, + "dependencies": { + "buffer-from": "^1.0.0", + "source-map": "^0.6.0" + } + }, "node_modules/spdx-correct": { "version": "3.1.1", "resolved": "https://registry.npmjs.org/spdx-correct/-/spdx-correct-3.1.1.tgz", @@ -4341,6 +4431,24 @@ "url": "https://github.com/sponsors/ljharb" } }, + "node_modules/terser": { + "version": "5.31.0", + "resolved": "https://registry.npmjs.org/terser/-/terser-5.31.0.tgz", + "integrity": "sha512-Q1JFAoUKE5IMfI4Z/lkE/E6+SwgzO+x4tq4v1AyBLRj8VSYvRO6A/rQrPg1yud4g0En9EKI1TvFRF2tQFcoUkg==", + "dev": true, + "dependencies": { + "@jridgewell/source-map": "^0.3.3", + "acorn": "^8.8.2", + "commander": "^2.20.0", + "source-map-support": "~0.5.20" + }, + "bin": { + "terser": "bin/terser" + }, + "engines": { + "node": ">=10" + } + }, "node_modules/text-table": { "version": "0.2.0", "resolved": "https://registry.npmjs.org/text-table/-/text-table-0.2.0.tgz", @@ -5009,6 +5117,55 @@ "integrity": "sha512-6EwiSjwWYP7pTckG6I5eyFANjPhmPjUX9JRLUSfNPC7FX7zK9gyZAfUEaECL6ALTpGX5AjnBq3C9XmVWPitNpw==", "dev": true }, + "@jridgewell/gen-mapping": { + "version": "0.3.5", + "resolved": "https://registry.npmjs.org/@jridgewell/gen-mapping/-/gen-mapping-0.3.5.tgz", + "integrity": "sha512-IzL8ZoEDIBRWEzlCcRhOaCupYyN5gdIK+Q6fbFdPDg6HqX6jpkItn7DFIpW9LQzXG6Df9sA7+OKnq0qlz/GaQg==", + "dev": true, + "requires": { + "@jridgewell/set-array": "^1.2.1", + "@jridgewell/sourcemap-codec": "^1.4.10", + "@jridgewell/trace-mapping": "^0.3.24" + } + }, + "@jridgewell/resolve-uri": { + "version": "3.1.2", + "resolved": "https://registry.npmjs.org/@jridgewell/resolve-uri/-/resolve-uri-3.1.2.tgz", + "integrity": "sha512-bRISgCIjP20/tbWSPWMEi54QVPRZExkuD9lJL+UIxUKtwVJA8wW1Trb1jMs1RFXo1CBTNZ/5hpC9QvmKWdopKw==", + "dev": true + }, + "@jridgewell/set-array": { + "version": "1.2.1", + "resolved": "https://registry.npmjs.org/@jridgewell/set-array/-/set-array-1.2.1.tgz", + "integrity": "sha512-R8gLRTZeyp03ymzP/6Lil/28tGeGEzhx1q2k703KGWRAI1VdvPIXdG70VJc2pAMw3NA6JKL5hhFu1sJX0Mnn/A==", + "dev": true + }, + "@jridgewell/source-map": { + "version": "0.3.6", + "resolved": "https://registry.npmjs.org/@jridgewell/source-map/-/source-map-0.3.6.tgz", + "integrity": "sha512-1ZJTZebgqllO79ue2bm3rIGud/bOe0pP5BjSRCRxxYkEZS8STV7zN84UBbiYu7jy+eCKSnVIUgoWWE/tt+shMQ==", + "dev": true, + "requires": { + "@jridgewell/gen-mapping": "^0.3.5", + "@jridgewell/trace-mapping": "^0.3.25" + } + }, + 
"@jridgewell/sourcemap-codec": { + "version": "1.4.15", + "resolved": "https://registry.npmjs.org/@jridgewell/sourcemap-codec/-/sourcemap-codec-1.4.15.tgz", + "integrity": "sha512-eF2rxCRulEKXHTRiDrDy6erMYWqNw4LPdQ8UQA4huuxaQsVeRPFl2oM8oDGxMFhJUWZf9McpLtJasDDZb/Bpeg==", + "dev": true + }, + "@jridgewell/trace-mapping": { + "version": "0.3.25", + "resolved": "https://registry.npmjs.org/@jridgewell/trace-mapping/-/trace-mapping-0.3.25.tgz", + "integrity": "sha512-vNk6aEwybGtawWmy/PzwnGDOjCkLWSD2wqvjGGAgOAwCGWySYXfYoxt00IJkTF+8Lb57DwOb3Aa0o9CApepiYQ==", + "dev": true, + "requires": { + "@jridgewell/resolve-uri": "^3.1.0", + "@jridgewell/sourcemap-codec": "^1.4.14" + } + }, "@jspm/core": { "version": "2.0.1", "resolved": "https://registry.npmjs.org/@jspm/core/-/core-2.0.1.tgz", @@ -5482,6 +5639,12 @@ "ieee754": "^1.2.1" } }, + "buffer-from": { + "version": "1.1.2", + "resolved": "https://registry.npmjs.org/buffer-from/-/buffer-from-1.1.2.tgz", + "integrity": "sha512-E+XQCRwSbaaiChtv6k6Dwgc+bx+Bs6vuKJHHl5kox/BaKbhiXzqQOwK4cO22yElGp2OCmjwVhT3HmxgyPGnJfQ==", + "dev": true + }, "builtin-modules": { "version": "3.3.0", "resolved": "https://registry.npmjs.org/builtin-modules/-/builtin-modules-3.3.0.tgz", @@ -5613,6 +5776,12 @@ "integrity": "sha512-qiBjkpbMLO/HL68y+lh4q0/O1MZFj2RX6X/KmMa3+gJD3z+WwI1ZzDHysvqHGS3mP6mznPckpXmw1nI9cJjyRg==", "dev": true }, + "commander": { + "version": "2.20.3", + "resolved": "https://registry.npmjs.org/commander/-/commander-2.20.3.tgz", + "integrity": "sha512-GpVkmM8vF2vQUkj2LvZmD35JxeJOLCwJ9cUkugyk2nuhbv3+mJvpLYYt+0+USMxE+oj+ey/lJEnhZw75x/OMcQ==", + "dev": true + }, "comment-parser": { "version": "1.4.0", "resolved": "https://registry.npmjs.org/comment-parser/-/comment-parser-1.4.0.tgz", @@ -7603,6 +7772,22 @@ "integrity": "sha512-g9Q1haeby36OSStwb4ntCGGGaKsaVSjQ68fBxoQcutl5fS1vuY18H3wSt3jFyFtrkx+Kz0V1G85A4MyAdDMi2Q==", "dev": true }, + "source-map": { + "version": "0.6.1", + "resolved": "https://registry.npmjs.org/source-map/-/source-map-0.6.1.tgz", + "integrity": "sha512-UjgapumWlbMhkBgzT7Ykc5YXUT46F0iKu8SGXq0bcwP5dz/h0Plj6enJqjz1Zbq2l5WaqYnrVbwWOWMyF3F47g==", + "dev": true + }, + "source-map-support": { + "version": "0.5.21", + "resolved": "https://registry.npmjs.org/source-map-support/-/source-map-support-0.5.21.tgz", + "integrity": "sha512-uBHU3L3czsIyYXKX88fdrGovxdSCoTGDRZ6SYXtSRxLZUzHg5P/66Ht6uoUlHu9EZod+inXhKo3qQgwXUT/y1w==", + "dev": true, + "requires": { + "buffer-from": "^1.0.0", + "source-map": "^0.6.0" + } + }, "spdx-correct": { "version": "3.1.1", "resolved": "https://registry.npmjs.org/spdx-correct/-/spdx-correct-3.1.1.tgz", @@ -7733,6 +7918,18 @@ "integrity": "sha512-ot0WnXS9fgdkgIcePe6RHNk1WA8+muPa6cSjeR3V8K27q9BB1rTE3R1p7Hv0z1ZyAc8s6Vvv8DIyWf681MAt0w==", "dev": true }, + "terser": { + "version": "5.31.0", + "resolved": "https://registry.npmjs.org/terser/-/terser-5.31.0.tgz", + "integrity": "sha512-Q1JFAoUKE5IMfI4Z/lkE/E6+SwgzO+x4tq4v1AyBLRj8VSYvRO6A/rQrPg1yud4g0En9EKI1TvFRF2tQFcoUkg==", + "dev": true, + "requires": { + "@jridgewell/source-map": "^0.3.3", + "acorn": "^8.8.2", + "commander": "^2.20.0", + "source-map-support": "~0.5.20" + } + }, "text-table": { "version": "0.2.0", "resolved": "https://registry.npmjs.org/text-table/-/text-table-0.2.0.tgz", diff --git a/js/package.json b/js/package.json index 63b7df6ed9de3..308d6931a927c 100644 --- a/js/package.json +++ b/js/package.json @@ -21,6 +21,7 @@ "mocha": "^10.2.0", "npmlog": "^7.0.1", "prettier": "^3.0.3", + "terser": "^5.31.0", "typescript": "^5.2.2" }, "scripts": { diff --git 
a/js/web/lib/build-def.d.ts b/js/web/lib/build-def.d.ts index 4f30e71d690a3..188aaebc7d187 100644 --- a/js/web/lib/build-def.d.ts +++ b/js/web/lib/build-def.d.ts @@ -32,6 +32,10 @@ interface BuildDefinitions { * defines whether to disable training APIs in WebAssembly backend. */ readonly DISABLE_TRAINING: boolean; + /** + * defines whether to disable dynamic importing WASM module in the build. + */ + readonly DISABLE_DYNAMIC_IMPORT: boolean; // #endregion diff --git a/js/web/lib/wasm/wasm-utils-import.ts b/js/web/lib/wasm/wasm-utils-import.ts index c14941ee6afbe..f80bd7195d456 100644 --- a/js/web/lib/wasm/wasm-utils-import.ts +++ b/js/web/lib/wasm/wasm-utils-import.ts @@ -121,12 +121,28 @@ export const importProxyWorker = async(): Promise<[undefined | string, Worker]> return [url, createProxyWorker!(url)]; }; +/** + * The embedded WebAssembly module. + * + * This is only available in ESM and when embedding is not disabled. + */ +const embeddedWasmModule: EmscriptenModuleFactory|undefined = + BUILD_DEFS.IS_ESM && BUILD_DEFS.DISABLE_DYNAMIC_IMPORT ? + // eslint-disable-next-line @typescript-eslint/no-require-imports, @typescript-eslint/no-var-requires + require( + !BUILD_DEFS.DISABLE_TRAINING ? '../../dist/ort-training-wasm-simd-threaded.mjs' : + !BUILD_DEFS.DISABLE_JSEP ? '../../dist/ort-wasm-simd-threaded.jsep.mjs' : + '../../dist/ort-wasm-simd-threaded.mjs') + .default : + undefined; + /** * Import the WebAssembly module. * * This function will perform the following steps: - * 1. If a preload is needed, it will preload the module and return the object URL. - * 2. Otherwise, it will perform a dynamic import of the module. + * 1. If BUILD_DEFS.DISABLE_DYNAMIC_IMPORT is true, use the embedded module. + * 2. If a preload is needed, it will preload the module and return the object URL. + * 3. Otherwise, it will perform a dynamic import of the module. * * @returns - A promise that resolves to a tuple of 2 elements: * - The object URL of the preloaded module, or undefined if no preload is needed. @@ -135,22 +151,26 @@ export const importProxyWorker = async(): Promise<[undefined | string, Worker]> export const importWasmModule = async( urlOverride: string|undefined, prefixOverride: string|undefined, isMultiThreaded: boolean): Promise<[undefined | string, EmscriptenModuleFactory]> => { - const wasmModuleFilename = !BUILD_DEFS.DISABLE_TRAINING ? 'ort-training-wasm-simd-threaded.mjs' : - !BUILD_DEFS.DISABLE_JSEP ? 'ort-wasm-simd-threaded.jsep.mjs' : - 'ort-wasm-simd-threaded.mjs'; - const wasmModuleUrl = urlOverride ?? normalizeUrl(wasmModuleFilename, prefixOverride); - // need to preload if all of the following conditions are met: - // 1. not in Node.js. - // - Node.js does not have the same origin policy for creating workers. - // 2. multi-threaded is enabled. - // - If multi-threaded is disabled, no worker will be created. So we don't need to preload the module. - // 3. the absolute URL is available. - // - If the absolute URL is failed to be created, the origin cannot be determined. In this case, we will not - // preload the module. - // 4. the worker URL is not from the same origin. - // - If the worker URL is from the same origin, we can create the worker directly. - const needPreload = !isNode && isMultiThreaded && wasmModuleUrl && !isSameOrigin(wasmModuleUrl, prefixOverride); - const url = - needPreload ? (await preload(wasmModuleUrl)) : (wasmModuleUrl ?? fallbackUrl(wasmModuleFilename, prefixOverride)); - return [needPreload ? 
url : undefined, await dynamicImportDefault>(url)]; + if (BUILD_DEFS.DISABLE_DYNAMIC_IMPORT) { + return [undefined, embeddedWasmModule!]; + } else { + const wasmModuleFilename = !BUILD_DEFS.DISABLE_TRAINING ? 'ort-training-wasm-simd-threaded.mjs' : + !BUILD_DEFS.DISABLE_JSEP ? 'ort-wasm-simd-threaded.jsep.mjs' : + 'ort-wasm-simd-threaded.mjs'; + const wasmModuleUrl = urlOverride ?? normalizeUrl(wasmModuleFilename, prefixOverride); + // need to preload if all of the following conditions are met: + // 1. not in Node.js. + // - Node.js does not have the same origin policy for creating workers. + // 2. multi-threaded is enabled. + // - If multi-threaded is disabled, no worker will be created. So we don't need to preload the module. + // 3. the absolute URL is available. + // - If the absolute URL is failed to be created, the origin cannot be determined. In this case, we will not + // preload the module. + // 4. the worker URL is not from the same origin. + // - If the worker URL is from the same origin, we can create the worker directly. + const needPreload = !isNode && isMultiThreaded && wasmModuleUrl && !isSameOrigin(wasmModuleUrl, prefixOverride); + const url = needPreload ? (await preload(wasmModuleUrl)) : + (wasmModuleUrl ?? fallbackUrl(wasmModuleFilename, prefixOverride)); + return [needPreload ? url : undefined, await dynamicImportDefault>(url)]; + } }; diff --git a/js/web/script/build.ts b/js/web/script/build.ts index 7ef9bb6b70347..eba5efa3f11e0 100644 --- a/js/web/script/build.ts +++ b/js/web/script/build.ts @@ -57,6 +57,7 @@ const DEFAULT_DEFINE = { 'BUILD_DEFS.DISABLE_WASM': 'false', 'BUILD_DEFS.DISABLE_WASM_PROXY': 'false', 'BUILD_DEFS.DISABLE_TRAINING': 'true', + 'BUILD_DEFS.DISABLE_DYNAMIC_IMPORT': 'false', 'BUILD_DEFS.IS_ESM': 'false', 'BUILD_DEFS.ESM_IMPORT_META_URL': 'undefined', @@ -76,7 +77,102 @@ interface OrtBuildOptions { readonly define?: Record; } -const esbuildAlreadyBuilt = new Map(); +const terserAlreadyBuilt = new Map(); + +/** + * This function is only used to minify the Emscripten generated JS code. The ESBuild minify option is not able to + * tree-shake some unused code as expected. Specifically, there are 2 issues: + * 1. the use of `await import("module")` + * 2. the use of `await import("worker_threads")`, with top-level "await". + * + * The 2 code snippets mentioned above are guarded by feature checks to make sure they are only run in Node.js. However, + * ESBuild fails to tree-shake them and will include them in the final bundle. It will generate code like this: + * + * ```js + * // original code (example, not exact generated code) + * var isNode = typeof process !== 'undefined' && process.versions?.node; + * if (isNode) { + * const {createRequire} = await import('module'); + * ... + * } + * + * // minimized code (with setting "define: {'process': 'undefined'}") + * var x=!0;if(x){const{createRequire:rt}=await import("module");...} + * ``` + * + * The remaining dynamic import call makes trouble for further building steps. To solve this issue, we use Terser to + * minify the Emscripten generated JS code. Terser does more aggressive optimizations and is able to tree-shake the + * unused code with special configurations. + * + * We assume the minimized code does not contain any dynamic import calls. 
+ */ +async function minifyWasmModuleJsForBrowser(filepath: string): Promise { + const code = terserAlreadyBuilt.get(filepath); + if (code) { + return code; + } + + const doMinify = (async () => { + const TIME_TAG = `BUILD:terserMinify:${filepath}`; + console.time(TIME_TAG); + + const contents = await fs.readFile(filepath, {encoding: 'utf-8'}); + + // Find the first and the only occurrence of minified function implementation of "_emscripten_thread_set_strongref": + // ```js + // _emscripten_thread_set_strongref: (thread) => { + // if (ENVIRONMENT_IS_NODE) { + // PThread.pthreads[thread].ref(); + // } + // } + // ``` + // + // It is minified to: (example) + // ```js + // function Pb(a){D&&N[a>>>0].ref()} + // ``` + + // The following code will look for the function name and mark the function call as pure, so that Terser will + // minify the code correctly. + + const markedAsPure = []; + // First, try if we are working on the original (not minified) source file. This is when we are working with the + // debug build. + const isOriginal = contents.includes('PThread.pthreads[thread].ref()'); + if (isOriginal) { + markedAsPure.push('PThread.pthreads[thread].ref'); + } else { + // If it is not the original source file, we need to find the minified function call. + const matches = [...contents.matchAll(/\{[_a-zA-Z][_a-zA-Z0-9]*&&([_a-zA-Z][_a-zA-Z0-9]*\[.+?]\.ref)\(\)}/g)]; + if (matches.length !== 1) { + throw new Error(`Unexpected number of matches for minified "PThread.pthreads[thread].ref()" in "${filepath}": ${ + matches.length}.`); + } + // matches[0] is the first and the only match. + // matches[0][0] is the full matched string and matches[0][1] is the first capturing group. + markedAsPure.push(matches[0][1]); + } + + const terser = await import('terser'); + const result = await terser.minify(contents, { + module: true, + compress: { + passes: 2, + global_defs: {'process': undefined, 'globalThis.process': undefined}, + pure_funcs: markedAsPure, + }, + }); + + console.timeEnd(TIME_TAG); + + return result.code!; + })(); + + terserAlreadyBuilt.set(filepath, doMinify); + return doMinify; +} + +const esbuildAlreadyBuilt = new Map(); async function buildBundle(options: esbuild.BuildOptions) { // Skip if the same build options have been built before. const serializedOptions = JSON.stringify(options); @@ -162,18 +258,31 @@ async function buildOrt({ const platform = isNode ? 'node' : 'browser'; const external = isNode ? ['onnxruntime-common'] : ['node:fs/promises', 'node:fs', 'node:os', 'module', 'worker_threads']; + const plugins: esbuild.Plugin[] = []; const defineOverride: Record = {}; if (!isNode) { defineOverride.process = 'undefined'; defineOverride['globalThis.process'] = 'undefined'; } + if (define['BUILD_DEFS.DISABLE_DYNAMIC_IMPORT'] === 'true') { + plugins.push({ + name: 'emscripten-mjs-handler', + setup(build: esbuild.PluginBuild) { + build.onLoad( + {filter: /dist[\\/]ort-.*wasm.*\.mjs$/}, + async args => ({contents: await minifyWasmModuleJsForBrowser(args.path)})); + } + }); + } + await buildBundle({ entryPoints: ['web/lib/index.ts'], outfile: `web/dist/${outputName}${isProduction ? '.min' : ''}.${format === 'esm' ? 'mjs' : 'js'}`, platform, format, globalName: 'ort', + plugins, external, define: {...define, ...defineOverride}, sourcemap: isProduction ? 
'linked' : 'inline',
@@ -280,8 +389,8 @@ async function postProcess() {
       }
     }
     if (!found) {
-      if (file.includes('webgl')) {
-        // skip webgl
+      if (file.includes('.webgl.') || file.includes('.bundle.')) {
+        // skip webgl and bundle, they don't have dynamic import calls.
         continue;
       }
       throw new Error(`Dynamic import call not found in "${jsFilePath}". Should not happen.`);
@@ -363,7 +472,7 @@ async function validate() {
 
       // all files should contain the magic comment to ignore dynamic import calls.
       //
-      if (!file.includes('webgl') && !file.startsWith('ort.esm.')) {
+      if (!file.includes('.webgl.') && !file.includes('.bundle.')) {
         const contentToSearch = isMinified ? '/*webpackIgnore:true*/' : '/* webpackIgnore: true */';
         if (!content.includes(contentToSearch)) {
           throw new Error(`Validation failed: "${file}" does not contain magic comment.`);
@@ -457,17 +566,40 @@ async function main() {
   if (BUNDLE_MODE === 'prod') {
     // ort.all[.min].[m]js
     await addAllWebBuildTasks({outputName: 'ort.all'});
+    // ort.all.bundle.min.mjs
+    await buildOrt({
+      isProduction: true,
+      outputName: 'ort.all.bundle',
+      format: 'esm',
+      define: {...DEFAULT_DEFINE, 'BUILD_DEFS.DISABLE_DYNAMIC_IMPORT': 'true'},
+    });
 
     // ort[.min].[m]js
     await addAllWebBuildTasks({
       outputName: 'ort',
       define: {...DEFAULT_DEFINE, 'BUILD_DEFS.DISABLE_JSEP': 'true'},
     });
+    // ort.bundle.min.mjs
+    await buildOrt({
+      isProduction: true,
+      outputName: 'ort.bundle',
+      format: 'esm',
+      define: {...DEFAULT_DEFINE, 'BUILD_DEFS.DISABLE_JSEP': 'true', 'BUILD_DEFS.DISABLE_DYNAMIC_IMPORT': 'true'},
+    });
+
     // ort.webgpu[.min].[m]js
     await addAllWebBuildTasks({
       outputName: 'ort.webgpu',
       define: {...DEFAULT_DEFINE, 'BUILD_DEFS.DISABLE_WEBGL': 'true'},
    });
+    // ort.webgpu.bundle.min.mjs
+    await buildOrt({
+      isProduction: true,
+      outputName: 'ort.webgpu.bundle',
+      format: 'esm',
+      define: {...DEFAULT_DEFINE, 'BUILD_DEFS.DISABLE_WEBGL': 'true', 'BUILD_DEFS.DISABLE_DYNAMIC_IMPORT': 'true'},
+    });
+
     // ort.wasm[.min].[m]js
     await addAllWebBuildTasks({
       outputName: 'ort.wasm',
diff --git a/js/web/test/e2e/run-data.js b/js/web/test/e2e/run-data.js
index 58371bafd276d..507192f29be9c 100644
--- a/js/web/test/e2e/run-data.js
+++ b/js/web/test/e2e/run-data.js
@@ -30,6 +30,12 @@ const BROWSER_TEST_CASES = [
   [true, true, './browser-test-wasm.js', 'ort.min.mjs', ['num_threads=2', 'proxy=1']],  // wasm, 2 threads, proxy
   [true, true, './browser-test-wasm.js', 'ort.min.mjs', ['num_threads=1', 'proxy=1']],  // wasm, 1 thread, proxy
 
+  // ort.bundle.min.mjs
+  [true, false, './browser-test-wasm.js', 'ort.bundle.min.mjs', ['num_threads=1']],             // 1 thread
+  [true, false, './browser-test-wasm.js', 'ort.bundle.min.mjs', ['num_threads=2']],             // 2 threads
+  [true, false, './browser-test-wasm.js', 'ort.bundle.min.mjs', ['num_threads=2', 'proxy=1']],  // 2 threads, proxy
+  [true, false, './browser-test-wasm.js', 'ort.bundle.min.mjs', ['num_threads=1', 'proxy=1']],  // 1 thread, proxy
+
   // path override:
   // wasm, path override filenames for both mjs and wasm, same origin
   [true, false, './browser-test-wasm-path-override-filename.js', 'ort.min.js', ['port=9876', 'files=mjs,wasm']],

From ae8df4db8f3422e4461375dc76f37c2875d7d301 Mon Sep 17 00:00:00 2001
From: Jian Chen
Date: Mon, 3 Jun 2024 14:08:45 -0700
Subject: [PATCH 04/26] Split java's gradle build and test (#20817)

### Description
This PR allows the `./gradlew cmakeCheck` step to fail in the Windows_Packaging_(CUDA|TensorRT) jobs.
This way, it will still generate all nessary jar and pom file need for later stage to consume while `./gradlew cmakeCheck`will be also run again in the Windows_Packaging_(CUDA|TensorRT)_Testing stage. ### Motivation and Context Reduce the time of All java packaging stages by 30+ min. --- .../stages/java-cuda-packaging-stage.yml | 6 +-- .../templates/make_java_win_binaries.yml | 15 +++++-- .../azure-pipelines/templates/win-ci.yml | 39 +++++++++---------- 3 files changed, 33 insertions(+), 27 deletions(-) diff --git a/tools/ci_build/github/azure-pipelines/stages/java-cuda-packaging-stage.yml b/tools/ci_build/github/azure-pipelines/stages/java-cuda-packaging-stage.yml index 5f355478f2da0..1d5b810dfe726 100644 --- a/tools/ci_build/github/azure-pipelines/stages/java-cuda-packaging-stage.yml +++ b/tools/ci_build/github/azure-pipelines/stages/java-cuda-packaging-stage.yml @@ -10,10 +10,8 @@ stages: - stage: Jar_Packaging_GPU dependsOn: - Linux_C_API_Packaging_GPU -# Because Java Jar is published only after Windows Packaging GPU Testing stage we need to depend on the Testing stages -# TODO: change Windows_Packaging_*_Testing to Windows_Packaging_* once we finish PRODUCT BACKLOG ITEM 34666 - - Windows_Packaging_CUDA_Testing - - Windows_Packaging_TensorRT_Testing + - Windows_Packaging_CUDA + - Windows_Packaging_TensorRT - Download_Java_Tools jobs: - job: Jar_Packaging_GPU diff --git a/tools/ci_build/github/azure-pipelines/templates/make_java_win_binaries.yml b/tools/ci_build/github/azure-pipelines/templates/make_java_win_binaries.yml index 756a7a48343a3..9fa9f5e4a4869 100644 --- a/tools/ci_build/github/azure-pipelines/templates/make_java_win_binaries.yml +++ b/tools/ci_build/github/azure-pipelines/templates/make_java_win_binaries.yml @@ -3,16 +3,25 @@ parameters: type: string - name: java_artifact_id type: string + - name: buildOnly + type: boolean steps: + - task: CmdLine@2 + displayName: 'Gradle cmakeCheck' + continueOnError: ${{ parameters.buildOnly }} + inputs: + script: | + @echo on + call gradlew.bat cmakeCheck -DcmakeBuildDir=$(Build.BinariesDirectory)\RelWithDebInfo + workingDirectory: $(Build.SourcesDirectory)\java + failOnStderr: ${{ not(parameters.buildOnly) }} + - task: CmdLine@2 displayName: 'Add symbols and notices to Java' inputs: script: | @echo on - cd $(Build.SourcesDirectory)\java - call $(Build.SourcesDirectory)\java\gradlew.bat cmakeCheck -DcmakeBuildDir=$(Build.BinariesDirectory)\RelWithDebInfo - if %errorlevel% neq 0 exit /b %errorlevel% cd $(Build.BinariesDirectory)\RelWithDebInfo set NATIVE_FOLDER=$(Build.BinariesDirectory)\onnxruntime-java-win-${{ parameters.msbuildPlatform }}\stage\ai\onnxruntime\native\win-x64 mkdir %NATIVE_FOLDER% diff --git a/tools/ci_build/github/azure-pipelines/templates/win-ci.yml b/tools/ci_build/github/azure-pipelines/templates/win-ci.yml index 35c23e311ed5d..c726054d8eb10 100644 --- a/tools/ci_build/github/azure-pipelines/templates/win-ci.yml +++ b/tools/ci_build/github/azure-pipelines/templates/win-ci.yml @@ -191,19 +191,24 @@ stages: createLogFile: true # For CPU job, tests are run in the same machine as building + - ${{ if eq(parameters.buildJava, 'true') }}: + - template: make_java_win_binaries.yml + parameters: + msbuildPlatform: ${{ parameters.msbuildPlatform }} + java_artifact_id: ${{ parameters.java_artifact_id }} + ${{ if contains(parameters.ort_build_pool_name, 'CPU') }}: + buildOnly: false + # When it is a GPU build, we only assemble the java binaries, testing will be done in the later stage with GPU machine + ${{ else }}: + buildOnly: 
true + + - task: PublishBuildArtifacts@1 + displayName: 'Publish Java temp binaries' + inputs: + pathtoPublish: '$(Build.BinariesDirectory)\onnxruntime-java-win-${{ parameters.msbuildPlatform }}' + artifactName: 'drop-onnxruntime-java-win-${{ parameters.packageName }}${{parameters.artifact_name_suffix}}' + # All GPU builds will be tested in the next stage with GPU machine - ${{ if contains(parameters.ort_build_pool_name, 'CPU') }}: - - ${{ if eq(parameters.buildJava, 'true') }}: - - template: make_java_win_binaries.yml - parameters: - msbuildPlatform: ${{ parameters.msbuildPlatform }} - java_artifact_id: ${{ parameters.java_artifact_id }} - - - task: PublishBuildArtifacts@1 - condition: and(succeeded(), eq('${{ parameters.buildJava}}', true)) - displayName: 'Publish Java temp binaries' - inputs: - pathtoPublish: '$(Build.BinariesDirectory)\onnxruntime-java-win-${{ parameters.msbuildPlatform }}' - artifactName: 'drop-onnxruntime-java-win-${{ parameters.packageName }}${{parameters.artifact_name_suffix}}' - task: PythonScript@0 displayName: 'test' condition: and(succeeded(), eq('${{ parameters.runTests}}', true)) @@ -386,16 +391,10 @@ stages: scriptPath: '$(Build.SourcesDirectory)\tools\ci_build\build.py' arguments: '--config RelWithDebInfo --use_binskim_compliant_compile_flags --enable_lto --disable_rtti --build_dir $(Build.BinariesDirectory) --skip_submodule_sync --build_shared_lib --test --enable_onnx_tests $(TelemetryOption) ' workingDirectory: '$(Build.BinariesDirectory)' - +# Previous stage only assembles the java binaries, testing will be done in this stage with GPU machine - ${{ if eq(parameters.buildJava, 'true') }}: - template: make_java_win_binaries.yml parameters: msbuildPlatform: ${{ parameters.msbuildPlatform }} java_artifact_id: ${{ parameters.java_artifact_id }} - - - task: PublishBuildArtifacts@1 - condition: and(succeeded(), eq('${{ parameters.buildJava}}', true)) - displayName: 'Publish Java temp binaries' - inputs: - pathtoPublish: '$(Build.BinariesDirectory)\onnxruntime-java-win-${{ parameters.msbuildPlatform }}' - artifactName: 'drop-onnxruntime-java-win-${{ parameters.packageName }}${{parameters.artifact_name_suffix}}' + buildOnly: false \ No newline at end of file From c128132dd8aaabd9d132015f90be823b36bb0fec Mon Sep 17 00:00:00 2001 From: Wanming Lin Date: Tue, 4 Jun 2024 05:10:22 +0800 Subject: [PATCH 05/26] [WebNN EP] TFLite backend only supports Elu with default alpha (#20862) --- js/web/docs/webnn-operators.md | 2 +- .../webnn/builders/impl/activation_op_builder.cc | 14 ++++++++++++-- 2 files changed, 13 insertions(+), 3 deletions(-) diff --git a/js/web/docs/webnn-operators.md b/js/web/docs/webnn-operators.md index bcabb6896f339..7ec3eda8ce24b 100644 --- a/js/web/docs/webnn-operators.md +++ b/js/web/docs/webnn-operators.md @@ -25,7 +25,7 @@ operators and the supported opset domain/versions in **WebNN EP** by ONNX Runtim | ConvTranspose | ai.onnx(7-10, 11+) | convTranspose2d | ✓ | ✗ | Only supports 3-D or 4-D input and 'W' (weight). 
| | Cos | ai.onnx(7+) | cos | ✗ | ✓ | | | Div | ai.onnx(7-12, 13, 14+) | div | ✓ | ✓ | | -| Elu | ai.onnx(7+) | elu | ✓ | ✓ | | +| Elu | ai.onnx(7+) | elu | ✓ | ✓ | WebNN CPU backend only supports 'alpha' value is 1.0 | | Equal | ai.onnx(7-10, 11-12, 13-18, 19+) | equal | ✗ | ✓ | | | Erf | ai.onnx(7-9, 10-12, 13+) | erf | ✗ | ✓ | | | Exp | ai.onnx(7-12, 13+) | exp | ✗ | ✓ | | diff --git a/onnxruntime/core/providers/webnn/builders/impl/activation_op_builder.cc b/onnxruntime/core/providers/webnn/builders/impl/activation_op_builder.cc index 163c9b0fb91d3..af0f0133b497a 100644 --- a/onnxruntime/core/providers/webnn/builders/impl/activation_op_builder.cc +++ b/onnxruntime/core/providers/webnn/builders/impl/activation_op_builder.cc @@ -20,7 +20,7 @@ class ActivationOpBuilder : public BaseOpBuilder { // Operator support related. bool IsOpSupportedImpl(const InitializedTensorSet& initializers, const Node& node, - WebnnDeviceType /* device_type */, const logging::Logger& logger) const override; + WebnnDeviceType device_type, const logging::Logger& logger) const override; bool HasSupportedInputsImpl(const Node& node, const WebnnDeviceType device_type, const logging::Logger& logger) const override; }; @@ -72,14 +72,24 @@ Status ActivationOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, // Operator support related. bool ActivationOpBuilder::IsOpSupportedImpl(const InitializedTensorSet& /* initializers */, const Node& node, - WebnnDeviceType /* device_type */, + WebnnDeviceType device_type, const logging::Logger& logger) const { const auto& input_defs = node.InputDefs(); + const auto& op_type = node.OpType(); std::vector input_shape; if (!GetShape(*input_defs[0], input_shape, logger)) return false; + if (op_type == "Elu" && device_type == WebnnDeviceType::CPU) { + NodeAttrHelper helper(node); + float alpha = helper.Get("alpha", 1.0f); + if (alpha != 1.0f) { + LOGS(logger, VERBOSE) << "WebNN CPU backend only supports Elu's alpha == 1.0"; + return false; + } + } + return true; } From 9c6481fa2def0a0f78a0cf11acae9b5732d7c828 Mon Sep 17 00:00:00 2001 From: Wanming Lin Date: Tue, 4 Jun 2024 05:12:11 +0800 Subject: [PATCH 06/26] [WebNN EP] Enable ArgMax and ArgMin for CPU backend (#20865) WebNN TFLite backend supports ArgMax and ArgMin, but only supports 'select_last_index' value is 0. 
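For illustration, a minimal NumPy sketch of what `select_last_index` controls (made-up values, not part of the patch): the attribute only changes which index is reported when the extreme value occurs more than once, and the TFLite backend can only express the default behaviour, so the EP rejects the non-default setting.

```python
# Illustration only: the two ArgMax behaviours selected by 'select_last_index'.
import numpy as np

x = np.array([1, 3, 3, 2])

# select_last_index = 0 (default): first occurrence of the maximum.
first = np.argmax(x)                    # -> 1

# select_last_index = 1: last occurrence of the maximum.
last = x.size - 1 - np.argmax(x[::-1])  # -> 2

print(first, last)
```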
--- js/web/docs/webnn-operators.md | 4 ++-- onnxruntime/core/providers/webnn/builders/helper.h | 4 ++-- .../webnn/builders/impl/argmax_min_op_builder.cc | 13 +++++++++++-- 3 files changed, 15 insertions(+), 6 deletions(-) diff --git a/js/web/docs/webnn-operators.md b/js/web/docs/webnn-operators.md index 7ec3eda8ce24b..1df40b71a00fa 100644 --- a/js/web/docs/webnn-operators.md +++ b/js/web/docs/webnn-operators.md @@ -13,8 +13,8 @@ operators and the supported opset domain/versions in **WebNN EP** by ONNX Runtim |:------:|:------:|:------:|:-:|:-:|:------| | Abs | ai.onnx(7-12, 13+) | abs | ✓ | ✓ | | | Add | ai.onnx(7-12, 13, 14+) | add | ✓ | ✓ | | -| ArgMax | ai.onnx(7-10, 11, 12, 13+) | argMax | ✗ | ✓ | | -| ArgMin | ai.onnx(7-10, 11, 12, 13+) | argMin | ✗ | ✓ | | +| ArgMax | ai.onnx(7-10, 11, 12, 13+) | argMax | ✓ | ✓ | WebNN CPU backend only supports 'select_last_index' value is 0 | +| ArgMin | ai.onnx(7-10, 11, 12, 13+) | argMin | ✓ | ✓ | WebNN CPU backend only supports 'select_last_index' value is 0 | | AveragePool | ai.onnx(7-9, 10, 11, 12-18, 19+) | averagePool2d | ✓ | ✓ | Only supports 4-D input, 2-D 'kernel_shape', 'count_include_pad' value is 0 | | BatchNormalization | ai.onnx(7-8, 9-13, 14, 15+) | batchNormalization | ✗ | ✓ | Only supports 'training_mode' value is 0, one output | | Cast | ai.onnx(7-8, 9-12, 13-18, 19-20, 21+) | cast | ✗ | ✓ | | diff --git a/onnxruntime/core/providers/webnn/builders/helper.h b/onnxruntime/core/providers/webnn/builders/helper.h index 486f7f69be15c..7c84a14ba7d88 100644 --- a/onnxruntime/core/providers/webnn/builders/helper.h +++ b/onnxruntime/core/providers/webnn/builders/helper.h @@ -157,8 +157,8 @@ std::vector> GetSupportedNodes(const GraphViewer& graph_v static const InlinedHashMap op_map = { {"Abs", {"abs", true}}, {"Add", {"add", true}}, - {"ArgMax", {"argMax", false}}, - {"ArgMin", {"argMin", false}}, + {"ArgMax", {"argMax", true}}, + {"ArgMin", {"argMin", true}}, {"AveragePool", {"averagePool2d", true}}, {"BatchNormalization", {"batchNormalization", false}}, {"Cast", {"cast", false}}, diff --git a/onnxruntime/core/providers/webnn/builders/impl/argmax_min_op_builder.cc b/onnxruntime/core/providers/webnn/builders/impl/argmax_min_op_builder.cc index 7926311f3c4e6..f8b77b6350a76 100644 --- a/onnxruntime/core/providers/webnn/builders/impl/argmax_min_op_builder.cc +++ b/onnxruntime/core/providers/webnn/builders/impl/argmax_min_op_builder.cc @@ -21,7 +21,7 @@ class ArgMaxMinOpBuilder : public BaseOpBuilder { // Operator support related. bool IsOpSupportedImpl(const InitializedTensorSet& initializers, const Node& node, - WebnnDeviceType /* device_type */, const logging::Logger& logger) const override; + WebnnDeviceType device_type, const logging::Logger& logger) const override; bool HasSupportedInputsImpl(const Node& node, const WebnnDeviceType device_type, const logging::Logger& logger) const override; }; @@ -68,7 +68,7 @@ Status ArgMaxMinOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, // Operator support related. bool ArgMaxMinOpBuilder::IsOpSupportedImpl(const InitializedTensorSet& /* initializers */, const Node& node, - WebnnDeviceType /* device_type */, + WebnnDeviceType device_type, const logging::Logger& logger) const { const auto& input_defs = node.InputDefs(); @@ -76,6 +76,15 @@ bool ArgMaxMinOpBuilder::IsOpSupportedImpl(const InitializedTensorSet& /* initia if (!GetShape(*input_defs[0], input_shape, logger)) return false; + // WebNN CPU backend only supports select_last_index = 0. 
+ if (device_type == WebnnDeviceType::CPU) { + NodeAttrHelper helper(node); + const auto select_last_index = helper.Get("select_last_index", 0); + if (select_last_index) { + LOGS(logger, VERBOSE) << "ArgMax/ArgMin with select_last_index = 1 is not supported on WebNN CPU backend."; + return false; + } + } return true; } From 456ab09d179722b4742510e3886c8b4040816b1a Mon Sep 17 00:00:00 2001 From: Jian Chen Date: Mon, 3 Jun 2024 14:22:22 -0700 Subject: [PATCH 07/26] Component Governance fix round 5 (#20905) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit …over the case where there is only single repo checked out ### Description adding $(Build.SourcesDirectory)/cmake/external/onnx/third_party to cover the case where there is only single repo checked out ### Motivation and Context Fix CG issue https://aiinfra.visualstudio.com/Lotus/_componentGovernance/97926/alert/8862110?typeId=16576846 --- .../templates/component-governance-component-detection-steps.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/ci_build/github/azure-pipelines/templates/component-governance-component-detection-steps.yml b/tools/ci_build/github/azure-pipelines/templates/component-governance-component-detection-steps.yml index 7945de295f92c..62785b6413e6a 100644 --- a/tools/ci_build/github/azure-pipelines/templates/component-governance-component-detection-steps.yml +++ b/tools/ci_build/github/azure-pipelines/templates/component-governance-component-detection-steps.yml @@ -29,6 +29,7 @@ steps: ignoreDirectories: '$(Build.Repository.LocalPath)/cmake/external/emsdk/upstream/emscripten/tests, $(Build.Repository.LocalPath)/cmake/external/onnx/third_party, + $(Build.SourcesDirectory)/cmake/external/onnx/third_party, $(Build.Repository.LocalPath)/cmake/external/onnxruntime-extensions, $(Build.Repository.LocalPath)/js/react_native/e2e/node_modules, $(Build.SourcesDirectory)/onnxruntime-inference-examples, From 94ce1209f9ce1f0efeb76f817fcb0efd63604889 Mon Sep 17 00:00:00 2001 From: Caroline Zhu Date: Mon, 3 Jun 2024 14:41:39 -0700 Subject: [PATCH 08/26] Bug fix for gather fusion with on-device training (#20891) ### Description Update the initializer that's added in GatherSliceToSplitFusion to use the GenerateNodeArgName function, rather than the GenerateNodeName function. GenerateNodeName goes through all the nodes in the graph to see if the given name is already used and generates a unique one if it has been used. GenerateNodeArgName iterates through all the node args in the graph to see if the given name is already used. ### Motivation and Context * on-device training goes through a generate artifacts step, where optimizations are applied, then, when the training artifact is loaded, additional optimizations are applied. In the first round of optimizations, a "splits" initializer is added for phi-3. With the second round of optimizations, another "splits" initializer with different dimensions and data is added. Since we call GenerateNodeName func, the first splits initializer isn't found, causing a type error where it claims the shape of splits does not match the TensorProto shape. 
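The distinction matters because `GenerateNodeName` deduplicates against existing node names while `GenerateNodeArgName` deduplicates against existing node arg (tensor/initializer) names; "splits" is an initializer, so only the latter catches the clash. A minimal sketch of that difference, with made-up name sets and a hypothetical helper (not ONNX Runtime APIs):

```python
# Sketch only: why checking the wrong namespace lets a duplicate initializer name through.
def generate_unique(base, taken):
    name, i = base, 0
    while name in taken:
        i += 1
        name = f"{base}_{i}"
    return name

node_names = {"Gather_0", "Slice_1"}          # graph node names
node_arg_names = {"splits", "input", "axes"}  # tensor / initializer names

# GenerateNodeName-style check: "splits" is not a node name, so it comes back unchanged
# and silently collides with the existing "splits" initializer of a different shape.
print(generate_unique("splits", node_names))      # -> "splits"

# GenerateNodeArgName-style check: the clash is seen and a fresh name is produced.
print(generate_unique("splits", node_arg_names))  # -> "splits_1"
```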
--- onnxruntime/core/optimizer/gather_fusion.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/onnxruntime/core/optimizer/gather_fusion.cc b/onnxruntime/core/optimizer/gather_fusion.cc index 1f2b31526c6b8..2bde320786130 100644 --- a/onnxruntime/core/optimizer/gather_fusion.cc +++ b/onnxruntime/core/optimizer/gather_fusion.cc @@ -268,7 +268,7 @@ Status GatherSliceToSplitFusion::ApplyImpl(Graph& graph, bool& modified, int gra } ONNX_NAMESPACE::TensorProto split_initializer_proto; - split_initializer_proto.set_name(graph.GenerateNodeName("splits")); + split_initializer_proto.set_name(graph.GenerateNodeArgName("splits")); split_initializer_proto.set_data_type(ONNX_NAMESPACE::TensorProto_DataType_INT64); split_initializer_proto.add_dims(static_cast(split_values.size())); split_initializer_proto.mutable_int64_data()->Add(split_values.begin(), split_values.end()); From 3c561c8b266704fe315c1cd3f24673f6ab27ce03 Mon Sep 17 00:00:00 2001 From: zhijiang <43435212+zhijxu-MS@users.noreply.github.com> Date: Tue, 4 Jun 2024 09:22:10 +0800 Subject: [PATCH 09/26] fix bug (#20694) when num of elem in tensor large than 2^32, then we can use cuda_long as dtype of offset --- .../cuda/cu_inc/binary_elementwise_impl.cuh | 248 +++++++++++------- .../cpu/math/element_wise_ops_test.cc | 16 ++ 2 files changed, 163 insertions(+), 101 deletions(-) diff --git a/onnxruntime/core/providers/cuda/cu_inc/binary_elementwise_impl.cuh b/onnxruntime/core/providers/cuda/cu_inc/binary_elementwise_impl.cuh index a41888d0df48b..1469f55f0bfda 100644 --- a/onnxruntime/core/providers/cuda/cu_inc/binary_elementwise_impl.cuh +++ b/onnxruntime/core/providers/cuda/cu_inc/binary_elementwise_impl.cuh @@ -2,6 +2,7 @@ // Licensed under the MIT License. #pragma once +#include #include #include "core/providers/cuda/shared_inc/cuda_utils.h" #include "core/providers/cuda/cu_inc/common.cuh" @@ -11,7 +12,8 @@ namespace cuda { // broadcast by computing output coordinate from offset, using fast_divmod template + bool lhs_need_compute, bool rhs_need_compute, int NumThreadsPerBlock, int NumElementsPerThread, + typename NumElemT> __global__ void _BinaryElementWise( int32_t output_rank, const TArray lhs_padded_strides, @@ -21,19 +23,19 @@ __global__ void _BinaryElementWise( const TArray fdm_output_strides, T* output_data, const FuncT& functor, - CUDA_LONG N) { - CUDA_LONG start = NumElementsPerThread * NumThreadsPerBlock * blockIdx.x + threadIdx.x; + NumElemT N) { + NumElemT start = NumElementsPerThread * NumThreadsPerBlock * blockIdx.x + threadIdx.x; T1 lvalue[NumElementsPerThread]; T2 rvalue[NumElementsPerThread]; - CUDA_LONG id = start; + NumElemT id = start; #pragma unroll for (int i = 0; i < NumElementsPerThread; i++) { if (id < N) { - CUDA_LONG lhs_index = (lhs_need_compute ? 0 : id); - CUDA_LONG rhs_index = (rhs_need_compute ? 0 : id); + NumElemT lhs_index = (lhs_need_compute ? 0 : id); + NumElemT rhs_index = (rhs_need_compute ? 
0 : id); // compute indexes with broadcasting rules: https://github.com/onnx/onnx/blob/main/docs/Broadcasting.md - CUDA_LONG offset = id; + NumElemT offset = id; #pragma unroll for (auto dim = 0; dim < fdm_output_strides.Capacity(); dim++) { if (dim >= output_rank) { @@ -69,18 +71,19 @@ __global__ void _BinaryElementWise( } // for scalar broadcast or non-broadcast case -template +template __global__ void _BinaryElementWiseSimple( const T1* lhs_data, const T2* rhs_data, T* output_data, const FuncT func, - CUDA_LONG N) { - CUDA_LONG start = NumElementsPerThread * NumThreadsPerBlock * blockIdx.x + threadIdx.x; + NumElemT N) { + NumElemT start = NumElementsPerThread * NumThreadsPerBlock * blockIdx.x + threadIdx.x; T1 lvalue[NumElementsPerThread]; T2 rvalue[NumElementsPerThread]; - CUDA_LONG id = start; + NumElemT id = start; #pragma unroll for (int i = 0; i < NumElementsPerThread; i++) { if (id < N) { @@ -103,23 +106,24 @@ __global__ void _BinaryElementWiseSimple( } // for rhs per-channel broadcast case -template +template __global__ void _BinaryElementWiseRhsPerChannelBatch1( const T1* lhs_data, const T2* rhs_data, const fast_divmod fdm_H, T* output_data, FuncT func, - CUDA_LONG N) { - CUDA_LONG start = NumElementsPerThread * NumThreadsPerBlock * blockIdx.x + threadIdx.x; + NumElemT N) { + NumElemT start = NumElementsPerThread * NumThreadsPerBlock * blockIdx.x + threadIdx.x; T1 lvalue[NumElementsPerThread]; T2 rvalue[NumElementsPerThread]; - CUDA_LONG id = start; + NumElemT id = start; #pragma unroll for (int i = 0; i < NumElementsPerThread; i++) { if (id < N) { - CUDA_LONG rhs_id = fdm_H.div(id); + NumElemT rhs_id = fdm_H.div(id); lvalue[i] = lhs_data[id]; rvalue[i] = rhs_data[rhs_id]; @@ -138,7 +142,8 @@ __global__ void _BinaryElementWiseRhsPerChannelBatch1( } } -template +template __global__ void _BinaryElementWiseRhsPerChannelBatchN( const T1* lhs_data, const T2* rhs_data, @@ -146,16 +151,16 @@ __global__ void _BinaryElementWiseRhsPerChannelBatchN( const fast_divmod fdm_C, T* output_data, FuncT func, - CUDA_LONG N) { - CUDA_LONG start = NumElementsPerThread * NumThreadsPerBlock * blockIdx.x + threadIdx.x; + NumElemT N) { + NumElemT start = NumElementsPerThread * NumThreadsPerBlock * blockIdx.x + threadIdx.x; T1 lvalue[NumElementsPerThread]; T2 rvalue[NumElementsPerThread]; - CUDA_LONG id = start; + NumElemT id = start; #pragma unroll for (int i = 0; i < NumElementsPerThread; i++) { if (id < N) { - CUDA_LONG rhs_id = fdm_H.div(id); + NumElemT rhs_id = fdm_H.div(id); int q, r; fdm_C.divmod(rhs_id, q, r); rhs_id = r; @@ -189,27 +194,34 @@ void BinaryElementWiseNoBroadcastImpl( if (count == 0) // special case where there's a dim value of 0 in the output shape return; - #ifdef USE_ROCM +#ifdef USE_ROCM const int num_elements_per_thread = 2; const int num_threads_per_block = 512; - #else +#else const int num_elements_per_thread = GridDim::maxElementsPerThread; const int num_threads_per_block = GridDim::maxThreadsPerBlock; - #endif +#endif int blocksPerGrid = static_cast(CeilDiv(count, num_threads_per_block * num_elements_per_thread)); - CUDA_LONG N = static_cast(count); - _BinaryElementWiseSimple<<>>( - lhs_data, - rhs_data, - output_data, - func, - N); - +#define FUNC_CALL(NumElemT) \ + _BinaryElementWiseSimple \ + <<>>( \ + lhs_data, \ + rhs_data, \ + output_data, \ + func, \ + static_cast(N)); + size_t N = static_cast(count); + if (N > static_cast(std::numeric_limits::max())) { + FUNC_CALL(size_t); + } else { + FUNC_CALL(CUDA_LONG); + } +#undef FUNC_CALL } -template -void 
BinaryElementWiseImpl( +template +void _BinaryElementWiseImpl( cudaStream_t stream, int32_t output_rank_or_simple_broadcast, const TArray* lhs_padded_strides, @@ -225,90 +237,124 @@ void BinaryElementWiseImpl( if (count == 0) // special case where there's a dim value of 0 in the output shape return; - #ifdef USE_ROCM +#ifdef USE_ROCM const int num_elements_per_thread = 2; const int num_threads_per_block = 512; - #else +#else const int num_elements_per_thread = GridDim::maxElementsPerThread; const int num_threads_per_block = GridDim::maxThreadsPerBlock; - #endif +#endif int blocksPerGrid = static_cast(CeilDiv(count, num_threads_per_block * num_elements_per_thread)); - CUDA_LONG N = static_cast(count); + NumElemT N = static_cast(count); if (output_rank_or_simple_broadcast == static_cast(SimpleBroadcast::NoBroadcast)) { - _BinaryElementWiseSimple<<>>( - lhs_data, - rhs_data, - output_data, - func, - N); + _BinaryElementWiseSimple + <<>>( + lhs_data, + rhs_data, + output_data, + func, + N); } else if (output_rank_or_simple_broadcast == static_cast(SimpleBroadcast::LeftScalar)) { - _BinaryElementWiseSimple<<>>( - lhs_data, - rhs_data, - output_data, - func, - N); + _BinaryElementWiseSimple + <<>>( + lhs_data, + rhs_data, + output_data, + func, + N); } else if (output_rank_or_simple_broadcast == static_cast(SimpleBroadcast::RightScalar)) { - _BinaryElementWiseSimple<<>>( - lhs_data, - rhs_data, - output_data, - func, - N); + _BinaryElementWiseSimple + <<>>( + lhs_data, + rhs_data, + output_data, + func, + N); } else if (output_rank_or_simple_broadcast == static_cast(SimpleBroadcast::RightPerChannelBatch1)) { - _BinaryElementWiseRhsPerChannelBatch1<<>>( - lhs_data, - rhs_data, - fdm_H, - output_data, - func, - N); + _BinaryElementWiseRhsPerChannelBatch1 + <<>>( + lhs_data, + rhs_data, + fdm_H, + output_data, + func, + N); } else if (output_rank_or_simple_broadcast == static_cast(SimpleBroadcast::RightPerChannelBatchN)) { - _BinaryElementWiseRhsPerChannelBatchN<<>>( - lhs_data, - rhs_data, - fdm_H, - fdm_C, - output_data, - func, - N); + _BinaryElementWiseRhsPerChannelBatchN + <<>>( + lhs_data, + rhs_data, + fdm_H, + fdm_C, + output_data, + func, + N); } else { if (lhs_padded_strides && rhs_padded_strides && lhs_padded_strides->Size() && rhs_padded_strides->Size()) - _BinaryElementWise<<>>( - output_rank_or_simple_broadcast, - *lhs_padded_strides, - lhs_data, - *rhs_padded_strides, - rhs_data, - *fdm_output_strides, - output_data, - func, - N); + _BinaryElementWise + <<>>( + output_rank_or_simple_broadcast, + *lhs_padded_strides, + lhs_data, + *rhs_padded_strides, + rhs_data, + *fdm_output_strides, + output_data, + func, + N); else if (lhs_padded_strides && lhs_padded_strides->Size()) - _BinaryElementWise<<>>( - output_rank_or_simple_broadcast, - *lhs_padded_strides, - lhs_data, - TArray(), // rhs is not computed, so no need to deference rhs_padded_strides - rhs_data, - *fdm_output_strides, - output_data, - func, - N); + _BinaryElementWise + <<>>( + output_rank_or_simple_broadcast, + *lhs_padded_strides, + lhs_data, + TArray(), // rhs is not computed, so no need to deference rhs_padded_strides + rhs_data, + *fdm_output_strides, + output_data, + func, + N); else if (rhs_padded_strides && rhs_padded_strides->Size()) - _BinaryElementWise<<>>( - output_rank_or_simple_broadcast, - TArray(), // lhs is not computed, so no need to deference lhs_padded_strides - lhs_data, - *rhs_padded_strides, - rhs_data, - *fdm_output_strides, - output_data, - func, - N); + _BinaryElementWise + <<>>( + 
output_rank_or_simple_broadcast, + TArray(), // lhs is not computed, so no need to deference lhs_padded_strides + lhs_data, + *rhs_padded_strides, + rhs_data, + *fdm_output_strides, + output_data, + func, + N); } } +template +void BinaryElementWiseImpl( + cudaStream_t stream, + int32_t output_rank_or_simple_broadcast, + const TArray* lhs_padded_strides, + const T1* lhs_data, + const TArray* rhs_padded_strides, + const T2* rhs_data, + const TArray* fdm_output_strides, + const fast_divmod& fdm_H, + const fast_divmod& fdm_C, + T* output_data, + const FuncT& func, + size_t count) { +#define FUNC_CALL(NumElemT) \ + _BinaryElementWiseImpl(stream, output_rank_or_simple_broadcast, \ + lhs_padded_strides, lhs_data, rhs_padded_strides, rhs_data, \ + fdm_output_strides, fdm_H, fdm_C, output_data, func, static_cast(count)); + + if (count > static_cast(std::numeric_limits::max())) { + FUNC_CALL(size_t) + } else { + FUNC_CALL(CUDA_LONG) + } +#undef FUNC_CALL +} // namespace cuda } // namespace cuda } // namespace onnxruntime diff --git a/onnxruntime/test/providers/cpu/math/element_wise_ops_test.cc b/onnxruntime/test/providers/cpu/math/element_wise_ops_test.cc index fd9d222ec8904..eb3575f2cde88 100644 --- a/onnxruntime/test/providers/cpu/math/element_wise_ops_test.cc +++ b/onnxruntime/test/providers/cpu/math/element_wise_ops_test.cc @@ -484,6 +484,22 @@ TEST(MathOpTest, Add_Invalid_Broadcast) { {}, nullptr, &execution_providers); } +// TEST(MathOpTest, Add_large_dimension) { +// OpTester test("Add"); + +// int64_t num_elem = static_cast(std::numeric_limits::max()) + 1000; +// // int64_t num_elem = static_cast(200) + 1000; +// float input_scalar{4.0f}; +// std::vector input_sequence(num_elem, 0), output_sequence(num_elem, input_scalar); +// test.AddInput("A", {num_elem}, input_sequence); +// test.AddInput("B", {1}, {input_scalar}); +// test.AddOutput("C", {num_elem}, output_sequence); + +// std::vector> execution_providers; +// execution_providers.push_back(DefaultCudaExecutionProvider()); +// test.Run(OpTester::ExpectResult::kExpectSuccess, "", {}, nullptr, &execution_providers); +// } + TEST(MathOpTest, Sub_int32) { OpTester test("Sub"); test.AddInput("A", {3}, {1, 4, 3}); From c5087b9b58f929c37f5798c266a8b622b09bed82 Mon Sep 17 00:00:00 2001 From: Yi Zhang Date: Tue, 4 Jun 2024 10:19:32 +0800 Subject: [PATCH 10/26] Improve stable diffusion image parity test stability (#20904) ### Description 1. Add one image into whitelist, but if the image is hit, the pipeline status is warning. 2. 
adjust the image parity test tolerance ### Motivation and Context improve pipeline stability --- .../models/stable_diffusion/test/check_image.py | 4 ++-- .../github/azure-pipelines/bigmodels-ci-pipeline.yml | 12 ++++++++++-- 2 files changed, 12 insertions(+), 4 deletions(-) diff --git a/onnxruntime/python/tools/transformers/models/stable_diffusion/test/check_image.py b/onnxruntime/python/tools/transformers/models/stable_diffusion/test/check_image.py index 9a3615c1cbeca..86477a7e3168b 100644 --- a/onnxruntime/python/tools/transformers/models/stable_diffusion/test/check_image.py +++ b/onnxruntime/python/tools/transformers/models/stable_diffusion/test/check_image.py @@ -64,11 +64,11 @@ def main(): score = round(generate_score(image1, image2, cache_dir), 2) print("similarity Score: ", {score}) if args.negative: - if score > 97: + if score > 95: print("Why generated this incorrect image") raise SystemExit(1) else: - if score < 97: + if score < 95: print(f"{image1} and {image2} are different") raise SystemExit(1) else: diff --git a/tools/ci_build/github/azure-pipelines/bigmodels-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/bigmodels-ci-pipeline.yml index 2132fb6039872..f7500e0d805e2 100644 --- a/tools/ci_build/github/azure-pipelines/bigmodels-ci-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/bigmodels-ci-pipeline.yml @@ -172,6 +172,7 @@ stages: CLIP_MODEL_CACHE: $(Agent.TempDirectory)/clip_cache STABLE_DIFFUSION_MODEL_CACHE: $(Agent.TempDirectory)/stablediffusion_cache GenerateImage_DIR: $(Agent.TempDirectory)/images + hitAnother: 'False' workspace: clean: all pool: onnxruntime-Linux-GPU-A10-12G @@ -243,7 +244,7 @@ stages: -v $(CLIP_MODEL_CACHE):/model_cache:rw \ nvcr.io/nvidia/pytorch:22.11-py3 \ bash -c ' - set -ex; \ + set -x; \ python3 --version; \ python3 -m pip install --upgrade pip; \ pushd /workspace/onnxruntime/python/tools/transformers/models/stable_diffusion/; \ @@ -252,13 +253,19 @@ stages: python3 -m pip install -r requirements.txt; \ echo check demo_txt2image.py generate image; \ python3 -u check_image.py --image1 astronaut_riding_error.png --image2 $image2 --cache_dir /model_cache --negative; \ + if [ $? -ne 0 ]; then echo "Hit an unexpected image"; exit 1; fi; \ popd ; \ popd ; \ - ' + ' || ( echo "##vso[task.setvariable variable=hitAnother;]True" && exit 1 ) displayName: 'Check if the generated image is wierd' workingDirectory: $(Build.SourcesDirectory) + # If the generate image hit another test image, make the job status as warning continueOnError: true + - bash: | + echo "You can use variables: $(hitAnother)" + + # The step will execute if the gereneate image doesn't hit another test image - script: | docker run --rm --gpus all -v $PWD:/workspace \ -v $(CLIP_MODEL_CACHE):/model_cache:rw \ @@ -278,6 +285,7 @@ stages: ' displayName: 'Check the generated image' workingDirectory: $(Build.SourcesDirectory) + condition: ne(variables.hitAnother, 'True') - stage: Llama2_ONNX_FP16 dependsOn: From 3dd6fcc089fc831b2e8d32d2dbc2932b242aba68 Mon Sep 17 00:00:00 2001 From: Changming Sun Date: Tue, 4 Jun 2024 10:15:20 -0700 Subject: [PATCH 11/26] Upgrade min ios version to 13.0 (#20773) To align with Office and other MS products. Office's support policy is: "Office for iPad and iPhone is supported on the two most recent versions of iOS and iPadOS. When a new version of iOS or iPadOS is released, the Office Operating System requirement becomes the two most recent versions: the new version of iOS or iPadOS and the previous version." 
(from https://products.office.com/office-system-requirements) The latest iOS version is 17. So they support both 17 and 16. Here I set our min iOS version to 13 so that it will be a superset of what Office supports. This change would allow us using C++17's std::filesystem feature in the core framework. The modifications were generated by running ```bash find . -type f -exec sed -i "s/apple_deploy_target[ =]12.0/apple_deploy_target=13.0/g" {} \; ``` Cannot use 15.0 because otherwise iOS packaging would fail with: ``` /Users/runner/work/1/b/apple_framework/intermediates/iphoneos_arm64/Release/_deps/coremltools-src/mlmodel/src/MILBlob/Util/Span.hpp:288:9: error: cannot use 'throw' with exceptions disabled MILVerifyIsTrue(index < Size(), std::range_error, "index out of bounds"); ``` The Google OSS libraries we use only officially support iOS 15+. --- js/react_native/e2e/ios/Podfile | 2 +- js/react_native/ios/Podfile | 2 +- .../onnxruntime-react-native.podspec | 2 +- .../github/apple/build_apple_framework.py | 2 +- ...t_full_apple_framework_build_settings.json | 4 +-- ...ult_full_ios_framework_build_settings.json | 4 +-- ...t_mobile_ios_framework_build_settings.json | 34 ------------------- ...training_ios_framework_build_settings.json | 4 +-- ...os_simulator_framework_build_settings.json | 2 +- .../apple/use_ios_pods_with_custom_build.md | 2 +- .../azure-pipelines/mac-ios-ci-pipeline.yml | 2 +- .../mac-ios-packaging-pipeline.yml | 4 --- .../azure-pipelines/post-merge-jobs.yml | 4 +-- .../stages/mac-ios-packaging-build-stage.yml | 7 ---- ...e2e_full_ios_framework_build_settings.json | 2 +- ...e_mobile_ios_framework_build_settings.json | 2 +- 16 files changed, 17 insertions(+), 62 deletions(-) delete mode 100644 tools/ci_build/github/apple/default_mobile_ios_framework_build_settings.json diff --git a/js/react_native/e2e/ios/Podfile b/js/react_native/e2e/ios/Podfile index bec13598229cd..4bf19f965c553 100644 --- a/js/react_native/e2e/ios/Podfile +++ b/js/react_native/e2e/ios/Podfile @@ -1,7 +1,7 @@ require_relative '../node_modules/react-native/scripts/react_native_pods' require_relative '../node_modules/@react-native-community/cli-platform-ios/native_modules' -platform :ios, '12.4' +platform :ios, '13.0' target 'OnnxruntimeModuleExample' do config = use_native_modules! diff --git a/js/react_native/ios/Podfile b/js/react_native/ios/Podfile index b5bd197d1ebd9..e3887e327b7af 100644 --- a/js/react_native/ios/Podfile +++ b/js/react_native/ios/Podfile @@ -1,7 +1,7 @@ require_relative '../node_modules/react-native/scripts/react_native_pods' require_relative '../node_modules/@react-native-community/cli-platform-ios/native_modules' -platform :ios, '12.4' +platform :ios, '13.0' def shared config = use_native_modules! 
diff --git a/js/react_native/onnxruntime-react-native.podspec b/js/react_native/onnxruntime-react-native.podspec index 914a396be1f1d..50eba7dfaa1e0 100644 --- a/js/react_native/onnxruntime-react-native.podspec +++ b/js/react_native/onnxruntime-react-native.podspec @@ -15,7 +15,7 @@ Pod::Spec.new do |spec| spec.license = package["license"] spec.authors = package["author"] - spec.platforms = { :ios => "12.4" } + spec.platforms = { :ios => "13.0" } spec.source = { :git => "https://github.com/Microsoft/onnxruntime.git", :tag => "rel-#{spec.version}" } spec.source_files = "ios/*.{h,mm}" diff --git a/tools/ci_build/github/apple/build_apple_framework.py b/tools/ci_build/github/apple/build_apple_framework.py index e17bcd65d8814..3cd7a3af70622 100644 --- a/tools/ci_build/github/apple/build_apple_framework.py +++ b/tools/ci_build/github/apple/build_apple_framework.py @@ -187,7 +187,7 @@ def parse_args(): os.path.basename(__file__), description="""Create iOS framework and podspec for one or more osx_archs (xcframework) and building properties specified in the given build config file, see - tools/ci_build/github/apple/default_mobile_ios_framework_build_settings.json for details. + tools/ci_build/github/apple/default_full_apple_framework_build_settings.json for details. The output of the final xcframework and podspec can be found under [build_dir]/framework_out. Please note, this building script will only work on macOS. """, diff --git a/tools/ci_build/github/apple/default_full_apple_framework_build_settings.json b/tools/ci_build/github/apple/default_full_apple_framework_build_settings.json index 04a73ae450e5f..84d7e355ed5b4 100644 --- a/tools/ci_build/github/apple/default_full_apple_framework_build_settings.json +++ b/tools/ci_build/github/apple/default_full_apple_framework_build_settings.json @@ -28,11 +28,11 @@ ], "iphoneos": [ "--ios", - "--apple_deploy_target=12.0" + "--apple_deploy_target=13.0" ], "iphonesimulator": [ "--ios", - "--apple_deploy_target=12.0" + "--apple_deploy_target=13.0" ] } } diff --git a/tools/ci_build/github/apple/default_full_ios_framework_build_settings.json b/tools/ci_build/github/apple/default_full_ios_framework_build_settings.json index 4bc978956d7fc..e2d8f70c02cf3 100644 --- a/tools/ci_build/github/apple/default_full_ios_framework_build_settings.json +++ b/tools/ci_build/github/apple/default_full_ios_framework_build_settings.json @@ -24,13 +24,13 @@ "--ios", "--use_xcode", "--use_xnnpack", - "--apple_deploy_target=12.0" + "--apple_deploy_target=13.0" ], "iphonesimulator": [ "--ios", "--use_xcode", "--use_xnnpack", - "--apple_deploy_target=12.0" + "--apple_deploy_target=13.0" ], "macabi":[ "--macos=Catalyst", diff --git a/tools/ci_build/github/apple/default_mobile_ios_framework_build_settings.json b/tools/ci_build/github/apple/default_mobile_ios_framework_build_settings.json deleted file mode 100644 index 2bdf8de24f53c..0000000000000 --- a/tools/ci_build/github/apple/default_mobile_ios_framework_build_settings.json +++ /dev/null @@ -1,34 +0,0 @@ -{ - "build_osx_archs": { - "iphoneos": [ - "arm64" - ], - "iphonesimulator": [ - "arm64", - "x86_64" - ] - }, - "build_params": { - "base": [ - "--parallel", - "--use_xcode", - "--build_apple_framework", - "--minimal_build=extended", - "--disable_rtti", - "--disable_ml_ops", - "--disable_exceptions", - "--enable_reduced_operator_type_support", - "--use_coreml", - "--skip_tests", - "--cmake_extra_defines=onnxruntime_BUILD_UNIT_TESTS=OFF" - ], - "iphoneos": [ - "--ios", - "--apple_deploy_target=12.0" - ], - "iphonesimulator": [ - 
"--ios", - "--apple_deploy_target=12.0" - ] - } -} diff --git a/tools/ci_build/github/apple/default_training_ios_framework_build_settings.json b/tools/ci_build/github/apple/default_training_ios_framework_build_settings.json index 2066af7843e0a..1d4a8c038c07b 100644 --- a/tools/ci_build/github/apple/default_training_ios_framework_build_settings.json +++ b/tools/ci_build/github/apple/default_training_ios_framework_build_settings.json @@ -25,11 +25,11 @@ ], "iphoneos": [ "--ios", - "--apple_deploy_target=12.0" + "--apple_deploy_target=13.0" ], "iphonesimulator": [ "--ios", - "--apple_deploy_target=12.0" + "--apple_deploy_target=13.0" ], "macosx": [ "--macos=MacOSX", diff --git a/tools/ci_build/github/apple/test_minimal_training_ios_simulator_framework_build_settings.json b/tools/ci_build/github/apple/test_minimal_training_ios_simulator_framework_build_settings.json index 1a89d941e5e52..8f283173f1c6a 100644 --- a/tools/ci_build/github/apple/test_minimal_training_ios_simulator_framework_build_settings.json +++ b/tools/ci_build/github/apple/test_minimal_training_ios_simulator_framework_build_settings.json @@ -16,7 +16,7 @@ ], "iphonesimulator": [ "--ios", - "--apple_deploy_target=12.0" + "--apple_deploy_target=13.0" ] } } diff --git a/tools/ci_build/github/apple/use_ios_pods_with_custom_build.md b/tools/ci_build/github/apple/use_ios_pods_with_custom_build.md index c8da2eff57c33..9e5215a2dc25a 100644 --- a/tools/ci_build/github/apple/use_ios_pods_with_custom_build.md +++ b/tools/ci_build/github/apple/use_ios_pods_with_custom_build.md @@ -18,7 +18,7 @@ Run the script: python3 tools/ci_build/github/apple/build_and_assemble_apple_pods.py \ --staging-dir /path/to/staging/dir \ --include-ops-by-config /path/to/custom.config \ - --build-settings-file tools/ci_build/github/apple/default_mobile_ios_framework_build_settings.json + --build-settings-file tools/ci_build/github/apple/default_full_apple_framework_build_settings.json ``` This will do a custom build and create the pod package files for it in `/path/to/staging/dir`. 
diff --git a/tools/ci_build/github/azure-pipelines/mac-ios-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/mac-ios-ci-pipeline.yml index 255531681b039..0a19312790a98 100644 --- a/tools/ci_build/github/azure-pipelines/mac-ios-ci-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/mac-ios-ci-pipeline.yml @@ -58,7 +58,7 @@ jobs: --ios \ --apple_sysroot iphonesimulator \ --osx_arch x86_64 \ - --apple_deploy_target 12.0 \ + --apple_deploy_target=13.0 \ --use_xcode \ --config RelWithDebInfo \ --build_apple_framework \ diff --git a/tools/ci_build/github/azure-pipelines/mac-ios-packaging-pipeline.yml b/tools/ci_build/github/azure-pipelines/mac-ios-packaging-pipeline.yml index 881023e1c1186..c209e20adc131 100644 --- a/tools/ci_build/github/azure-pipelines/mac-ios-packaging-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/mac-ios-packaging-pipeline.yml @@ -89,10 +89,6 @@ stages: displayName: "Set common variables" name: SetCommonVariables -- template: templates/stages/mac-ios-packaging-build-stage.yml - parameters: - packageVariant: Mobile - - template: templates/stages/mac-ios-packaging-build-stage.yml parameters: packageVariant: Full diff --git a/tools/ci_build/github/azure-pipelines/post-merge-jobs.yml b/tools/ci_build/github/azure-pipelines/post-merge-jobs.yml index 38dc53cb5daf2..6901dcb7b68df 100644 --- a/tools/ci_build/github/azure-pipelines/post-merge-jobs.yml +++ b/tools/ci_build/github/azure-pipelines/post-merge-jobs.yml @@ -445,14 +445,14 @@ stages: python tools/ci_build/github/apple/build_apple_framework.py \ --build_dir "$(Build.BinariesDirectory)/ios_framework" \ --build_dynamic_framework \ - tools/ci_build/github/apple/default_mobile_ios_framework_build_settings.json + tools/ci_build/github/apple/default_full_apple_framework_build_settings.json displayName: "Build iOS dynamic framework" - script: | python tools/ci_build/github/apple/test_apple_packages.py \ --framework_info_file "$(Build.BinariesDirectory)/ios_framework/xcframework_info.json" \ --c_framework_dir "$(Build.BinariesDirectory)/ios_framework/framework_out" \ - --variant Mobile + --variant Full displayName: "Test pod with iOS framework" - stage: IosMinimalTrainingBuild diff --git a/tools/ci_build/github/azure-pipelines/templates/stages/mac-ios-packaging-build-stage.yml b/tools/ci_build/github/azure-pipelines/templates/stages/mac-ios-packaging-build-stage.yml index 2c9f968380a38..a1ae63e606526 100644 --- a/tools/ci_build/github/azure-pipelines/templates/stages/mac-ios-packaging-build-stage.yml +++ b/tools/ci_build/github/azure-pipelines/templates/stages/mac-ios-packaging-build-stage.yml @@ -2,7 +2,6 @@ parameters: - name: packageVariant type: string values: - - Mobile - Full - Training @@ -22,12 +21,6 @@ stages: xcodeVersion: "14.2" ortPodVersion: $[stageDependencies.IosPackaging_SetCommonVariables.j.outputs['SetCommonVariables.ORT_POD_VERSION']] - ${{ if eq(parameters.packageVariant, 'Mobile') }}: - buildSettingsFile: "tools/ci_build/github/apple/default_mobile_ios_framework_build_settings.json" - optionalIncludeOpsByConfigOption: "--include-ops-by-config tools/ci_build/github/android/mobile_package.required_operators.config" - cPodName: onnxruntime-mobile-c - objcPodName: onnxruntime-mobile-objc - ${{ if eq(parameters.packageVariant, 'Full') }}: buildSettingsFile: "tools/ci_build/github/apple/default_full_apple_framework_build_settings.json" cPodName: onnxruntime-c diff --git a/tools/ci_build/github/js/react_native_e2e_full_ios_framework_build_settings.json 
b/tools/ci_build/github/js/react_native_e2e_full_ios_framework_build_settings.json index 78de7edb5ec29..a1266a80d1cd9 100644 --- a/tools/ci_build/github/js/react_native_e2e_full_ios_framework_build_settings.json +++ b/tools/ci_build/github/js/react_native_e2e_full_ios_framework_build_settings.json @@ -14,7 +14,7 @@ ], "iphonesimulator": [ "--ios", - "--apple_deploy_target=12.0" + "--apple_deploy_target=13.0" ] } } diff --git a/tools/ci_build/github/js/react_native_e2e_mobile_ios_framework_build_settings.json b/tools/ci_build/github/js/react_native_e2e_mobile_ios_framework_build_settings.json index 3d80231393cc6..73ff98f921482 100644 --- a/tools/ci_build/github/js/react_native_e2e_mobile_ios_framework_build_settings.json +++ b/tools/ci_build/github/js/react_native_e2e_mobile_ios_framework_build_settings.json @@ -19,7 +19,7 @@ ], "iphonesimulator": [ "--ios", - "--apple_deploy_target=12.0" + "--apple_deploy_target=13.0" ] } } From 51bc53580dab20c52b41c4c615293c14f404e732 Mon Sep 17 00:00:00 2001 From: liqun Fu Date: Tue, 4 Jun 2024 11:06:28 -0700 Subject: [PATCH 12/26] Update to onnx 1.16.1 (#20702) --- cgmanifests/generated/cgmanifest.json | 4 +- cmake/deps.txt | 2 +- cmake/external/onnx | 2 +- cmake/patches/onnx/onnx.patch | 100 ++---------------- .../models/llama/requirements.txt | 4 +- .../transformers/models/phi2/requirements.txt | 2 +- .../models/whisper/requirements.txt | 2 +- onnxruntime/test/python/requirements.txt | 2 +- .../templates/download-deps.yml | 4 +- .../python/cpu/scripts/requirements.txt | 2 +- .../python/cpu/scripts/requirements.txt | 2 +- .../python/cuda/scripts/requirements.txt | 2 +- .../docker/scripts/manylinux/requirements.txt | 2 +- .../linux/docker/scripts/requirements.txt | 2 +- 14 files changed, 22 insertions(+), 110 deletions(-) diff --git a/cgmanifests/generated/cgmanifest.json b/cgmanifests/generated/cgmanifest.json index d728ae797429e..78db7d735dad9 100644 --- a/cgmanifests/generated/cgmanifest.json +++ b/cgmanifests/generated/cgmanifest.json @@ -26,7 +26,7 @@ "component": { "type": "git", "git": { - "commitHash": "990217f043af7222348ca8f0301e17fa7b841781", + "commitHash": "595228d99e3977ac27cb79d5963adda262af99ad", "repositoryUrl": "https://github.com/onnx/onnx.git" }, "comments": "git submodule at cmake/external/onnx" @@ -216,7 +216,7 @@ "component": { "type": "git", "git": { - "commitHash": "eb43908b02a296ea0594432f06e9d3fac288d672", + "commitHash": "06adf4461ac84035bee658c6cf5df39f7ab6071d", "repositoryUrl": "https://github.com/onnx/onnx-tensorrt.git" }, "comments": "onnx_tensorrt" diff --git a/cmake/deps.txt b/cmake/deps.txt index d4d19dea08c8b..88c1881ad82fb 100644 --- a/cmake/deps.txt +++ b/cmake/deps.txt @@ -36,7 +36,7 @@ microsoft_wil;https://github.com/microsoft/wil/archive/refs/tags/v1.0.230629.1.z mimalloc;https://github.com/microsoft/mimalloc/archive/refs/tags/v2.1.1.zip;d5ee7d34223d0567892db5179849939c8769dc41 mp11;https://github.com/boostorg/mp11/archive/refs/tags/boost-1.82.0.zip;9bc9e01dffb64d9e0773b2e44d2f22c51aace063 neural_speed;https://github.com/intel/neural-speed/archive/refs/tags/v0.3.zip;5ec64e3071edc7347ebd8a81679cf06e2bb9b851 -onnx;https://github.com/onnx/onnx/archive/refs/tags/v1.16.0.zip;a6d8b619459fb4657f8bec7d1c6d95ad6d4c069d +onnx;https://github.com/onnx/onnx/archive/refs/tags/v1.16.1.zip;2eb9198bb352757d5ff13977cbe0634898e0837c #use the latest commit of 10.0-GA onnx_tensorrt;https://github.com/onnx/onnx-tensorrt/archive/06adf4461ac84035bee658c6cf5df39f7ab6071d.zip;46dceef659d75d276e7914a8057c2282269d5e7b 
protobuf;https://github.com/protocolbuffers/protobuf/archive/refs/tags/v21.12.zip;7cf2733949036c7d52fda017badcab093fe73bfa diff --git a/cmake/external/onnx b/cmake/external/onnx index 990217f043af7..595228d99e397 160000 --- a/cmake/external/onnx +++ b/cmake/external/onnx @@ -1 +1 @@ -Subproject commit 990217f043af7222348ca8f0301e17fa7b841781 +Subproject commit 595228d99e3977ac27cb79d5963adda262af99ad diff --git a/cmake/patches/onnx/onnx.patch b/cmake/patches/onnx/onnx.patch index fe8d6622bcc0e..162d33581a5ca 100644 --- a/cmake/patches/onnx/onnx.patch +++ b/cmake/patches/onnx/onnx.patch @@ -36,15 +36,15 @@ index b847798e..a6c31904 100644 --- a/onnx/common/file_utils.h +++ b/onnx/common/file_utils.h @@ -6,7 +6,6 @@ - + #pragma once - + -#include #include #include - + @@ -17,8 +16,7 @@ namespace ONNX_NAMESPACE { - + template void LoadProtoFromPath(const std::string proto_path, T& proto) { - std::filesystem::path proto_u8_path = std::filesystem::u8path(proto_path); @@ -53,42 +53,6 @@ index b847798e..a6c31904 100644 if (!proto_stream.good()) { fail_check("Unable to open proto file: ", proto_path, ". Please check if it is a valid proto. "); } -diff --git a/onnx/defs/quantization/defs.cc b/onnx/defs/quantization/defs.cc -index 70b4a4db..98c11545 100644 ---- a/onnx/defs/quantization/defs.cc -+++ b/onnx/defs/quantization/defs.cc -@@ -200,6 +200,9 @@ ONNX_OPERATOR_SET_SCHEMA( - .SetDoc(DequantizeLinear_ver21_doc) - .TypeAndShapeInferenceFunction([](ONNX_NAMESPACE::InferenceContext& ctx) { - propagateElemTypeFromInputToOutput(ctx, 1, 0); -+ if (!hasInputShape(ctx, 0)) { -+ return; -+ } - auto& input_shape = getInputShape(ctx, 0); - updateOutputShape(ctx, 0, input_shape); - })); -diff --git a/onnx/defs/quantization/old.cc b/onnx/defs/quantization/old.cc -index 3f2d6384..d2f7cfd8 100644 ---- a/onnx/defs/quantization/old.cc -+++ b/onnx/defs/quantization/old.cc -@@ -130,6 +130,9 @@ ONNX_OPERATOR_SET_SCHEMA( - .SetDoc(DequantizeLinear_ver19_doc) - .TypeAndShapeInferenceFunction([](ONNX_NAMESPACE::InferenceContext& ctx) { - propagateElemTypeFromInputToOutput(ctx, 1, 0); -+ if (!hasInputShape(ctx, 0)) { -+ return; -+ } - auto& input_shape = getInputShape(ctx, 0); - updateOutputShape(ctx, 0, input_shape); - })); -@@ -181,7 +184,6 @@ ONNX_OPERATOR_SET_SCHEMA( - if (!hasInputShape(ctx, 0)) { - return; - } -- - auto& input_shape = getInputShape(ctx, 0); - updateOutputShape(ctx, 0, input_shape); - })); diff --git a/onnx/onnx_pb.h b/onnx/onnx_pb.h index 0aab3e26..398ac2d6 100644 --- a/onnx/onnx_pb.h @@ -96,7 +60,7 @@ index 0aab3e26..398ac2d6 100644 @@ -47,10 +47,28 @@ #define ONNX_API ONNX_IMPORT #endif - + +#if defined(__GNUC__) +#pragma GCC diagnostic push + @@ -116,61 +80,9 @@ index 0aab3e26..398ac2d6 100644 #else #include "onnx/onnx.pb.h" #endif - + +#if defined(__GNUC__) +#pragma GCC diagnostic pop +#endif + #endif // ! ONNX_ONNX_PB_H -diff --git a/onnx/shape_inference/implementation.cc b/onnx/shape_inference/implementation.cc -index fab1faf2..8723dcd4 100644 ---- a/onnx/shape_inference/implementation.cc -+++ b/onnx/shape_inference/implementation.cc -@@ -488,29 +488,29 @@ class ShapeInferenceImplBase { - ProcessCall(n, *(iter->second), ctx); - } else { - has_unsupported_op = true; -+ return; - } - } else { - has_unsupported_op = true; -+ return; - } -- if (!has_unsupported_op) { -- for (int i = 0; i < n.output_size(); ++i) { -- // skip type and shape propagation for missing optional outputs. 
-- if (!n.output(i).empty()) -- UpdateType(n.output(i), ctx.getOutputType(i)); -- } -- // Constant values are tracked to improve inference/checking for subsequent nodes. -- ProcessConstant(n); -- // If data-propagation is enabled, partial-evaluation (aka data-propagation) is performed -- // to improve inference/checking for subsequent nodes. -- if (options.enable_data_propagation && schema && schema->has_data_propagation_function()) { -- if (generated_shape_data_by_name == nullptr) { -- fail_shape_inference( -- "Container for generated shape data cannot be nullptr when enable_data_propagation option is set."); -- } -- DataPropagationContextImpl data_propagation_ctx( -- n, value_types_by_name, input_data_by_name, *generated_shape_data_by_name); -- schema->GetDataPropagationFunction()(data_propagation_ctx); -+ for (int i = 0; i < n.output_size(); ++i) { -+ // skip type and shape propagation for missing optional outputs. -+ if (!n.output(i).empty()) -+ UpdateType(n.output(i), ctx.getOutputType(i)); -+ } -+ // Constant values are tracked to improve inference/checking for subsequent nodes. -+ ProcessConstant(n); -+ // If data-propagation is enabled, partial-evaluation (aka data-propagation) is performed -+ // to improve inference/checking for subsequent nodes. -+ if (options.enable_data_propagation && schema && schema->has_data_propagation_function()) { -+ if (generated_shape_data_by_name == nullptr) { -+ fail_shape_inference( -+ "Container for generated shape data cannot be nullptr when enable_data_propagation option is set."); - } -+ DataPropagationContextImpl data_propagation_ctx( -+ n, value_types_by_name, input_data_by_name, *generated_shape_data_by_name); -+ schema->GetDataPropagationFunction()(data_propagation_ctx); - } - } - ONNX_CATCH(const ONNX_NAMESPACE::InferenceError& ex) { diff --git a/onnxruntime/python/tools/transformers/models/llama/requirements.txt b/onnxruntime/python/tools/transformers/models/llama/requirements.txt index ce4b3f6a09ba5..388025165f814 100644 --- a/onnxruntime/python/tools/transformers/models/llama/requirements.txt +++ b/onnxruntime/python/tools/transformers/models/llama/requirements.txt @@ -1,7 +1,7 @@ optimum>=1.14.1 transformers>=4.33.2,<= 4.37.2 torch>=2.2.0 -onnx==1.16.0 +onnx==1.16.1 datasets>=2.8.0 protobuf==3.20.2 -psutil \ No newline at end of file +psutil diff --git a/onnxruntime/python/tools/transformers/models/phi2/requirements.txt b/onnxruntime/python/tools/transformers/models/phi2/requirements.txt index 0b2ea0df93a96..c82022e798482 100644 --- a/onnxruntime/python/tools/transformers/models/phi2/requirements.txt +++ b/onnxruntime/python/tools/transformers/models/phi2/requirements.txt @@ -1,3 +1,3 @@ -onnx==1.16.0 +onnx==1.16.1 transformers>=4.36.2 onnxscript>=0.1.0.dev20240126 diff --git a/onnxruntime/python/tools/transformers/models/whisper/requirements.txt b/onnxruntime/python/tools/transformers/models/whisper/requirements.txt index a722c13e80766..689b14ea9a684 100644 --- a/onnxruntime/python/tools/transformers/models/whisper/requirements.txt +++ b/onnxruntime/python/tools/transformers/models/whisper/requirements.txt @@ -7,7 +7,7 @@ soundfile librosa optimum onnxruntime-extensions>=0.9.0 -onnx==1.16.0 +onnx==1.16.1 protobuf==3.20.2 numpy==1.23.3 psutil diff --git a/onnxruntime/test/python/requirements.txt b/onnxruntime/test/python/requirements.txt index 5d8e356d0fc07..741c411ce55a0 100644 --- a/onnxruntime/test/python/requirements.txt +++ b/onnxruntime/test/python/requirements.txt @@ -1,2 +1,2 @@ -onnx==1.16.0 +onnx==1.16.1 pytest diff --git 
a/tools/ci_build/github/azure-pipelines/templates/download-deps.yml b/tools/ci_build/github/azure-pipelines/templates/download-deps.yml index 9d0a8b42a21ca..e7b230008dad4 100644 --- a/tools/ci_build/github/azure-pipelines/templates/download-deps.yml +++ b/tools/ci_build/github/azure-pipelines/templates/download-deps.yml @@ -11,7 +11,7 @@ steps: packageType: upack feed: '/7424c8e4-5c62-490e-95c4-79446f31017c' definition: '517c4f6f-5437-4392-a70d-4f15ec5be2f0' - version: 1.0.155 + version: 1.0.156 downloadPath: $(Build.BinariesDirectory)/deps # The private ADO project @@ -22,7 +22,7 @@ steps: packageType: upack feed: '/4c7631f5-24c0-4307-8822-1aa8f180c325' definition: 'fd9dd5ad-b73e-4678-890e-edcf680dbc1a' - version: 1.0.155 + version: 1.0.156 downloadPath: $(Build.BinariesDirectory)/deps # You can add more ADO accounts at here. diff --git a/tools/ci_build/github/linux/docker/inference/aarch64/python/cpu/scripts/requirements.txt b/tools/ci_build/github/linux/docker/inference/aarch64/python/cpu/scripts/requirements.txt index 8f56ee18ccd24..cc47718f78a46 100644 --- a/tools/ci_build/github/linux/docker/inference/aarch64/python/cpu/scripts/requirements.txt +++ b/tools/ci_build/github/linux/docker/inference/aarch64/python/cpu/scripts/requirements.txt @@ -5,7 +5,7 @@ mypy pytest setuptools>=68.2.2 wheel -onnx==1.16.0 +onnx==1.16.1 protobuf==4.21.12 sympy==1.12 flatbuffers diff --git a/tools/ci_build/github/linux/docker/inference/x86_64/python/cpu/scripts/requirements.txt b/tools/ci_build/github/linux/docker/inference/x86_64/python/cpu/scripts/requirements.txt index 8f56ee18ccd24..cc47718f78a46 100644 --- a/tools/ci_build/github/linux/docker/inference/x86_64/python/cpu/scripts/requirements.txt +++ b/tools/ci_build/github/linux/docker/inference/x86_64/python/cpu/scripts/requirements.txt @@ -5,7 +5,7 @@ mypy pytest setuptools>=68.2.2 wheel -onnx==1.16.0 +onnx==1.16.1 protobuf==4.21.12 sympy==1.12 flatbuffers diff --git a/tools/ci_build/github/linux/docker/inference/x86_64/python/cuda/scripts/requirements.txt b/tools/ci_build/github/linux/docker/inference/x86_64/python/cuda/scripts/requirements.txt index 8f56ee18ccd24..cc47718f78a46 100644 --- a/tools/ci_build/github/linux/docker/inference/x86_64/python/cuda/scripts/requirements.txt +++ b/tools/ci_build/github/linux/docker/inference/x86_64/python/cuda/scripts/requirements.txt @@ -5,7 +5,7 @@ mypy pytest setuptools>=68.2.2 wheel -onnx==1.16.0 +onnx==1.16.1 protobuf==4.21.12 sympy==1.12 flatbuffers diff --git a/tools/ci_build/github/linux/docker/scripts/manylinux/requirements.txt b/tools/ci_build/github/linux/docker/scripts/manylinux/requirements.txt index 80eccb68ebebb..bdae9d72a1a63 100644 --- a/tools/ci_build/github/linux/docker/scripts/manylinux/requirements.txt +++ b/tools/ci_build/github/linux/docker/scripts/manylinux/requirements.txt @@ -5,7 +5,7 @@ mypy pytest setuptools>=68.2.2 wheel -onnx==1.16.0 +onnx==1.16.1 protobuf==4.21.12 sympy==1.12 flatbuffers diff --git a/tools/ci_build/github/linux/docker/scripts/requirements.txt b/tools/ci_build/github/linux/docker/scripts/requirements.txt index e20e433cd33c6..3e619ea3dfb56 100644 --- a/tools/ci_build/github/linux/docker/scripts/requirements.txt +++ b/tools/ci_build/github/linux/docker/scripts/requirements.txt @@ -6,7 +6,7 @@ mypy pytest setuptools==69.0.3 wheel==0.42.0 -onnx==1.16.0 +onnx==1.16.1 argparse sympy==1.12 flatbuffers From 6dfdef7782e8865f6ecde51f2ee0d4b0dcf46ac6 Mon Sep 17 00:00:00 2001 From: Tianlei Wu Date: Tue, 4 Jun 2024 12:08:04 -0700 Subject: [PATCH 13/26] update stable diffusion 
demo requirements (#20914) ### Description Update the docker image and package versions for the stable diffusion demo. ### Motivation and Context Update onnx to 1.16 for security. --- .../transformers/models/stable_diffusion/README.md | 10 ++++++---- .../models/stable_diffusion/requirements-cuda11.txt | 2 +- .../models/stable_diffusion/requirements-cuda12.txt | 2 +- .../models/stable_diffusion/requirements.txt | 8 ++++---- 4 files changed, 12 insertions(+), 10 deletions(-) diff --git a/onnxruntime/python/tools/transformers/models/stable_diffusion/README.md b/onnxruntime/python/tools/transformers/models/stable_diffusion/README.md index 8607485bc265b..9c1c31626066d 100644 --- a/onnxruntime/python/tools/transformers/models/stable_diffusion/README.md +++ b/onnxruntime/python/tools/transformers/models/stable_diffusion/README.md @@ -36,16 +36,18 @@ cd onnxruntime Install nvidia-docker using [these instructions](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/install-guide.html#docker). ``` -docker run --rm -it --gpus all -v $PWD:/workspace nvcr.io/nvidia/pytorch:23.10-py3 /bin/bash +docker run --rm -it --gpus all -v $PWD:/workspace nvcr.io/nvidia/pytorch:24.04-py3 /bin/bash ``` #### Build onnxruntime from source +The cuDNN in the container might not be compatible with the official onnxruntime-gpu package, so it is recommended to build from source instead. + After launching the docker, you can build and install onnxruntime-gpu wheel like the following. ``` -export CUDACXX=/usr/local/cuda-12.2/bin/nvcc +export CUDACXX=/usr/local/cuda/bin/nvcc git config --global --add safe.directory '*' -sh build.sh --config Release --build_shared_lib --parallel --use_cuda --cuda_version 12.2 \ - --cuda_home /usr/local/cuda-12.2 --cudnn_home /usr/lib/x86_64-linux-gnu/ --build_wheel --skip_tests \ +sh build.sh --config Release --build_shared_lib --parallel --use_cuda --cuda_version 12.4 \ + --cuda_home /usr/local/cuda --cudnn_home /usr/lib/x86_64-linux-gnu/ --build_wheel --skip_tests \ --use_tensorrt --tensorrt_home /usr/src/tensorrt \ --cmake_extra_defines onnxruntime_BUILD_UNIT_TESTS=OFF \ --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=80 \ diff --git a/onnxruntime/python/tools/transformers/models/stable_diffusion/requirements-cuda11.txt b/onnxruntime/python/tools/transformers/models/stable_diffusion/requirements-cuda11.txt index 447cb54f98ed2..dc6592fc2fa54 100644 --- a/onnxruntime/python/tools/transformers/models/stable_diffusion/requirements-cuda11.txt +++ b/onnxruntime/python/tools/transformers/models/stable_diffusion/requirements-cuda11.txt @@ -6,7 +6,7 @@ onnxruntime-gpu>=1.16.2 py3nvml # The version of cuda-python shall be compatible with installed CUDA version. -# For example, if your CUDA version is 12.1, you can install cuda-python 12.1. +# For demo of TensorRT execution provider and TensorRT. cuda-python==11.8.0 # For windows, cuda-python need the following diff --git a/onnxruntime/python/tools/transformers/models/stable_diffusion/requirements-cuda12.txt b/onnxruntime/python/tools/transformers/models/stable_diffusion/requirements-cuda12.txt index 1ff0e3c1cf5af..4aa88cdf92309 100644 --- a/onnxruntime/python/tools/transformers/models/stable_diffusion/requirements-cuda12.txt +++ b/onnxruntime/python/tools/transformers/models/stable_diffusion/requirements-cuda12.txt @@ -6,7 +6,7 @@ py3nvml # The version of cuda-python shall be compatible with installed CUDA version. -# For example, if your CUDA version is 12.1, you can install cuda-python 12.1. +# For demo of TensorRT execution provider and TensorRT. 
cuda-python>=12.1.0 # For windows, cuda-python need the following diff --git a/onnxruntime/python/tools/transformers/models/stable_diffusion/requirements.txt b/onnxruntime/python/tools/transformers/models/stable_diffusion/requirements.txt index 0798b659306b5..de242e77cdb2e 100644 --- a/onnxruntime/python/tools/transformers/models/stable_diffusion/requirements.txt +++ b/onnxruntime/python/tools/transformers/models/stable_diffusion/requirements.txt @@ -1,8 +1,8 @@ -diffusers==0.24.0 -transformers==4.38.0 +diffusers==0.28.0 +transformers==4.41.2 numpy>=1.24.1 accelerate -onnx==1.14.1 +onnx==1.16.0 coloredlogs packaging # Use newer version of protobuf might cause crash @@ -11,7 +11,7 @@ psutil sympy controlnet_aux==0.0.7 # The following are for SDXL -optimum==1.14.1 +optimum==1.20.0 safetensors invisible_watermark # newer version of opencv-python migth encounter module 'cv2.dnn' has no attribute 'DictValue' error From 5faeaf6437a87ddabeb435bd51d1a3791e4c3f64 Mon Sep 17 00:00:00 2001 From: Jian Chen Date: Tue, 4 Jun 2024 13:54:49 -0700 Subject: [PATCH 14/26] Remove failOnStderr from Gradle cmakeCheck (#20919) ### Description Remove failOnStderr from the Gradle cmakeCheck step. ### Motivation and Context Gradle is still using a deprecated API. --- .../github/azure-pipelines/templates/make_java_win_binaries.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/tools/ci_build/github/azure-pipelines/templates/make_java_win_binaries.yml b/tools/ci_build/github/azure-pipelines/templates/make_java_win_binaries.yml index 9fa9f5e4a4869..6c77678ce15d8 100644 --- a/tools/ci_build/github/azure-pipelines/templates/make_java_win_binaries.yml +++ b/tools/ci_build/github/azure-pipelines/templates/make_java_win_binaries.yml @@ -15,7 +15,6 @@ steps: @echo on call gradlew.bat cmakeCheck -DcmakeBuildDir=$(Build.BinariesDirectory)\RelWithDebInfo workingDirectory: $(Build.SourcesDirectory)\java - failOnStderr: ${{ not(parameters.buildOnly) }} - task: CmdLine@2 displayName: 'Add symbols and notices to Java' From 3ecb01233789d4057a902d3e559ac6a9b410cbed Mon Sep 17 00:00:00 2001 From: Jing Fang <126209182+fajin-corp@users.noreply.github.com> Date: Tue, 4 Jun 2024 14:44:40 -0700 Subject: [PATCH 15/26] [CPU EP] Add blocked quantization to DequantizeLinear op kernel (#20901) ### Description Added blocked quantization to the DequantizeLinear op kernel. All existing [input types and output types](https://github.com/microsoft/onnxruntime/blob/main/docs/ContribOperators.md#commicrosoftdequantizelinear) are supported, as are all axes. The implementation in this PR is naive: single-threaded and scalar instructions only. Multi-threading and vector instructions are planned as the need arises. ### Motivation and Context ONNX introduced blocked quantization in opset 21 for [DequantizeLinear](https://github.com/microsoft/onnxruntime/blob/main/docs/ContribOperators.md#commicrosoftdequantizelinear). This PR adds support for that part of the spec to ONNX Runtime. 
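As background for reviewers: blocked quantization shares one scale/zero-point per block_size consecutive elements along the quantize axis. With the input flattened to [M, K, N] (dims before the axis, the axis itself, dims after), the scale and zero-point flatten to [M, ceil(K / block_size), N], and dequantization is still y = (x - zero_point) * scale. The snippet below is only a minimal single-threaded sketch of that loop structure, with a placeholder function name and int8/float element types; it is not the kernel added by this PR.

```
#include <algorithm>
#include <cstddef>
#include <cstdint>

// Sketch only: naive blocked DequantizeLinear over an [M, K, N] view, where K is
// the quantize axis and scale/zero_point are flattened to [M, ceil(K/block_size), N].
void DequantizeBlockedSketch(size_t M, size_t K, size_t N, size_t block_size,
                             const int8_t* x, const float* scale,
                             const int8_t* zero_point,  // may be nullptr
                             float* y) {
  for (size_t m = 0; m < M; ++m) {
    for (size_t k = 0; k < K; k += block_size) {
      // Every row of this block along the quantize axis reuses the same N
      // scales and zero-points.
      size_t rows = std::min(block_size, K - k);
      for (size_t r = 0; r < rows; ++r) {
        for (size_t n = 0; n < N; ++n) {
          float zp = zero_point ? static_cast<float>(zero_point[n]) : 0.0f;
          *y++ = (static_cast<float>(*x++) - zp) * scale[n];
        }
      }
      scale += N;  // advance to the next block's parameters
      if (zero_point) zero_point += N;
    }
  }
}
```

Per-tensor and per-axis dequantization remain special cases of the same formula: a single scale, or one scale per index of the quantize axis, with no block stepping.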
--- .../cpu/quantization/quantize_linear.cc | 360 ++++++--- .../cpu/tensor/quantize_linear_test.cc | 755 ++++++++++++++++++ 2 files changed, 1024 insertions(+), 91 deletions(-) diff --git a/onnxruntime/core/providers/cpu/quantization/quantize_linear.cc b/onnxruntime/core/providers/cpu/quantization/quantize_linear.cc index 05dea2a05c97b..91e21b3690b27 100644 --- a/onnxruntime/core/providers/cpu/quantization/quantize_linear.cc +++ b/onnxruntime/core/providers/cpu/quantization/quantize_linear.cc @@ -25,10 +25,7 @@ class DequantizeLinear final : public OpKernel { block_size_ = 0; } - // TODO(adrianlizarraga): Support the block_size attribute added in opset 21. - if (block_size_ != 0) { - ORT_THROW("DequantizeLinear does not yet support the 'block_size' attribute."); - } + ORT_ENFORCE(block_size_ >= 0, "'block_size' must be non-negative."); } Status Compute(OpKernelContext* context) const override; @@ -71,31 +68,55 @@ static void PrepareForQDQ(const TensorShape& input_shape, const Tensor& scale, const Tensor* zero_point_ptr, int64_t axis, - int64_t& quant_block_count, // A "quant block" is a block of elems with the same scale/zp - int64_t& axis_dim_val, - int64_t& quant_block_size) { + int64_t quant_block_size, + int64_t& process_block_count, + int64_t& broadcast_dim, + int64_t& process_block_size) { if (IsScalarOr1ElementVector(&scale)) { // per-tensor QuantizeLinear/DequantizeLinear - quant_block_count = 1; - axis_dim_val = 1; - quant_block_size = static_cast(input_shape.Size()); + process_block_count = 1; + broadcast_dim = 1; + process_block_size = static_cast(input_shape.Size()); // enforce that zero point are scalars ORT_ENFORCE(zero_point_ptr == nullptr || IsScalarOr1ElementVector(zero_point_ptr), "x_zero_point must be null or a scalar or 1D tensor or size 1."); - } else { // per-channel QuantizeLinear/DequantizeLinear + ORT_ENFORCE(quant_block_size == 0, "block_size must be 0 for per-tensor quantization."); + } else { // per-axis or blocked QuantizeLinear/DequantizeLinear const int64_t axis_no_neg = HandleNegativeAxis(axis, input_shape.NumDimensions()); - quant_block_count = input_shape.SizeToDimension(onnxruntime::narrow(axis_no_neg)); - axis_dim_val = input_shape[onnxruntime::narrow(axis_no_neg)]; - quant_block_size = input_shape.SizeFromDimension(SafeInt(axis_no_neg) + 1); + process_block_count = input_shape.SizeToDimension(onnxruntime::narrow(axis_no_neg)); + broadcast_dim = input_shape[onnxruntime::narrow(axis_no_neg)]; + process_block_size = input_shape.SizeFromDimension(SafeInt(axis_no_neg) + 1); // if an axis was specified, ensure the scale and zero point are compatible - ORT_ENFORCE(scale.Shape().NumDimensions() == 1 && scale.Shape()[0] == axis_dim_val, - "scale must be 1D tensor with size ", - axis_dim_val); - ORT_ENFORCE(zero_point_ptr == nullptr || - (zero_point_ptr->Shape().NumDimensions() == 1 && zero_point_ptr->Shape()[0] == axis_dim_val), - "x_zero_point must be null or 1D tensor with size ", - axis_dim_val); + if (quant_block_size) { // blocked quantization + ORT_ENFORCE(scale.Shape().NumDimensions() == input_shape.NumDimensions(), + "x_scale and x must have the same rank for blocked quantization"); + ORT_ENFORCE(zero_point_ptr == nullptr || zero_point_ptr->Shape().NumDimensions() == input_shape.NumDimensions(), + "x_zero_point must be null or have the same rank as x for blocked quantization"); + + for (size_t i = 0, ndim = input_shape.NumDimensions(); i < ndim; ++i) { + if (i == SafeInt(axis_no_neg)) { + ORT_ENFORCE(scale.Shape()[i] == (input_shape[i] + quant_block_size 
- 1) / quant_block_size, + "x_scale must be ceil(Di/block_size) on the quantize axis i for blocked quantization"); + } else { + ORT_ENFORCE(scale.Shape()[i] == input_shape[i], + "x_scale and x must have the same shape despite the quantize axis for blocked quantization"); + } + + if (zero_point_ptr) { + ORT_ENFORCE(zero_point_ptr->Shape()[i] == scale.Shape()[i], + "x_zero_point and x_scale must have the same shape for blocked quantization"); + } + } + } else { // per-axis quantization + ORT_ENFORCE(scale.Shape().NumDimensions() == 1 && scale.Shape()[0] == broadcast_dim, + "For per axis quantization, scale must be 1D tensor with size ", + broadcast_dim); + ORT_ENFORCE(zero_point_ptr == nullptr || (zero_point_ptr->Shape().NumDimensions() == 1 && + zero_point_ptr->Shape()[0] == broadcast_dim), + "For per axis quantization, x_zero_point must be null or 1D tensor with size ", + broadcast_dim); + } } } @@ -244,66 +265,198 @@ ONNX_CPU_OPERATOR_TYPED_MS_KERNEL( } // namespace contrib #endif // !defined(DISABLE_CONTRIB_OPS) +template +struct DequantizeLinearApply; + +// The dimensions before quantize axis and after quantize axis can be flattened. +// After flattening, the tensor can be represented by a rank-3 tensor. +// If the quantization happens on the first or last axis, the flattened tensor is +// effectively rank-2. +// For per tensor quantization, the tensor is effectively rank-1. template -struct DequantizeLinearApply { - void op(int64_t N, int64_t axis_dim_val, int64_t quant_block_size, const T* input, const OutT* scale, OutT* output, - const T* zero_point) { - for (size_t n = 0; n < static_cast(N); n++) { - for (size_t bd = 0; bd < static_cast(axis_dim_val); bd++) { - auto zp = zero_point ? static_cast(zero_point[bd]) : 0; - auto sc = static_cast(scale[bd]); - for (size_t bs = 0; bs < static_cast(quant_block_size); bs++) { +struct DequantizeLinearApply { + /** + * @brief Calculate per-tensor/layer or per-axis quantization of DequantizeLinear on the + * flattened tensors. + * @param[in] M size of dimensions before the quantize axis + * @param[in] K dimension on the quantize axis + * @param[in] N size of dimensions after the quantize axis + * @param[in] input 1D array of flattened [D0, ..., Di, ..., Dn] + * @param[in] scale scalar for per-tensor/layer quantization and 1D array [Di] + * for per-axis quantization. i is the quantize axis. + * @param[out] output same shape as input + * @param[in] zero_point same shape as scale + */ + void op(size_t M, size_t K, size_t N, const T* input, + const OutT* scale, OutT* output, const T* zero_point) { + for (size_t m = 0; m < M; m++) { + for (size_t k = 0; k < K; k++) { + auto zp = zero_point ? static_cast(zero_point[k]) : 0; + auto sc = static_cast(scale[k]); + for (size_t n = 0; n < N; n++) { *output++ = static_cast(static_cast(static_cast(*input++) - zp) * sc); } } } } + + /** + * @brief Calculate blocked quantization of DequantizeLinear on the flattened tensors. + * TODO(fajin): add mlas kernel to utilize multithreading, refer MlasDequantizeBlockwise. + * @param[in] M size of dimensions before the quantize axis + * @param[in] K dimension of the quantize axis + * @param[in] N size of dimensions after the quantize axis + * @param[in] quant_block_size quantize block size along the quantize axis + * @param[in] input 1D array of flattened [D0, ..., Di, ..., Dn] + * @param[in] scale 1D array of flattened [D0, ..., ceil(Di/quant_block_size), ..., Dn]. + * i is the quantize axis. 
+ * @param[out] output same shape as input + * @param[in] zero_point same shape as scale + */ + void op(size_t M, size_t K, size_t N, size_t quant_block_size, + const T* input, const OutT* scale, OutT* output, const T* zero_point) { + if (zero_point) { + for (size_t m = 0; m < M; m++) { + for (size_t bd = 0; bd < K; bd += quant_block_size) { + for (size_t qb = 0, qb_end = std::min(quant_block_size, K - bd); qb < qb_end; ++qb) { + // within the quantize block, the zero point and scale are the same. + for (size_t bs = 0; bs < N; bs++) { + auto zp = static_cast(zero_point[bs]); + auto sc = static_cast(scale[bs]); + *output++ = static_cast(static_cast(static_cast(*input++) - zp) * sc); + } + } + + // move to the next quantize block + zero_point += N; + scale += N; + } + } + } else { + for (size_t m = 0; m < M; m++) { + for (size_t bd = 0; bd < K; bd += quant_block_size) { + for (size_t qb = 0, qb_end = std::min(quant_block_size, K - bd); qb < qb_end; ++qb) { + // within the quantize block, the zero point and scale are the same. + for (size_t bs = 0; bs < N; bs++) { + auto sc = static_cast(scale[bs]); + *output++ = static_cast(static_cast(static_cast(*input++)) * sc); + } + } + + // move to the next quantize block + scale += N; + } + } + } + } }; -#define DEQUANTIZE_LINEAR_APPLY_INT4(T) \ - template \ - struct DequantizeLinearApply { \ - void op(int64_t N, int64_t axis_dim_val, int64_t quant_block_size, const T* input, const OutT* scale, \ - OutT* output, const T* zero_point) { \ - size_t input_index = 0; \ - for (size_t n = 0; n < static_cast(N); n++) { \ - for (size_t bd = 0; bd < static_cast(axis_dim_val); bd++) { \ - size_t bd_i = bd >> 1; /*bd / 2*/ \ - size_t bd_j = bd & 0x1; /*bd % 2*/ \ - auto zp = zero_point ? static_cast(zero_point[bd_i].GetElem(bd_j)) : 0; \ - auto sc = static_cast(scale[bd]); \ - for (size_t bs = 0; bs < static_cast(quant_block_size); bs++) { \ - size_t input_i = input_index >> 1; \ - size_t input_j = input_index & 0x1; \ - int32_t val = static_cast(input[input_i].GetElem(input_j)); \ - *output++ = static_cast(static_cast(val - zp) * sc); \ - input_index += 1; \ - } \ - } \ - } \ - assert(input_index == static_cast(N * axis_dim_val * quant_block_size)); \ - } \ - }; +template +struct DequantizeLinearApply { + // per-tensor/layer or per-axis quantization + void op(size_t M, size_t K, size_t N, + const T* input, const OutT* scale, OutT* output, const T* zero_point) { + size_t input_index = 0; + + for (size_t m = 0; m < M; m++) { + for (size_t bd = 0; bd < K; bd++) { + size_t bd_i = bd >> 1; /*bd / 2*/ + size_t bd_j = bd & 0x1; /*bd % 2*/ + auto zp = zero_point ? static_cast(zero_point[bd_i].GetElem(bd_j)) : 0; + auto sc = static_cast(scale[bd]); + + for (size_t bs = 0; bs < N; bs++) { + size_t input_i = input_index >> 1; + size_t input_j = input_index & 0x1; + int32_t val = static_cast(input[input_i].GetElem(input_j)); + *output++ = static_cast(static_cast(val - zp) * sc); + input_index += 1; + } + } + } + + assert(input_index == M * K * N); + } + + // Blocked quantization + // TODO(fajin) : add mlas kernel to utilize multithreading, refer MlasDequantizeBlockwise. 
+ void op(size_t M, size_t K, size_t N, size_t quant_block_size, + const T* input, const OutT* scale, OutT* output, const T* zero_point) { + size_t input_index = 0; + + if (zero_point) { + size_t zp_index = 0; + + for (size_t n = 0; n < M; n++) { + for (size_t bd = 0; bd < K; bd += quant_block_size) { + for (size_t qb = 0, qb_end = std::min(quant_block_size, K - bd); qb < qb_end; ++qb) { + auto q_zp_index = zp_index; + for (size_t bs = 0; bs < N; ++bs, ++input_index, ++q_zp_index) { + auto zp = static_cast(zero_point[q_zp_index >> 1].GetElem(q_zp_index & 0x1)); + auto sc = static_cast(scale[bs]); + + int32_t val = static_cast(input[input_index >> 1].GetElem(input_index & 0x1)); + *output++ = static_cast(static_cast(val - zp) * sc); + } + } + + scale += N; + zp_index += N; + } + } + } else { + for (size_t n = 0; n < M; n++) { + for (size_t bd = 0; bd < K; bd += quant_block_size) { + for (size_t qb = 0, qb_end = std::min(quant_block_size, K - bd); qb < qb_end; ++qb) { + for (size_t bs = 0; bs < N; ++bs, ++input_index) { + auto sc = static_cast(scale[bs]); + + int32_t val = static_cast(input[input_index >> 1].GetElem(input_index & 0x1)); + *output++ = static_cast(static_cast(val) * sc); + } + } + + scale += N; + } + } + } -DEQUANTIZE_LINEAR_APPLY_INT4(Int4x2); -DEQUANTIZE_LINEAR_APPLY_INT4(UInt4x2); + assert(input_index == M * K * N); + } +}; #if !defined(DISABLE_FLOAT8_TYPES) -#define DEQUANTIZE_LINEAR_APPLY_FLOAT8(T) \ - template \ - struct DequantizeLinearApply { \ - void op(int64_t N, int64_t axis_dim_val, int64_t quant_block_size, const T* input, const OutT* scale, \ - OutT* output, const T*) { \ - for (size_t n = 0; n < static_cast(N); n++) { \ - for (size_t bd = 0; bd < static_cast(axis_dim_val); bd++) { \ - auto sc = scale[bd]; \ - for (size_t bs = 0; bs < static_cast(quant_block_size); bs++, input++) { \ - *output++ = static_cast(input->ToFloat() * sc); \ - } \ - } \ - } \ - } \ +#define DEQUANTIZE_LINEAR_APPLY_FLOAT8(T) \ + template \ + struct DequantizeLinearApply { \ + /* Per-tensor/layer or per-axis quantization */ \ + void op(size_t M, size_t K, size_t N, \ + const T* input, const OutT* scale, OutT* output, const T*) { \ + for (size_t m = 0; m < M; m++) { \ + for (size_t bd = 0; bd < K; bd++) { \ + auto sc = scale[bd]; \ + for (size_t bs = 0; bs < N; bs++, input++) { \ + *output++ = static_cast(input->ToFloat() * sc); \ + } \ + } \ + } \ + } \ + /* Blocked quantization */ \ + void op(size_t M, size_t K, size_t N, size_t quant_block_size, \ + const T* input, const OutT* scale, OutT* output, const T*) { \ + for (size_t m = 0; m < M; m++) { \ + for (size_t bd = 0; bd < K; bd += quant_block_size) { \ + for (size_t qb = 0, qb_end = std::min(quant_block_size, K - bd); qb < qb_end; ++qb) { \ + for (size_t bs = 0; bs < N; bs++, input++) { \ + auto sc = static_cast(scale[bs]); \ + *output++ = static_cast(input->ToFloat() * sc); \ + } \ + } \ + scale += N; \ + } \ + } \ + } \ }; DEQUANTIZE_LINEAR_APPLY_FLOAT8(Float8E4M3FN) @@ -323,11 +476,12 @@ Status DequantizeLinear::Compute(OpKernelContext* ctx) const { const auto& x_shape = x.Shape(); auto& y = *ctx->Output(0, x_shape); - int64_t N; - int64_t axis_dim_val; - int64_t quant_block_size; + int64_t process_block_count; + int64_t broadcast_dim; + int64_t process_block_size; - PrepareForQDQ(x.Shape(), x_scale, x_zero_point, axis_, N, axis_dim_val, quant_block_size); + PrepareForQDQ(x.Shape(), x_scale, x_zero_point, axis_, block_size_, + process_block_count, broadcast_dim, process_block_size); const T* zero_point = x_zero_point ? 
x_zero_point->Data() : nullptr; @@ -345,15 +499,38 @@ Status DequantizeLinear::Compute(OpKernelContext* ctx) const { const auto to = x_scale.GetElementType(); const T* input = x.Data(); + constexpr bool is_4bit = boost::mp11::mp_contains, T>::value; if (to == ONNX_NAMESPACE::TensorProto::FLOAT) { const float* scale = x_scale.Data(); float* output = y.MutableData(); - DequantizeLinearApply().op(N, axis_dim_val, quant_block_size, input, scale, output, zero_point); + if (block_size_) { + DequantizeLinearApply().op(static_cast(process_block_count), + static_cast(broadcast_dim), + static_cast(process_block_size), + static_cast(block_size_), + input, scale, output, zero_point); + } else { + DequantizeLinearApply().op(static_cast(process_block_count), + static_cast(broadcast_dim), + static_cast(process_block_size), + input, scale, output, zero_point); + } } else if (to == ONNX_NAMESPACE::TensorProto::FLOAT16) { const MLFloat16* scale = x_scale.Data(); MLFloat16* output = y.MutableData(); - DequantizeLinearApply().op(N, axis_dim_val, quant_block_size, input, scale, output, zero_point); + if (block_size_) { + DequantizeLinearApply().op(static_cast(process_block_count), + static_cast(broadcast_dim), + static_cast(process_block_size), + static_cast(block_size_), + input, scale, output, zero_point); + } else { + DequantizeLinearApply().op(static_cast(process_block_count), + static_cast(broadcast_dim), + static_cast(process_block_size), + input, scale, output, zero_point); + } } else if (to == ONNX_NAMESPACE::TensorProto::BFLOAT16) { ORT_THROW("DequantizeLinear into BFLOAT16 is not implemented yet."); } else { @@ -524,14 +701,14 @@ void ParQuantizeLinear(const InputType* Input, } template -void ComputeLoop(OpKernelContext* ctx, const InT* input, const InT* scale, const T* zero_point, T* output, int64_t N, - int64_t axis_dim_val, int64_t quant_block_size, bool saturate) { - for (size_t n = 0; n < static_cast(N); n++) { - for (size_t bd = 0; bd < static_cast(axis_dim_val); bd++) { - ParQuantizeLinear(input, output, static_cast(quant_block_size), scale[bd], bd, zero_point, saturate, - ctx->GetOperatorThreadPool()); - input += quant_block_size; - output += quant_block_size; +void ComputeLoop(OpKernelContext* ctx, const InT* input, const InT* scale, const T* zero_point, T* output, + int64_t process_block_count, int64_t broadcast_dim, int64_t process_block_size, bool saturate) { + for (size_t n = 0; n < static_cast(process_block_count); n++) { + for (size_t bd = 0; bd < static_cast(broadcast_dim); bd++) { + ParQuantizeLinear(input, output, static_cast(process_block_size), scale[bd], bd, zero_point, + saturate, ctx->GetOperatorThreadPool()); + input += process_block_size; + output += process_block_size; } } } @@ -611,20 +788,21 @@ Status QuantizeLinear::Compute(OpKernelContext* ctx) const { const auto& x_shape = x.Shape(); auto& y = *ctx->Output(0, x_shape); - int64_t N; - int64_t axis_dim_val; - int64_t quant_block_size; - PrepareForQDQ(x.Shape(), y_scale, y_zero_point, axis_, N, axis_dim_val, quant_block_size); + int64_t process_block_count; + int64_t broadcast_dim; + int64_t process_block_size; + PrepareForQDQ(x.Shape(), y_scale, y_zero_point, axis_, block_size_, + process_block_count, broadcast_dim, process_block_size); const T* zero_point = y_zero_point != nullptr ? 
y_zero_point->Data() : nullptr; T* output = y.MutableData(); if (x.IsDataType()) { - ComputeLoop(ctx, x.Data(), y_scale.Data(), zero_point, output, N, axis_dim_val, - quant_block_size, saturate_); + ComputeLoop(ctx, x.Data(), y_scale.Data(), zero_point, output, + process_block_count, broadcast_dim, process_block_size, saturate_); } else if (x.IsDataType()) { - ComputeLoop(ctx, x.Data(), y_scale.Data(), zero_point, output, N, - axis_dim_val, quant_block_size, saturate_); + ComputeLoop(ctx, x.Data(), y_scale.Data(), zero_point, output, + process_block_count, broadcast_dim, process_block_size, saturate_); } else { ORT_THROW("Unsupported input type."); } diff --git a/onnxruntime/test/providers/cpu/tensor/quantize_linear_test.cc b/onnxruntime/test/providers/cpu/tensor/quantize_linear_test.cc index 5eeda5a3b8949..054dcfc75b92e 100644 --- a/onnxruntime/test/providers/cpu/tensor/quantize_linear_test.cc +++ b/onnxruntime/test/providers/cpu/tensor/quantize_linear_test.cc @@ -794,5 +794,760 @@ TEST(QuantizeLinearOpMLFloat16Test, Float8) { #endif +namespace blocked_dequantization { + +template +void DequantizeLinearOp21BlockedTest_InvalidBlockSize_Int(int64_t block_size, + int64_t scale_block_count, + int64_t zero_point_block_count) { + OpTester test("DequantizeLinear", 21); + std::vector dims{2, 4}; + std::vector x_scale, y; + std::vector x, x_zero_point; + SessionOptions so; + std::vector log_msgs; // redirect error messages + std::vector> eps; + eps.push_back(DefaultCpuExecutionProvider()); + so.user_logging_function = [](void* param, OrtLoggingLevel severity, const char* category, + const char* logid, const char* code_location, const char* message) { + ORT_UNUSED_PARAMETER(severity); + ORT_UNUSED_PARAMETER(category); + ORT_UNUSED_PARAMETER(logid); + ORT_UNUSED_PARAMETER(code_location); + std::vector* v_ptr = reinterpret_cast*>(param); + std::vector& msg_vector = *v_ptr; + msg_vector.push_back(std::string(message)); + }; + so.user_logging_param = &log_msgs; + so.session_logid = "DequantizeLinear"; + so.use_per_session_threads = false; + so.session_log_verbosity_level = 1; + so.graph_optimization_level = TransformerLevel::Default; + + for (int64_t i = 0, n = 2 * zero_point_block_count; i < n; ++i) x_zero_point.push_back(0); + for (int64_t i = 0, n = 2 * scale_block_count; i < n; i++) x_scale.push_back(Tout(2.0f)); + for (int i = 0; i < 8; ++i) { + x.push_back(i); + y.push_back(Tout(static_cast(i) * 2.0f)); + } + + test.AddInput("x", dims, x); + test.AddAttribute("axis", 1); + test.AddAttribute("block_size", block_size); + test.AddInput("x_scale", {2, scale_block_count}, x_scale); + test.AddInput("x_zero_point", {2, zero_point_block_count}, x_zero_point); + test.AddOutput("y", dims, y); + test.Run(so, OpTester::ExpectResult::kExpectFailure, "", {}, nullptr, &eps); +} + +template +void DequantizeLinearOp21BlockedTest_InvalidBlockSize_Int4(int64_t block_size, + int64_t scale_block_count, + int64_t zero_point_block_count) { + OpTester test("DequantizeLinear", 21); + std::vector dims{2, 4}; + std::vector x_scale, y; + std::vector x, x_zero_point; + SessionOptions so; + std::vector log_msgs; // redirect error messages + std::vector> eps; + eps.push_back(DefaultCpuExecutionProvider()); + so.user_logging_function = [](void* param, OrtLoggingLevel severity, const char* category, + const char* logid, const char* code_location, const char* message) { + ORT_UNUSED_PARAMETER(severity); + ORT_UNUSED_PARAMETER(category); + ORT_UNUSED_PARAMETER(logid); + ORT_UNUSED_PARAMETER(code_location); + std::vector* v_ptr = 
reinterpret_cast*>(param); + std::vector& msg_vector = *v_ptr; + msg_vector.push_back(std::string(message)); + }; + so.user_logging_param = &log_msgs; + so.session_logid = "DequantizeLinear"; + so.use_per_session_threads = false; + so.session_log_verbosity_level = 1; + so.graph_optimization_level = TransformerLevel::Default; + + for (int64_t i = 0, n = zero_point_block_count; i < n; ++i) x_zero_point.push_back(Tin(0, 0)); + for (int64_t i = 0, n = 2 * scale_block_count; i < n; i++) x_scale.push_back(Tout(2.0f)); + for (int i = 0; i < 8; ++i) { + if (i & 1) x.push_back(Tin(i - 1, i)); + y.push_back(Tout(static_cast(i) * 2.0f)); + } + + test.AddInput("x", dims, x); + test.AddAttribute("axis", 1); + test.AddAttribute("block_size", block_size); + test.AddInput("x_scale", {2, scale_block_count}, x_scale); + test.AddInput("x_zero_point", {2, zero_point_block_count}, x_zero_point); + test.AddOutput("y", dims, y); + test.Run(so, OpTester::ExpectResult::kExpectFailure, "", {}, nullptr, &eps); +} + +template +void DequantizeLinearOp21BlockedTest_InvalidBlockSize_Float8(int64_t block_size, + int64_t scale_block_count, + int64_t zero_point_block_count) { + OpTester test("DequantizeLinear", 21); + std::vector dims{2, 4}; + std::vector x_scale, y; + std::vector x, x_zero_point; + SessionOptions so; + std::vector log_msgs; // redirect error messages + std::vector> eps; + eps.push_back(DefaultCpuExecutionProvider()); + so.user_logging_function = [](void* param, OrtLoggingLevel severity, const char* category, + const char* logid, const char* code_location, const char* message) { + ORT_UNUSED_PARAMETER(severity); + ORT_UNUSED_PARAMETER(category); + ORT_UNUSED_PARAMETER(logid); + ORT_UNUSED_PARAMETER(code_location); + std::vector* v_ptr = reinterpret_cast*>(param); + std::vector& msg_vector = *v_ptr; + msg_vector.push_back(std::string(message)); + }; + so.user_logging_param = &log_msgs; + so.session_logid = "DequantizeLinear"; + so.use_per_session_threads = false; + so.session_log_verbosity_level = 1; + so.graph_optimization_level = TransformerLevel::Default; + + for (int64_t i = 0, n = 2 * zero_point_block_count; i < n; i++) x_zero_point.push_back(Tin(0.0f)); + for (int64_t i = 0, n = 2 * scale_block_count; i < n; i++) x_scale.push_back(Tout(2.0f)); + for (int i = 0; i < 8; ++i) x.push_back(Tin(static_cast(i))); + for (int i = 0; i < 8; ++i) y.push_back(Tout(static_cast(i) * 2.0f)); + + test.AddInput("x", dims, x); + test.AddAttribute("axis", 1); + test.AddAttribute("block_size", block_size); + test.AddInput("x_scale", {2, scale_block_count}, x_scale); + test.AddInput("x_zero_point", {2, zero_point_block_count}, x_zero_point); + test.AddOutput("y", dims, y); + test.Run(so, OpTester::ExpectResult::kExpectFailure, "", {}, nullptr, &eps); +} + +// test negative block size fail +TEST(DequantizeLinearOp21BlockedTest, NagativeBlockSize_Int) { + DequantizeLinearOp21BlockedTest_InvalidBlockSize_Int4(-1, 2, 2); + DequantizeLinearOp21BlockedTest_InvalidBlockSize_Int4(-1, 2, 2); + DequantizeLinearOp21BlockedTest_InvalidBlockSize_Int4(-2, 2, 2); + DequantizeLinearOp21BlockedTest_InvalidBlockSize_Int4(-2, 2, 2); + DequantizeLinearOp21BlockedTest_InvalidBlockSize_Int(-3, 2, 2); + DequantizeLinearOp21BlockedTest_InvalidBlockSize_Int(-3, 2, 2); + DequantizeLinearOp21BlockedTest_InvalidBlockSize_Int(-4, 2, 2); + DequantizeLinearOp21BlockedTest_InvalidBlockSize_Int(-4, 2, 2); + DequantizeLinearOp21BlockedTest_InvalidBlockSize_Int(-5, 2, 2); + DequantizeLinearOp21BlockedTest_InvalidBlockSize_Int(-5, 2, 2); + 
DequantizeLinearOp21BlockedTest_InvalidBlockSize_Int(-6, 2, 2); + DequantizeLinearOp21BlockedTest_InvalidBlockSize_Int(-1, 2, 2); + DequantizeLinearOp21BlockedTest_InvalidBlockSize_Int(-1, 2, 2); + DequantizeLinearOp21BlockedTest_InvalidBlockSize_Int(-1, 2, 2); +} + +#if !defined(DISABLE_FLOAT8_TYPES) +TEST(DequantizeLinearOp21BlockedTest, NagativeBlockSize_Float8) { + constexpr int min_cuda_architecture = 11080; + bool enable_cuda = (nullptr != DefaultCpuExecutionProvider().get()) && HasCudaEnvironment(min_cuda_architecture); + bool enable_cpu = (nullptr != DefaultCpuExecutionProvider().get()); + + if (enable_cpu || enable_cuda) { + DequantizeLinearOp21BlockedTest_InvalidBlockSize_Float8(-1, 2, 2); + DequantizeLinearOp21BlockedTest_InvalidBlockSize_Float8(-2, 2, 2); + } + if (enable_cpu) { + DequantizeLinearOp21BlockedTest_InvalidBlockSize_Float8(-3, 2, 2); + DequantizeLinearOp21BlockedTest_InvalidBlockSize_Float8(-4, 2, 2); + } + if (enable_cpu || enable_cuda) { + DequantizeLinearOp21BlockedTest_InvalidBlockSize_Float8(-5, 2, 2); + DequantizeLinearOp21BlockedTest_InvalidBlockSize_Float8(-6, 2, 2); + } + if (enable_cpu) { + DequantizeLinearOp21BlockedTest_InvalidBlockSize_Float8(-1, 2, 2); + DequantizeLinearOp21BlockedTest_InvalidBlockSize_Float8(-1, 2, 2); + } +} +#endif + +// test block size incompatible with x_scale shape fail +TEST(DequantizeLinearOp21BlockedTest, IncompatibleBlockSizeWithX_Int) { + DequantizeLinearOp21BlockedTest_InvalidBlockSize_Int4(3, 1, 1); + DequantizeLinearOp21BlockedTest_InvalidBlockSize_Int4(3, 3, 3); + DequantizeLinearOp21BlockedTest_InvalidBlockSize_Int4(3, 3, 3); + DequantizeLinearOp21BlockedTest_InvalidBlockSize_Int4(3, 1, 1); + DequantizeLinearOp21BlockedTest_InvalidBlockSize_Int(3, 1, 1); + DequantizeLinearOp21BlockedTest_InvalidBlockSize_Int(3, 3, 3); + DequantizeLinearOp21BlockedTest_InvalidBlockSize_Int(3, 3, 3); + DequantizeLinearOp21BlockedTest_InvalidBlockSize_Int(3, 1, 1); + DequantizeLinearOp21BlockedTest_InvalidBlockSize_Int(3, 1, 1); + DequantizeLinearOp21BlockedTest_InvalidBlockSize_Int(3, 3, 3); + DequantizeLinearOp21BlockedTest_InvalidBlockSize_Int(3, 3, 3); + DequantizeLinearOp21BlockedTest_InvalidBlockSize_Int(3, 1, 1); + DequantizeLinearOp21BlockedTest_InvalidBlockSize_Int(3, 3, 3); + DequantizeLinearOp21BlockedTest_InvalidBlockSize_Int(3, 1, 1); +} + +#if !defined(DISABLE_FLOAT8_TYPES) +TEST(DequantizeLinearOp21BlockedTest, IncompatibleBlockSizeWithX_Float8) { + constexpr int min_cuda_architecture = 11080; + bool enable_cuda = (nullptr != DefaultCpuExecutionProvider().get()) && HasCudaEnvironment(min_cuda_architecture); + bool enable_cpu = (nullptr != DefaultCpuExecutionProvider().get()); + + if (enable_cpu || enable_cuda) { + DequantizeLinearOp21BlockedTest_InvalidBlockSize_Float8(3, 1, 1); + DequantizeLinearOp21BlockedTest_InvalidBlockSize_Float8(3, 3, 3); + DequantizeLinearOp21BlockedTest_InvalidBlockSize_Float8(3, 1, 1); + DequantizeLinearOp21BlockedTest_InvalidBlockSize_Float8(3, 3, 3); + } + if (enable_cpu) { + DequantizeLinearOp21BlockedTest_InvalidBlockSize_Float8(3, 1, 1); + DequantizeLinearOp21BlockedTest_InvalidBlockSize_Float8(3, 3, 3); + DequantizeLinearOp21BlockedTest_InvalidBlockSize_Float8(3, 1, 1); + DequantizeLinearOp21BlockedTest_InvalidBlockSize_Float8(3, 3, 3); + } +} +#endif + +// test x_scale vs. 
x_zero_point shape incompatible fail +TEST(DequantizeLinearOp21BlockedTest, ScaleShapeUnmatchZeroPoint_Int) { + DequantizeLinearOp21BlockedTest_InvalidBlockSize_Int4(3, 2, 1); + DequantizeLinearOp21BlockedTest_InvalidBlockSize_Int4(3, 2, 3); + DequantizeLinearOp21BlockedTest_InvalidBlockSize_Int4(3, 2, 3); + DequantizeLinearOp21BlockedTest_InvalidBlockSize_Int4(3, 2, 1); + DequantizeLinearOp21BlockedTest_InvalidBlockSize_Int(3, 2, 1); + DequantizeLinearOp21BlockedTest_InvalidBlockSize_Int(3, 2, 3); + DequantizeLinearOp21BlockedTest_InvalidBlockSize_Int(3, 2, 3); + DequantizeLinearOp21BlockedTest_InvalidBlockSize_Int(3, 2, 1); + DequantizeLinearOp21BlockedTest_InvalidBlockSize_Int(3, 2, 1); + DequantizeLinearOp21BlockedTest_InvalidBlockSize_Int(3, 2, 3); + DequantizeLinearOp21BlockedTest_InvalidBlockSize_Int(3, 2, 3); + DequantizeLinearOp21BlockedTest_InvalidBlockSize_Int(3, 2, 1); + DequantizeLinearOp21BlockedTest_InvalidBlockSize_Int(3, 2, 3); + DequantizeLinearOp21BlockedTest_InvalidBlockSize_Int(3, 2, 1); +} + +#if !defined(DISABLE_FLOAT8_TYPES) +TEST(DequantizeLinearOp21BlockedTest, ScaleShapeUnmatchZeroPoint_Float8) { + constexpr int min_cuda_architecture = 11080; + bool enable_cuda = (nullptr != DefaultCpuExecutionProvider().get()) && HasCudaEnvironment(min_cuda_architecture); + bool enable_cpu = (nullptr != DefaultCpuExecutionProvider().get()); + + if (enable_cpu || enable_cuda) { + DequantizeLinearOp21BlockedTest_InvalidBlockSize_Float8(3, 2, 1); + DequantizeLinearOp21BlockedTest_InvalidBlockSize_Float8(3, 2, 3); + DequantizeLinearOp21BlockedTest_InvalidBlockSize_Float8(3, 2, 1); + DequantizeLinearOp21BlockedTest_InvalidBlockSize_Float8(3, 2, 3); + } + if (enable_cpu) { + DequantizeLinearOp21BlockedTest_InvalidBlockSize_Float8(3, 2, 1); + DequantizeLinearOp21BlockedTest_InvalidBlockSize_Float8(3, 2, 3); + DequantizeLinearOp21BlockedTest_InvalidBlockSize_Float8(3, 2, 1); + DequantizeLinearOp21BlockedTest_InvalidBlockSize_Float8(3, 2, 3); + } +} +#endif + +// test DQ with blocked quantization succeed +template +void DequantizeLinearOp21BlockedTest_Int4_Succeed(std::vector&& dims, + int64_t axis, + int64_t block_size, + std::vector& x_, + std::vector& x_scale_, + std::vector& x_zero_point_, + std::vector& y_) { + OpTester test("DequantizeLinear", 21); + std::vector x_scale_shape; + std::vector x_scale, y; + std::vector x, x_zero_point; + std::vector> eps; + eps.push_back(DefaultCpuExecutionProvider()); + + int64_t non_neg_axis = axis < 0 ? axis + dims.size() : axis; + bool use_zero_point = !x_zero_point_.empty(); + + for (auto v : y_) y.push_back(Tout(v)); + for (auto v : x_scale_) x_scale.push_back(Tout(v)); + for (size_t i = 0, n = dims.size(); i < n; ++i) { + x_scale_shape.push_back((int64_t)i == non_neg_axis ? 
(dims[i] + block_size - 1) / block_size : dims[i]); + } + + size_t i = 0, n = x_.size(); + for (; i < n - 1; i += 2) x.push_back(Tin(x_[i], x_[i + 1])); + if (i < n) x.push_back(Tin(x_[i], 0xF)); + + if (use_zero_point) { + i = 0, n = x_zero_point_.size(); + for (; i < n - 1; i += 2) x_zero_point.push_back(Tin(x_zero_point_[i], x_zero_point_[i + 1])); + if (i < n) x_zero_point.push_back(Tin(x_zero_point_[i], 0xF)); + } + + test.AddInput("x", dims, x); + test.AddAttribute("axis", axis); + test.AddAttribute("block_size", block_size); + test.AddInput("x_scale", x_scale_shape, x_scale); + if (use_zero_point) test.AddInput("x_zero_point", x_scale_shape, x_zero_point); + test.AddOutput("y", dims, y); + test.Run(BaseTester::ExpectResult::kExpectSuccess, "", {}, nullptr, &eps); +} + +template +void DequantizeLinearOp21BlockedTest_Int_Succeed(std::vector&& dims, + int64_t axis, + int64_t block_size, + std::vector& x_, + std::vector& x_scale_, + std::vector& x_zero_point_, + std::vector& y_) { + OpTester test("DequantizeLinear", 21); + std::vector x_scale_shape; + std::vector x_scale, y; + std::vector x, x_zero_point; + std::vector> eps; + eps.push_back(DefaultCpuExecutionProvider()); + + int64_t non_neg_axis = axis < 0 ? axis + dims.size() : axis; + bool use_zero_point = !x_zero_point_.empty(); + + for (auto v : y_) y.push_back(Tout(v)); + for (auto v : x_scale_) x_scale.push_back(Tout(v)); + for (size_t i = 0, n = dims.size(); i < n; ++i) { + x_scale_shape.push_back((int64_t)i == non_neg_axis ? (dims[i] + block_size - 1) / block_size : dims[i]); + } + for (auto v : x_) x.push_back(v); + if (use_zero_point) + for (auto v : x_zero_point_) x_zero_point.push_back(v); + + test.AddInput("x", dims, x); + test.AddAttribute("axis", axis); + test.AddAttribute("block_size", block_size); + test.AddInput("x_scale", x_scale_shape, x_scale); + if (use_zero_point) test.AddInput("x_zero_point", x_scale_shape, x_zero_point); + test.AddOutput("y", dims, y); + test.Run(BaseTester::ExpectResult::kExpectSuccess, "", {}, nullptr, &eps); +} + +template +void DequantizeLinearOp21BlockedTest_Float8_Succeed(std::vector&& dims, + int64_t axis, + int64_t block_size, + std::vector& x_, + std::vector& x_scale_, + std::vector& x_zero_point_, + std::vector& y_) { + OpTester test("DequantizeLinear", 21); + std::vector x_scale_shape; + std::vector x_scale, y; + std::vector x, x_zero_point; + std::vector> eps; + eps.push_back(DefaultCpuExecutionProvider()); + + int64_t non_neg_axis = axis < 0 ? axis + dims.size() : axis; + bool use_zero_point = !x_zero_point_.empty(); + + for (auto v : y_) y.push_back(Tout(v)); + for (auto v : x_scale_) x_scale.push_back(Tout(v)); + for (size_t i = 0, n = dims.size(); i < n; ++i) { + x_scale_shape.push_back((int64_t)i == non_neg_axis ? 
(dims[i] + block_size - 1) / block_size : dims[i]); + } + + for (auto v : x_) x.push_back(Tin(static_cast(v))); + if (use_zero_point) { + for (auto v : x_zero_point_) x_zero_point.push_back(Tin(static_cast(v))); + } + + test.AddInput("x", dims, x); + test.AddAttribute("axis", axis); + test.AddAttribute("block_size", block_size); + test.AddInput("x_scale", x_scale_shape, x_scale); + if (use_zero_point) test.AddInput("x_zero_point", x_scale_shape, x_zero_point); + test.AddOutput("y", dims, y); + test.Run(BaseTester::ExpectResult::kExpectSuccess, "", {}, nullptr, &eps); +} + +TEST(DequantizeLinearOp21BlockedTest, SignedInt_NoZeroPoint_FirstAxis) { + std::vector x_scale{-2.0, -4.0, 3.5, 1.0, 2.0, 4.0, -3.5, -1.0}; + std::vector zero_point; + std::vector x{-7, -6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, 6, 7, -8}; + std::vector y_2{14.0, 24.0, -17.5, -4.0, 6.0, 8.0, -3.5, 0.0, 2.0, 8.0, -10.5, -4.0, 10.0, 24.0, -24.5, 8.0}; + std::vector y_3{14.0, 24.0, -17.5, -4.0, 6.0, 8.0, -3.5, 0.0, -2.0, -8.0, 10.5, 4.0, 10.0, 24.0, -24.5, 8.0}; + + DequantizeLinearOp21BlockedTest_Int4_Succeed({4, 2, 2}, 0, 2, x, x_scale, zero_point, y_2); + DequantizeLinearOp21BlockedTest_Int4_Succeed({4, 2, 2}, 0, 3, x, x_scale, zero_point, y_3); + DequantizeLinearOp21BlockedTest_Int_Succeed({4, 2, 2}, 0, 2, x, x_scale, zero_point, y_2); + DequantizeLinearOp21BlockedTest_Int_Succeed({4, 2, 2}, 0, 3, x, x_scale, zero_point, y_3); + DequantizeLinearOp21BlockedTest_Int_Succeed({4, 2, 2}, 0, 2, x, x_scale, zero_point, y_2); + DequantizeLinearOp21BlockedTest_Int_Succeed({4, 2, 2}, 0, 3, x, x_scale, zero_point, y_3); + DequantizeLinearOp21BlockedTest_Int_Succeed({4, 2, 2}, 0, 2, x, x_scale, zero_point, y_2); + DequantizeLinearOp21BlockedTest_Int_Succeed({4, 2, 2}, 0, 3, x, x_scale, zero_point, y_3); + DequantizeLinearOp21BlockedTest_Int4_Succeed({4, 2, 2}, 0, 2, x, x_scale, zero_point, y_2); + DequantizeLinearOp21BlockedTest_Int4_Succeed({4, 2, 2}, 0, 3, x, x_scale, zero_point, y_3); + DequantizeLinearOp21BlockedTest_Int_Succeed({4, 2, 2}, 0, 2, x, x_scale, zero_point, y_2); + DequantizeLinearOp21BlockedTest_Int_Succeed({4, 2, 2}, 0, 3, x, x_scale, zero_point, y_3); + DequantizeLinearOp21BlockedTest_Int_Succeed({4, 2, 2}, 0, 2, x, x_scale, zero_point, y_2); + DequantizeLinearOp21BlockedTest_Int_Succeed({4, 2, 2}, 0, 3, x, x_scale, zero_point, y_3); + DequantizeLinearOp21BlockedTest_Int_Succeed({4, 2, 2}, 0, 2, x, x_scale, zero_point, y_2); + DequantizeLinearOp21BlockedTest_Int_Succeed({4, 2, 2}, 0, 3, x, x_scale, zero_point, y_3); +} + +TEST(DequantizeLinearOp21BlockedTest, SignedInt_UseZeroPoint_FirstAxis) { + std::vector x_scale{-2.0, -4.0, 3.5, 1.0, 2.0, 4.0, -3.5, -1.0}; + std::vector zero_point{-6, -4, -3, -1, 0, 2, 4, 7}; + std::vector x{-7, -6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, 6, 7, -8}; + std::vector y_2{2.0, 8.0, -7.0, -3, -6.0, -8.0, 7.0, 1, 2.0, 0, 3.5, 3.0, 10.0, 16.0, -10.5, 15}; + std::vector y_3{2.0, 8.0, -7.0, -3, -6.0, -8.0, 7.0, 1, -14.0, -24, 21, 5, 10.0, 16.0, -10.5, 15}; + + DequantizeLinearOp21BlockedTest_Int4_Succeed({4, 2, 2}, 0, 2, x, x_scale, zero_point, y_2); + DequantizeLinearOp21BlockedTest_Int4_Succeed({4, 2, 2}, 0, 3, x, x_scale, zero_point, y_3); + DequantizeLinearOp21BlockedTest_Int_Succeed({4, 2, 2}, 0, 2, x, x_scale, zero_point, y_2); + DequantizeLinearOp21BlockedTest_Int_Succeed({4, 2, 2}, 0, 3, x, x_scale, zero_point, y_3); + DequantizeLinearOp21BlockedTest_Int_Succeed({4, 2, 2}, 0, 2, x, x_scale, zero_point, y_2); + DequantizeLinearOp21BlockedTest_Int_Succeed({4, 2, 2}, 0, 3, x, 
x_scale, zero_point, y_3); + DequantizeLinearOp21BlockedTest_Int4_Succeed({4, 2, 2}, 0, 2, x, x_scale, zero_point, y_2); + DequantizeLinearOp21BlockedTest_Int4_Succeed({4, 2, 2}, 0, 3, x, x_scale, zero_point, y_3); + DequantizeLinearOp21BlockedTest_Int_Succeed({4, 2, 2}, 0, 2, x, x_scale, zero_point, y_2); + DequantizeLinearOp21BlockedTest_Int_Succeed({4, 2, 2}, 0, 3, x, x_scale, zero_point, y_3); + DequantizeLinearOp21BlockedTest_Int_Succeed({4, 2, 2}, 0, 2, x, x_scale, zero_point, y_2); + DequantizeLinearOp21BlockedTest_Int_Succeed({4, 2, 2}, 0, 3, x, x_scale, zero_point, y_3); +} + +TEST(DequantizeLinearOp21BlockedTest, SignedInt_NoZeroPoint_MiddleAxis) { + std::vector zero_point{}; + std::vector x_scale{-2.0, -4.0, 3.5, 1.0, 2.0, 4.0, -3.5, -1.0}; + std::vector x{-7, -6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, 6, 7, -8}; + std::vector y_2{14, 24, 10, 16, -10.5, -2, -3.5, 0, 2, 8, 6, 16, -17.5, -6, -24.5, 8}; + std::vector y_3{14, 24, 10, 16, 6, 8, -3.5, 0, 2, 8, 6, 16, 10, 24, -24.5, 8}; + + DequantizeLinearOp21BlockedTest_Int4_Succeed({2, 4, 2}, 1, 2, x, x_scale, zero_point, y_2); + DequantizeLinearOp21BlockedTest_Int4_Succeed({2, 4, 2}, 1, 3, x, x_scale, zero_point, y_3); + DequantizeLinearOp21BlockedTest_Int_Succeed({2, 4, 2}, 1, 2, x, x_scale, zero_point, y_2); + DequantizeLinearOp21BlockedTest_Int_Succeed({2, 4, 2}, 1, 3, x, x_scale, zero_point, y_3); + DequantizeLinearOp21BlockedTest_Int_Succeed({2, 4, 2}, 1, 2, x, x_scale, zero_point, y_2); + DequantizeLinearOp21BlockedTest_Int_Succeed({2, 4, 2}, 1, 3, x, x_scale, zero_point, y_3); + DequantizeLinearOp21BlockedTest_Int_Succeed({2, 4, 2}, 1, 2, x, x_scale, zero_point, y_2); + DequantizeLinearOp21BlockedTest_Int_Succeed({2, 4, 2}, 1, 3, x, x_scale, zero_point, y_3); + DequantizeLinearOp21BlockedTest_Int4_Succeed({2, 4, 2}, 1, 2, x, x_scale, zero_point, y_2); + DequantizeLinearOp21BlockedTest_Int4_Succeed({2, 4, 2}, 1, 3, x, x_scale, zero_point, y_3); + DequantizeLinearOp21BlockedTest_Int_Succeed({2, 4, 2}, 1, 2, x, x_scale, zero_point, y_2); + DequantizeLinearOp21BlockedTest_Int_Succeed({2, 4, 2}, 1, 3, x, x_scale, zero_point, y_3); + DequantizeLinearOp21BlockedTest_Int_Succeed({2, 4, 2}, 1, 2, x, x_scale, zero_point, y_2); + DequantizeLinearOp21BlockedTest_Int_Succeed({2, 4, 2}, 1, 3, x, x_scale, zero_point, y_3); + DequantizeLinearOp21BlockedTest_Int_Succeed({2, 4, 2}, 1, 2, x, x_scale, zero_point, y_2); + DequantizeLinearOp21BlockedTest_Int_Succeed({2, 4, 2}, 1, 3, x, x_scale, zero_point, y_3); +} + +TEST(DequantizeLinearOp21BlockedTest, SignedInt_UseZeroPoint_MiddleAxis) { + std::vector x_scale{-2.0, -4.0, 3.5, 1.0, 2.0, 4.0, -3.5, -1.0}; + std::vector zero_point{-6, -4, -3, -1, 0, 2, 4, 7}; + std::vector x{-7, -6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, 6, 7, -8}; + std::vector y_2{2, 8, -2, 0, 0, -1, 7, 1, 2, 0, 6, 8, -3.5, 1, -10.5, 15}; + std::vector y_3{2, 8, -2, 0, -6, -8, 7, 1, 2, 0, 6, 8, 10, 16, -10.5, 15}; + + DequantizeLinearOp21BlockedTest_Int4_Succeed({2, 4, 2}, 1, 2, x, x_scale, zero_point, y_2); + DequantizeLinearOp21BlockedTest_Int4_Succeed({2, 4, 2}, 1, 3, x, x_scale, zero_point, y_3); + DequantizeLinearOp21BlockedTest_Int_Succeed({2, 4, 2}, 1, 2, x, x_scale, zero_point, y_2); + DequantizeLinearOp21BlockedTest_Int_Succeed({2, 4, 2}, 1, 3, x, x_scale, zero_point, y_3); + DequantizeLinearOp21BlockedTest_Int_Succeed({2, 4, 2}, 1, 2, x, x_scale, zero_point, y_2); + DequantizeLinearOp21BlockedTest_Int_Succeed({2, 4, 2}, 1, 3, x, x_scale, zero_point, y_3); + DequantizeLinearOp21BlockedTest_Int4_Succeed({2, 4, 2}, 1, 2, 
x, x_scale, zero_point, y_2); + DequantizeLinearOp21BlockedTest_Int4_Succeed({2, 4, 2}, 1, 3, x, x_scale, zero_point, y_3); + DequantizeLinearOp21BlockedTest_Int_Succeed({2, 4, 2}, 1, 2, x, x_scale, zero_point, y_2); + DequantizeLinearOp21BlockedTest_Int_Succeed({2, 4, 2}, 1, 3, x, x_scale, zero_point, y_3); + DequantizeLinearOp21BlockedTest_Int_Succeed({2, 4, 2}, 1, 2, x, x_scale, zero_point, y_2); + DequantizeLinearOp21BlockedTest_Int_Succeed({2, 4, 2}, 1, 3, x, x_scale, zero_point, y_3); +} + +TEST(DequantizeLinearOp21BlockedTest, SignedInt_NoZeroPoint_LastAxis) { + std::vector zero_point{}; + std::vector x_scale{-2.0, -4.0, 3.5, 1.0, 2.0, 4.0, -3.5, -1.0}; + std::vector x{-7, -6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, 6, 7, -8}; + std::vector y_2{14, 12, 20, 16, -10.5, -7, -1, 0, 2, 4, 12, 16, -17.5, -21, -7, 8}; + std::vector y_3{14, 12, 10, 16, -10.5, -7, -3.5, 0, 2, 4, 6, 16, -17.5, -21, -24.5, 8}; + + DequantizeLinearOp21BlockedTest_Int4_Succeed({2, 2, 4}, 2, 2, x, x_scale, zero_point, y_2); + DequantizeLinearOp21BlockedTest_Int4_Succeed({2, 2, 4}, 2, 3, x, x_scale, zero_point, y_3); + DequantizeLinearOp21BlockedTest_Int_Succeed({2, 2, 4}, 2, 2, x, x_scale, zero_point, y_2); + DequantizeLinearOp21BlockedTest_Int_Succeed({2, 2, 4}, 2, 3, x, x_scale, zero_point, y_3); + DequantizeLinearOp21BlockedTest_Int_Succeed({2, 2, 4}, 2, 2, x, x_scale, zero_point, y_2); + DequantizeLinearOp21BlockedTest_Int_Succeed({2, 2, 4}, 2, 3, x, x_scale, zero_point, y_3); + DequantizeLinearOp21BlockedTest_Int_Succeed({2, 2, 4}, 2, 2, x, x_scale, zero_point, y_2); + DequantizeLinearOp21BlockedTest_Int_Succeed({2, 2, 4}, 2, 3, x, x_scale, zero_point, y_3); + DequantizeLinearOp21BlockedTest_Int4_Succeed({2, 2, 4}, 2, 2, x, x_scale, zero_point, y_2); + DequantizeLinearOp21BlockedTest_Int4_Succeed({2, 2, 4}, 2, 3, x, x_scale, zero_point, y_3); + DequantizeLinearOp21BlockedTest_Int_Succeed({2, 2, 4}, 2, 2, x, x_scale, zero_point, y_2); + DequantizeLinearOp21BlockedTest_Int_Succeed({2, 2, 4}, 2, 3, x, x_scale, zero_point, y_3); + DequantizeLinearOp21BlockedTest_Int_Succeed({2, 2, 4}, 2, 2, x, x_scale, zero_point, y_2); + DequantizeLinearOp21BlockedTest_Int_Succeed({2, 2, 4}, 2, 3, x, x_scale, zero_point, y_3); + DequantizeLinearOp21BlockedTest_Int_Succeed({2, 2, 4}, 2, 2, x, x_scale, zero_point, y_2); + DequantizeLinearOp21BlockedTest_Int_Succeed({2, 2, 4}, 2, 3, x, x_scale, zero_point, y_3); +} + +TEST(DequantizeLinearOp21BlockedTest, SignedInt_UseZeroPoint_LastAxis) { + std::vector x_scale{-2.0, -4.0, 3.5, 1.0, 2.0, 4.0, -3.5, -1.0}; + std::vector zero_point{-6, -4, -3, -1, 0, 2, 4, 7}; + std::vector x{-7, -6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, 6, 7, -8}; + std::vector y_2{2, 0, 4, 0, 0, 3.5, 0, 1, 2, 4, 4, 8, -3.5, -7, 0, 15}; + std::vector y_3{2, 0, -2, 0, 0, 3.5, 7, 1, 2, 4, 6, 8, -3.5, -7, -10.5, 15}; + + DequantizeLinearOp21BlockedTest_Int4_Succeed({2, 2, 4}, 2, 2, x, x_scale, zero_point, y_2); + DequantizeLinearOp21BlockedTest_Int4_Succeed({2, 2, 4}, 2, 3, x, x_scale, zero_point, y_3); + DequantizeLinearOp21BlockedTest_Int_Succeed({2, 2, 4}, 2, 2, x, x_scale, zero_point, y_2); + DequantizeLinearOp21BlockedTest_Int_Succeed({2, 2, 4}, 2, 3, x, x_scale, zero_point, y_3); + DequantizeLinearOp21BlockedTest_Int_Succeed({2, 2, 4}, 2, 2, x, x_scale, zero_point, y_2); + DequantizeLinearOp21BlockedTest_Int_Succeed({2, 2, 4}, 2, 3, x, x_scale, zero_point, y_3); + DequantizeLinearOp21BlockedTest_Int4_Succeed({2, 2, 4}, 2, 2, x, x_scale, zero_point, y_2); + DequantizeLinearOp21BlockedTest_Int4_Succeed({2, 2, 4}, 2, 
3, x, x_scale, zero_point, y_3); + DequantizeLinearOp21BlockedTest_Int_Succeed({2, 2, 4}, 2, 2, x, x_scale, zero_point, y_2); + DequantizeLinearOp21BlockedTest_Int_Succeed({2, 2, 4}, 2, 3, x, x_scale, zero_point, y_3); + DequantizeLinearOp21BlockedTest_Int_Succeed({2, 2, 4}, 2, 2, x, x_scale, zero_point, y_2); + DequantizeLinearOp21BlockedTest_Int_Succeed({2, 2, 4}, 2, 3, x, x_scale, zero_point, y_3); +} + +TEST(DequantizeLinearOp21BlockedTest, UnsignedInt_NoZeroPoint_FirstAxis) { + std::vector x_scale{-2.0, -4.0, 3.5, 1.0, 2.0, 4.0, -3.5, -1.0}; + std::vector zero_point{}; + std::vector x{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}; + std::vector y_2{0, -4, 7, 3, -8, -20, 21, 7, 16, 36, -35, -11, 24, 52, -49, -15}; + std::vector y_3{0, -4, 7, 3, -8, -20, 21, 7, -16, -36, 35, 11, 24, 52, -49, -15}; + + DequantizeLinearOp21BlockedTest_Int4_Succeed({4, 2, 2}, 0, 2, x, x_scale, zero_point, y_2); + DequantizeLinearOp21BlockedTest_Int4_Succeed({4, 2, 2}, 0, 3, x, x_scale, zero_point, y_3); + DequantizeLinearOp21BlockedTest_Int_Succeed({4, 2, 2}, 0, 2, x, x_scale, zero_point, y_2); + DequantizeLinearOp21BlockedTest_Int_Succeed({4, 2, 2}, 0, 3, x, x_scale, zero_point, y_3); + DequantizeLinearOp21BlockedTest_Int_Succeed({4, 2, 2}, 0, 2, x, x_scale, zero_point, y_2); + DequantizeLinearOp21BlockedTest_Int_Succeed({4, 2, 2}, 0, 3, x, x_scale, zero_point, y_3); + DequantizeLinearOp21BlockedTest_Int4_Succeed({4, 2, 2}, 0, 2, x, x_scale, zero_point, y_2); + DequantizeLinearOp21BlockedTest_Int4_Succeed({4, 2, 2}, 0, 3, x, x_scale, zero_point, y_3); + DequantizeLinearOp21BlockedTest_Int_Succeed({4, 2, 2}, 0, 2, x, x_scale, zero_point, y_2); + DequantizeLinearOp21BlockedTest_Int_Succeed({4, 2, 2}, 0, 3, x, x_scale, zero_point, y_3); + DequantizeLinearOp21BlockedTest_Int_Succeed({4, 2, 2}, 0, 2, x, x_scale, zero_point, y_2); + DequantizeLinearOp21BlockedTest_Int_Succeed({4, 2, 2}, 0, 3, x, x_scale, zero_point, y_3); +} + +TEST(DequantizeLinearOp21BlockedTest, UnsignedInt_UseZeroPoint_FirstAxis) { + std::vector x_scale{-2.0, -4.0, 3.5, 1.0, 2.0, 4.0, -3.5, -1.0}; + std::vector zero_point{2, 0, 1, 9, 13, 5, 11, 6}; + std::vector x{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}; + std::vector y_2{4, -4, 3.5, -6, -4, -20, 17.5, -2, -10, 16, 3.5, -5, -2, 32, -10.5, -9}; + std::vector y_3{4, -4, 3.5, -6, -4, -20, 17.5, -2, -12, -36, 31.5, 2, -2, 32, -10.5, -9}; + + DequantizeLinearOp21BlockedTest_Int4_Succeed({4, 2, 2}, 0, 2, x, x_scale, zero_point, y_2); + DequantizeLinearOp21BlockedTest_Int4_Succeed({4, 2, 2}, 0, 3, x, x_scale, zero_point, y_3); + DequantizeLinearOp21BlockedTest_Int_Succeed({4, 2, 2}, 0, 2, x, x_scale, zero_point, y_2); + DequantizeLinearOp21BlockedTest_Int_Succeed({4, 2, 2}, 0, 3, x, x_scale, zero_point, y_3); + DequantizeLinearOp21BlockedTest_Int_Succeed({4, 2, 2}, 0, 2, x, x_scale, zero_point, y_2); + DequantizeLinearOp21BlockedTest_Int_Succeed({4, 2, 2}, 0, 3, x, x_scale, zero_point, y_3); + DequantizeLinearOp21BlockedTest_Int4_Succeed({4, 2, 2}, 0, 2, x, x_scale, zero_point, y_2); + DequantizeLinearOp21BlockedTest_Int4_Succeed({4, 2, 2}, 0, 3, x, x_scale, zero_point, y_3); + DequantizeLinearOp21BlockedTest_Int_Succeed({4, 2, 2}, 0, 2, x, x_scale, zero_point, y_2); + DequantizeLinearOp21BlockedTest_Int_Succeed({4, 2, 2}, 0, 3, x, x_scale, zero_point, y_3); + DequantizeLinearOp21BlockedTest_Int_Succeed({4, 2, 2}, 0, 2, x, x_scale, zero_point, y_2); + DequantizeLinearOp21BlockedTest_Int_Succeed({4, 2, 2}, 0, 3, x, x_scale, zero_point, y_3); +} + 
+TEST(DequantizeLinearOp21BlockedTest, UnsignedInt_NoZeroPoint_MiddleAxis) { + std::vector x_scale{-2.0, -4.0, 3.5, 1.0, 2.0, 4.0, -3.5, -1.0}; + std::vector zero_point{}; + std::vector x{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}; + std::vector y_2{0, -4, -4, -12, 14, 5, 21, 7, 16, 36, 20, 44, -42, -13, -49, -15}; + std::vector y_3{0, -4, -4, -12, -8, -20, 21, 7, 16, 36, 20, 44, 24, 52, -49, -15}; + + DequantizeLinearOp21BlockedTest_Int4_Succeed({2, 4, 2}, 1, 2, x, x_scale, zero_point, y_2); + DequantizeLinearOp21BlockedTest_Int4_Succeed({2, 4, 2}, 1, 3, x, x_scale, zero_point, y_3); + DequantizeLinearOp21BlockedTest_Int_Succeed({2, 4, 2}, 1, 2, x, x_scale, zero_point, y_2); + DequantizeLinearOp21BlockedTest_Int_Succeed({2, 4, 2}, 1, 3, x, x_scale, zero_point, y_3); + DequantizeLinearOp21BlockedTest_Int_Succeed({2, 4, 2}, 1, 2, x, x_scale, zero_point, y_2); + DequantizeLinearOp21BlockedTest_Int_Succeed({2, 4, 2}, 1, 3, x, x_scale, zero_point, y_3); + DequantizeLinearOp21BlockedTest_Int4_Succeed({2, 4, 2}, 1, 2, x, x_scale, zero_point, y_2); + DequantizeLinearOp21BlockedTest_Int4_Succeed({2, 4, 2}, 1, 3, x, x_scale, zero_point, y_3); + DequantizeLinearOp21BlockedTest_Int_Succeed({2, 4, 2}, 1, 2, x, x_scale, zero_point, y_2); + DequantizeLinearOp21BlockedTest_Int_Succeed({2, 4, 2}, 1, 3, x, x_scale, zero_point, y_3); + DequantizeLinearOp21BlockedTest_Int_Succeed({2, 4, 2}, 1, 2, x, x_scale, zero_point, y_2); + DequantizeLinearOp21BlockedTest_Int_Succeed({2, 4, 2}, 1, 3, x, x_scale, zero_point, y_3); +} + +TEST(DequantizeLinearOp21BlockedTest, UnsignedInt_UseZeroPoint_MiddleAxis) { + std::vector x_scale{-2.0, -4.0, 3.5, 1.0, 2.0, 4.0, -3.5, -1.0}; + std::vector zero_point{2, 0, 1, 9, 13, 5, 11, 6}; + std::vector x{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}; + std::vector y_2{4, -4, 0, -12, 10.5, -4, 17.5, -2, -10, 16, -6, 24, -3.5, -7, -10.5, -9}; + std::vector y_3{4, -4, 0, -12, -4, -20, 17.5, -2, -10, 16, -6, 24, -2, 32, -10.5, -9}; + + DequantizeLinearOp21BlockedTest_Int4_Succeed({2, 4, 2}, 1, 2, x, x_scale, zero_point, y_2); + DequantizeLinearOp21BlockedTest_Int4_Succeed({2, 4, 2}, 1, 3, x, x_scale, zero_point, y_3); + DequantizeLinearOp21BlockedTest_Int_Succeed({2, 4, 2}, 1, 2, x, x_scale, zero_point, y_2); + DequantizeLinearOp21BlockedTest_Int_Succeed({2, 4, 2}, 1, 3, x, x_scale, zero_point, y_3); + DequantizeLinearOp21BlockedTest_Int_Succeed({2, 4, 2}, 1, 2, x, x_scale, zero_point, y_2); + DequantizeLinearOp21BlockedTest_Int_Succeed({2, 4, 2}, 1, 3, x, x_scale, zero_point, y_3); + DequantizeLinearOp21BlockedTest_Int4_Succeed({2, 4, 2}, 1, 2, x, x_scale, zero_point, y_2); + DequantizeLinearOp21BlockedTest_Int4_Succeed({2, 4, 2}, 1, 3, x, x_scale, zero_point, y_3); + DequantizeLinearOp21BlockedTest_Int_Succeed({2, 4, 2}, 1, 2, x, x_scale, zero_point, y_2); + DequantizeLinearOp21BlockedTest_Int_Succeed({2, 4, 2}, 1, 3, x, x_scale, zero_point, y_3); + DequantizeLinearOp21BlockedTest_Int_Succeed({2, 4, 2}, 1, 2, x, x_scale, zero_point, y_2); + DequantizeLinearOp21BlockedTest_Int_Succeed({2, 4, 2}, 1, 3, x, x_scale, zero_point, y_3); +} + +TEST(DequantizeLinearOp21BlockedTest, UnsignedInt_NoZeroPoint_LastAxis) { + std::vector x_scale{-2.0, -4.0, 3.5, 1.0, 2.0, 4.0, -3.5, -1.0}; + std::vector zero_point{}; + std::vector x{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}; + std::vector y_2{0, -2, -8, -12, 14, 17.5, 6, 7, 16, 18, 40, 44, -42, -45.5, -14, -15}; + std::vector y_3{0, -2, -4, -12, 14, 17.5, 21, 7, 16, 18, 20, 44, -42, -45.5, -49, -15}; + + 
DequantizeLinearOp21BlockedTest_Int4_Succeed({2, 2, 4}, 2, 2, x, x_scale, zero_point, y_2); + DequantizeLinearOp21BlockedTest_Int4_Succeed({2, 2, 4}, 2, 3, x, x_scale, zero_point, y_3); + DequantizeLinearOp21BlockedTest_Int_Succeed({2, 2, 4}, 2, 2, x, x_scale, zero_point, y_2); + DequantizeLinearOp21BlockedTest_Int_Succeed({2, 2, 4}, 2, 3, x, x_scale, zero_point, y_3); + DequantizeLinearOp21BlockedTest_Int_Succeed({2, 2, 4}, 2, 2, x, x_scale, zero_point, y_2); + DequantizeLinearOp21BlockedTest_Int_Succeed({2, 2, 4}, 2, 3, x, x_scale, zero_point, y_3); + DequantizeLinearOp21BlockedTest_Int4_Succeed({2, 2, 4}, 2, 2, x, x_scale, zero_point, y_2); + DequantizeLinearOp21BlockedTest_Int4_Succeed({2, 2, 4}, 2, 3, x, x_scale, zero_point, y_3); + DequantizeLinearOp21BlockedTest_Int_Succeed({2, 2, 4}, 2, 2, x, x_scale, zero_point, y_2); + DequantizeLinearOp21BlockedTest_Int_Succeed({2, 2, 4}, 2, 3, x, x_scale, zero_point, y_3); + DequantizeLinearOp21BlockedTest_Int_Succeed({2, 2, 4}, 2, 2, x, x_scale, zero_point, y_2); + DequantizeLinearOp21BlockedTest_Int_Succeed({2, 2, 4}, 2, 3, x, x_scale, zero_point, y_3); +} + +TEST(DequantizeLinearOp21BlockedTest, UnsignedInt_UseZeroPoint_LastAxis) { + std::vector x_scale{-2.0, -4.0, 3.5, 1.0, 2.0, 4.0, -3.5, -1.0}; + std::vector zero_point{2, 0, 1, 9, 13, 5, 11, 6}; + std::vector x{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}; + std::vector y_2{4, 2, -8, -12, 10.5, 14, -3, -2, -10, -8, 20, 24, -3.5, -7, -8, -9}; + std::vector y_3{4, 2, 0, -12, 10.5, 14, 17.5, -2, -10, -8, -6, 24, -3.5, -7, -10.5, -9}; + + DequantizeLinearOp21BlockedTest_Int4_Succeed({2, 2, 4}, 2, 2, x, x_scale, zero_point, y_2); + DequantizeLinearOp21BlockedTest_Int4_Succeed({2, 2, 4}, 2, 3, x, x_scale, zero_point, y_3); + DequantizeLinearOp21BlockedTest_Int_Succeed({2, 2, 4}, 2, 2, x, x_scale, zero_point, y_2); + DequantizeLinearOp21BlockedTest_Int_Succeed({2, 2, 4}, 2, 3, x, x_scale, zero_point, y_3); + DequantizeLinearOp21BlockedTest_Int_Succeed({2, 2, 4}, 2, 2, x, x_scale, zero_point, y_2); + DequantizeLinearOp21BlockedTest_Int_Succeed({2, 2, 4}, 2, 3, x, x_scale, zero_point, y_3); + DequantizeLinearOp21BlockedTest_Int4_Succeed({2, 2, 4}, 2, 2, x, x_scale, zero_point, y_2); + DequantizeLinearOp21BlockedTest_Int4_Succeed({2, 2, 4}, 2, 3, x, x_scale, zero_point, y_3); + DequantizeLinearOp21BlockedTest_Int_Succeed({2, 2, 4}, 2, 2, x, x_scale, zero_point, y_2); + DequantizeLinearOp21BlockedTest_Int_Succeed({2, 2, 4}, 2, 3, x, x_scale, zero_point, y_3); + DequantizeLinearOp21BlockedTest_Int_Succeed({2, 2, 4}, 2, 2, x, x_scale, zero_point, y_2); + DequantizeLinearOp21BlockedTest_Int_Succeed({2, 2, 4}, 2, 3, x, x_scale, zero_point, y_3); +} + +#if !defined(DISABLE_FLOAT8_TYPES) +TEST(DequantizeLinearOp21BlockedTest, Float8_NoZeroPoint_FirstAxis) { + constexpr int min_cuda_architecture = 11080; + bool enable_cuda = (nullptr != DefaultCpuExecutionProvider().get()) && HasCudaEnvironment(min_cuda_architecture); + bool enable_cpu = (nullptr != DefaultCpuExecutionProvider().get()); + + std::vector x_scale{-2.0, -4.0, 3.5, 1.0, 2.0, 4.0, -3.5, -1.0}; + std::vector zero_point{}; + std::vector x{-7, -6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, 6, 7, 8}; + std::vector y_2{14.0, 24.0, -17.5, -4.0, 6.0, 8.0, -3.5, 0.0, 2.0, 8.0, -10.5, -4.0, 10.0, 24.0, -24.5, -8.0}; + std::vector y_3{14.0, 24.0, -17.5, -4.0, 6.0, 8.0, -3.5, 0.0, -2.0, -8.0, 10.5, 4.0, 10.0, 24.0, -24.5, -8.0}; + + if (enable_cpu || enable_cuda) { + DequantizeLinearOp21BlockedTest_Float8_Succeed({4, 2, 2}, 0, 2, x, x_scale, 
zero_point, y_2); + DequantizeLinearOp21BlockedTest_Float8_Succeed( + {4, 2, 2}, 0, 2, x, x_scale, zero_point, y_2); + DequantizeLinearOp21BlockedTest_Float8_Succeed({4, 2, 2}, 0, 2, x, x_scale, zero_point, y_2); + DequantizeLinearOp21BlockedTest_Float8_Succeed({4, 2, 2}, 0, 2, x, x_scale, zero_point, y_2); + DequantizeLinearOp21BlockedTest_Float8_Succeed({4, 2, 2}, 0, 3, x, x_scale, zero_point, y_3); + DequantizeLinearOp21BlockedTest_Float8_Succeed( + {4, 2, 2}, 0, 3, x, x_scale, zero_point, y_3); + DequantizeLinearOp21BlockedTest_Float8_Succeed({4, 2, 2}, 0, 3, x, x_scale, zero_point, y_3); + DequantizeLinearOp21BlockedTest_Float8_Succeed({4, 2, 2}, 0, 3, x, x_scale, zero_point, y_3); + } + if (enable_cpu) { + DequantizeLinearOp21BlockedTest_Float8_Succeed({4, 2, 2}, 0, 2, x, x_scale, zero_point, y_2); + DequantizeLinearOp21BlockedTest_Float8_Succeed( + {4, 2, 2}, 0, 2, x, x_scale, zero_point, y_2); + DequantizeLinearOp21BlockedTest_Float8_Succeed({4, 2, 2}, 0, 2, x, x_scale, zero_point, y_2); + DequantizeLinearOp21BlockedTest_Float8_Succeed( + {4, 2, 2}, 0, 2, x, x_scale, zero_point, y_2); + DequantizeLinearOp21BlockedTest_Float8_Succeed({4, 2, 2}, 0, 3, x, x_scale, zero_point, y_3); + DequantizeLinearOp21BlockedTest_Float8_Succeed( + {4, 2, 2}, 0, 3, x, x_scale, zero_point, y_3); + DequantizeLinearOp21BlockedTest_Float8_Succeed({4, 2, 2}, 0, 3, x, x_scale, zero_point, y_3); + DequantizeLinearOp21BlockedTest_Float8_Succeed( + {4, 2, 2}, 0, 3, x, x_scale, zero_point, y_3); + } +} + +TEST(DequantizeLinearOp21BlockedTest, Float8_NoZeroPoint_MiddleAxis) { + constexpr int min_cuda_architecture = 11080; + bool enable_cuda = (nullptr != DefaultCpuExecutionProvider().get()) && HasCudaEnvironment(min_cuda_architecture); + bool enable_cpu = (nullptr != DefaultCpuExecutionProvider().get()); + + std::vector zero_point{}; + std::vector x_scale{-2.0, -4.0, 3.5, 1.0, 2.0, 4.0, -3.5, -1.0}; + std::vector x{-7, -6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, 6, 7, 8}; + std::vector y_2{14, 24, 10, 16, -10.5, -2, -3.5, 0, 2, 8, 6, 16, -17.5, -6, -24.5, -8}; + std::vector y_3{14, 24, 10, 16, 6, 8, -3.5, 0, 2, 8, 6, 16, 10, 24, -24.5, -8}; + + if (enable_cpu || enable_cuda) { + DequantizeLinearOp21BlockedTest_Float8_Succeed({2, 4, 2}, 1, 2, x, x_scale, zero_point, y_2); + DequantizeLinearOp21BlockedTest_Float8_Succeed( + {2, 4, 2}, 1, 2, x, x_scale, zero_point, y_2); + DequantizeLinearOp21BlockedTest_Float8_Succeed({2, 4, 2}, 1, 2, x, x_scale, zero_point, y_2); + DequantizeLinearOp21BlockedTest_Float8_Succeed({2, 4, 2}, 1, 2, x, x_scale, zero_point, y_2); + DequantizeLinearOp21BlockedTest_Float8_Succeed({2, 4, 2}, 1, 3, x, x_scale, zero_point, y_3); + DequantizeLinearOp21BlockedTest_Float8_Succeed( + {2, 4, 2}, 1, 3, x, x_scale, zero_point, y_3); + DequantizeLinearOp21BlockedTest_Float8_Succeed({2, 4, 2}, 1, 3, x, x_scale, zero_point, y_3); + DequantizeLinearOp21BlockedTest_Float8_Succeed({2, 4, 2}, 1, 3, x, x_scale, zero_point, y_3); + } + if (enable_cpu) { + DequantizeLinearOp21BlockedTest_Float8_Succeed({2, 4, 2}, 1, 2, x, x_scale, zero_point, y_2); + DequantizeLinearOp21BlockedTest_Float8_Succeed( + {2, 4, 2}, 1, 2, x, x_scale, zero_point, y_2); + DequantizeLinearOp21BlockedTest_Float8_Succeed({2, 4, 2}, 1, 2, x, x_scale, zero_point, y_2); + DequantizeLinearOp21BlockedTest_Float8_Succeed( + {2, 4, 2}, 1, 2, x, x_scale, zero_point, y_2); + DequantizeLinearOp21BlockedTest_Float8_Succeed({2, 4, 2}, 1, 3, x, x_scale, zero_point, y_3); + DequantizeLinearOp21BlockedTest_Float8_Succeed( + {2, 4, 2}, 1, 3, x, 
x_scale, zero_point, y_3); + DequantizeLinearOp21BlockedTest_Float8_Succeed({2, 4, 2}, 1, 3, x, x_scale, zero_point, y_3); + DequantizeLinearOp21BlockedTest_Float8_Succeed( + {2, 4, 2}, 1, 3, x, x_scale, zero_point, y_3); + } +} + +TEST(DequantizeLinearOp21BlockedTest, Float8_NoZeroPoint_LastAxis) { + constexpr int min_cuda_architecture = 11080; + bool enable_cuda = (nullptr != DefaultCpuExecutionProvider().get()) && HasCudaEnvironment(min_cuda_architecture); + bool enable_cpu = (nullptr != DefaultCpuExecutionProvider().get()); + + std::vector zero_point{}; + std::vector x_scale{-2.0, -4.0, 3.5, 1.0, 2.0, 4.0, -3.5, -1.0}; + std::vector x{-7, -6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, 6, 7, 8}; + std::vector y_2{14, 12, 20, 16, -10.5, -7, -1, 0, 2, 4, 12, 16, -17.5, -21, -7, -8}; + std::vector y_3{14, 12, 10, 16, -10.5, -7, -3.5, 0, 2, 4, 6, 16, -17.5, -21, -24.5, -8}; + + if (enable_cpu || enable_cuda) { + DequantizeLinearOp21BlockedTest_Float8_Succeed({2, 2, 4}, 2, 2, x, x_scale, zero_point, y_2); + DequantizeLinearOp21BlockedTest_Float8_Succeed( + {2, 2, 4}, 2, 2, x, x_scale, zero_point, y_2); + DequantizeLinearOp21BlockedTest_Float8_Succeed({2, 2, 4}, 2, 2, x, x_scale, zero_point, y_2); + DequantizeLinearOp21BlockedTest_Float8_Succeed({2, 2, 4}, 2, 2, x, x_scale, zero_point, y_2); + DequantizeLinearOp21BlockedTest_Float8_Succeed({2, 2, 4}, 2, 3, x, x_scale, zero_point, y_3); + DequantizeLinearOp21BlockedTest_Float8_Succeed( + {2, 2, 4}, 2, 3, x, x_scale, zero_point, y_3); + DequantizeLinearOp21BlockedTest_Float8_Succeed({2, 2, 4}, 2, 3, x, x_scale, zero_point, y_3); + DequantizeLinearOp21BlockedTest_Float8_Succeed({2, 2, 4}, 2, 3, x, x_scale, zero_point, y_3); + } + if (enable_cpu) { + DequantizeLinearOp21BlockedTest_Float8_Succeed({2, 2, 4}, 2, 2, x, x_scale, zero_point, y_2); + DequantizeLinearOp21BlockedTest_Float8_Succeed( + {2, 2, 4}, 2, 2, x, x_scale, zero_point, y_2); + DequantizeLinearOp21BlockedTest_Float8_Succeed({2, 2, 4}, 2, 2, x, x_scale, zero_point, y_2); + DequantizeLinearOp21BlockedTest_Float8_Succeed( + {2, 2, 4}, 2, 2, x, x_scale, zero_point, y_2); + DequantizeLinearOp21BlockedTest_Float8_Succeed({2, 2, 4}, 2, 3, x, x_scale, zero_point, y_3); + DequantizeLinearOp21BlockedTest_Float8_Succeed( + {2, 2, 4}, 2, 3, x, x_scale, zero_point, y_3); + DequantizeLinearOp21BlockedTest_Float8_Succeed({2, 2, 4}, 2, 3, x, x_scale, zero_point, y_3); + DequantizeLinearOp21BlockedTest_Float8_Succeed( + {2, 2, 4}, 2, 3, x, x_scale, zero_point, y_3); + } +} +#endif +} // namespace blocked_dequantization + } // namespace test } // namespace onnxruntime From b374ddd70408c6868eb8418557bf2b2c08fa451e Mon Sep 17 00:00:00 2001 From: Yueqing Zhang Date: Wed, 5 Jun 2024 13:48:04 +0800 Subject: [PATCH 16/26] [VitisAI] add new api for models (#20899) ### Description Add new `tensor_proto_new_*` APIs covering additional element types (i16, u8, u16, u32, u64, double, fp16, bf16). ### Motivation and Context This change is required to satisfy a requirement from Microsoft.
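For illustration only (not part of the patch), here is a minimal sketch of how one of the new helpers declared below might be called. The `std::vector` template arguments (`int64_t` for the shape, `uint16_t` for the data), the include path, and the function and tensor names used here are assumptions inferred from the surrounding diff, not confirmed API details:

```cpp
// Hypothetical usage sketch for the new vaip::tensor_proto_new_u16 helper.
// Assumes it takes (name, shape, data) and returns a heap-allocated
// TensorProto owned by the caller, mirroring the existing
// tensor_proto_new_i8/i32/i64/floats helpers in this file.
#include <cstdint>
#include <string>
#include <vector>

#include "core/providers/vitisai/imp/tensor_proto.h"  // assumed include path

ONNX_NAMESPACE::TensorProto* MakeU16Initializer() {
  const std::string name = "example_u16_tensor";  // hypothetical name
  const std::vector<int64_t> shape{2, 2};         // 2x2 tensor
  const std::vector<uint16_t> data{0, 1, 2, 3};   // row-major payload
  return vaip::tensor_proto_new_u16(name, shape, data);
}
```

A VitisAI plugin would reach the same helper indirectly through the `the_global_api.tensor_proto_new_u16` function pointer wired up in global_api.cc below.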
--------- Co-authored-by: Zhenze Wang --- .../core/providers/vitisai/imp/global_api.cc | 12 +++- .../providers/vitisai/imp/tensor_proto.cc | 59 +++++++++++++++---- .../core/providers/vitisai/imp/tensor_proto.h | 16 +++++ .../vitisai/include/vaip/vaip_ort_api.h | 27 ++++++++- .../python/onnxruntime_pybind_state.cc | 9 ++- 5 files changed, 105 insertions(+), 18 deletions(-) diff --git a/onnxruntime/core/providers/vitisai/imp/global_api.cc b/onnxruntime/core/providers/vitisai/imp/global_api.cc index 00fb8c1578ff4..a963e656c457b 100644 --- a/onnxruntime/core/providers/vitisai/imp/global_api.cc +++ b/onnxruntime/core/providers/vitisai/imp/global_api.cc @@ -356,10 +356,18 @@ vaip_core::OrtApiForVaip* create_org_api_hook() { the_global_api.tensor_proto_get_shape_unsafe = vaip::tensor_proto_get_shape; the_global_api.tensor_proto_data_type = [](const ONNX_NAMESPACE::TensorProto& t) -> int { return t.data_type(); }; the_global_api.tensor_proto_delete = [](ONNX_NAMESPACE::TensorProto* tp) { delete tp; }; - the_global_api.tensor_proto_new_floats = vaip::tensor_proto_new_floats; + the_global_api.tensor_proto_new_i8 = vaip::tensor_proto_new_i8; + the_global_api.tensor_proto_new_i16 = vaip::tensor_proto_new_i16; the_global_api.tensor_proto_new_i32 = vaip::tensor_proto_new_i32; the_global_api.tensor_proto_new_i64 = vaip::tensor_proto_new_i64; - the_global_api.tensor_proto_new_i8 = vaip::tensor_proto_new_i8; + the_global_api.tensor_proto_new_u8 = vaip::tensor_proto_new_u8; + the_global_api.tensor_proto_new_u16 = vaip::tensor_proto_new_u16; + the_global_api.tensor_proto_new_u32 = vaip::tensor_proto_new_u32; + the_global_api.tensor_proto_new_u64 = vaip::tensor_proto_new_u64; + the_global_api.tensor_proto_new_floats = vaip::tensor_proto_new_floats; + the_global_api.tensor_proto_new_doubles = vaip::tensor_proto_new_doubles; + the_global_api.tensor_proto_new_bf16 = vaip::tensor_proto_new_bf16; + the_global_api.tensor_proto_new_fp16 = vaip::tensor_proto_new_fp16; the_global_api.tensor_proto_raw_data_size = [](const auto& tensor) { return tensor.raw_data().size(); }; the_global_api.tensor_proto_as_raw = vaip::tensor_proto_as_raw; the_global_api.tensor_proto_get_name = [](const auto& tensor) -> const std::string& { return tensor.name(); }; diff --git a/onnxruntime/core/providers/vitisai/imp/tensor_proto.cc b/onnxruntime/core/providers/vitisai/imp/tensor_proto.cc index 671d852abb0d6..63aa1daf7e18f 100644 --- a/onnxruntime/core/providers/vitisai/imp/tensor_proto.cc +++ b/onnxruntime/core/providers/vitisai/imp/tensor_proto.cc @@ -50,28 +50,67 @@ static ONNX_NAMESPACE::TensorProto* tensor_proto_new(const std::string& name, co return tensor_proto.release(); } +ONNX_NAMESPACE::TensorProto* tensor_proto_new_i8(const std::string& name, const std::vector& shape, + const std::vector& data) { + return tensor_proto_new(name, shape, ONNX_NAMESPACE::TensorProto_DataType_INT8, + reinterpret_cast(&data[0]), data.size() * sizeof(data[0])); +} + +ONNX_NAMESPACE::TensorProto* tensor_proto_new_i16(const std::string& name, const std::vector& shape, + const std::vector& data) { + return tensor_proto_new(name, shape, ONNX_NAMESPACE::TensorProto_DataType_INT16, + reinterpret_cast(&data[0]), data.size() * sizeof(data[0])); +} ONNX_NAMESPACE::TensorProto* tensor_proto_new_i32(const std::string& name, const std::vector& shape, const std::vector& data) { return tensor_proto_new(name, shape, ONNX_NAMESPACE::TensorProto_DataType_INT32, - reinterpret_cast(&data[0]), data.size() * sizeof(int32_t)); + reinterpret_cast(&data[0]), data.size() * 
sizeof(data[0])); } - ONNX_NAMESPACE::TensorProto* tensor_proto_new_i64(const std::string& name, const std::vector& shape, const std::vector& data) { - return tensor_proto_new(name, shape, ONNX_NAMESPACE::TensorProto_DataType_INT64, - reinterpret_cast(&data[0]), data.size() * sizeof(int64_t)); + return tensor_proto_new(name, shape, ONNX_NAMESPACE::TensorProto_DataType_INT32, + reinterpret_cast(&data[0]), data.size() * sizeof(data[0])); } - -ONNX_NAMESPACE::TensorProto* tensor_proto_new_i8(const std::string& name, const std::vector& shape, - const std::vector& data) { - return tensor_proto_new(name, shape, ONNX_NAMESPACE::TensorProto_DataType_INT8, - reinterpret_cast(&data[0]), data.size() * sizeof(int8_t)); +ONNX_NAMESPACE::TensorProto* tensor_proto_new_u8(const std::string& name, const std::vector& shape, + const std::vector& data) { + return tensor_proto_new(name, shape, ONNX_NAMESPACE::TensorProto_DataType_UINT8, + reinterpret_cast(&data[0]), data.size() * sizeof(data[0])); +} +ONNX_NAMESPACE::TensorProto* tensor_proto_new_u16(const std::string& name, const std::vector& shape, + const std::vector& data) { + return tensor_proto_new(name, shape, ONNX_NAMESPACE::TensorProto_DataType_UINT16, + reinterpret_cast(&data[0]), data.size() * sizeof(data[0])); +} +ONNX_NAMESPACE::TensorProto* tensor_proto_new_u32(const std::string& name, const std::vector& shape, + const std::vector& data) { + return tensor_proto_new(name, shape, ONNX_NAMESPACE::TensorProto_DataType_UINT32, + reinterpret_cast(&data[0]), data.size() * sizeof(data[0])); +} +ONNX_NAMESPACE::TensorProto* tensor_proto_new_u64(const std::string& name, const std::vector& shape, + const std::vector& data) { + return tensor_proto_new(name, shape, ONNX_NAMESPACE::TensorProto_DataType_UINT32, + reinterpret_cast(&data[0]), data.size() * sizeof(data[0])); } ONNX_NAMESPACE::TensorProto* tensor_proto_new_floats(const std::string& name, const std::vector& shape, const std::vector& data) { return tensor_proto_new(name, shape, ONNX_NAMESPACE::TensorProto_DataType_FLOAT, - reinterpret_cast(&data[0]), data.size() * sizeof(float)); + reinterpret_cast(&data[0]), data.size() * sizeof(data[0])); +} +ONNX_NAMESPACE::TensorProto* tensor_proto_new_doubles(const std::string& name, const std::vector& shape, + const std::vector& data) { + return tensor_proto_new(name, shape, ONNX_NAMESPACE::TensorProto_DataType_DOUBLE, + reinterpret_cast(&data[0]), data.size() * sizeof(data[0])); } +ONNX_NAMESPACE::TensorProto* tensor_proto_new_bf16(const std::string& name, const std::vector& shape, + const std::vector& data) { + return tensor_proto_new(name, shape, ONNX_NAMESPACE::TensorProto_DataType_BFLOAT16, + reinterpret_cast(&data[0]), data.size() * sizeof(data[0])); +} +ONNX_NAMESPACE::TensorProto* tensor_proto_new_fp16(const std::string& name, const std::vector& shape, + const std::vector& data) { + return tensor_proto_new(name, shape, ONNX_NAMESPACE::TensorProto_DataType_FLOAT16, + reinterpret_cast(&data[0]), data.size() * sizeof(data[0])); +} } // namespace vaip diff --git a/onnxruntime/core/providers/vitisai/imp/tensor_proto.h b/onnxruntime/core/providers/vitisai/imp/tensor_proto.h index 292905ca734f1..417f9d2f4bf31 100644 --- a/onnxruntime/core/providers/vitisai/imp/tensor_proto.h +++ b/onnxruntime/core/providers/vitisai/imp/tensor_proto.h @@ -11,10 +11,26 @@ vaip_core::DllSafe> tensor_proto_get_shape(const ONNX_NAMES const std::string& tensor_proto_get_name(const ONNX_NAMESPACE::TensorProto& tensor); ONNX_NAMESPACE::TensorProto* tensor_proto_new_i8(const 
std::string& name, const std::vector& shape, const std::vector& data); +ONNX_NAMESPACE::TensorProto* tensor_proto_new_u8(const std::string& name, const std::vector& shape, + const std::vector& data); +ONNX_NAMESPACE::TensorProto* tensor_proto_new_i16(const std::string& name, const std::vector& shape, + const std::vector& data); +ONNX_NAMESPACE::TensorProto* tensor_proto_new_u16(const std::string& name, const std::vector& shape, + const std::vector& data); +ONNX_NAMESPACE::TensorProto* tensor_proto_new_u32(const std::string& name, const std::vector& shape, + const std::vector& data); ONNX_NAMESPACE::TensorProto* tensor_proto_new_i32(const std::string& name, const std::vector& shape, const std::vector& data); +ONNX_NAMESPACE::TensorProto* tensor_proto_new_u64(const std::string& name, const std::vector& shape, + const std::vector& data); ONNX_NAMESPACE::TensorProto* tensor_proto_new_i64(const std::string& name, const std::vector& shape, const std::vector& data); ONNX_NAMESPACE::TensorProto* tensor_proto_new_floats(const std::string& name, const std::vector& shape, const std::vector& data); +ONNX_NAMESPACE::TensorProto* tensor_proto_new_bf16(const std::string& name, const std::vector& shape, + const std::vector& data); +ONNX_NAMESPACE::TensorProto* tensor_proto_new_fp16(const std::string& name, const std::vector& shape, + const std::vector& data); +ONNX_NAMESPACE::TensorProto* tensor_proto_new_doubles(const std::string& name, const std::vector& shape, + const std::vector& data); } // namespace vaip diff --git a/onnxruntime/core/providers/vitisai/include/vaip/vaip_ort_api.h b/onnxruntime/core/providers/vitisai/include/vaip/vaip_ort_api.h index 2c12d26fd2c31..62a7bb602e7e8 100644 --- a/onnxruntime/core/providers/vitisai/include/vaip/vaip_ort_api.h +++ b/onnxruntime/core/providers/vitisai/include/vaip/vaip_ort_api.h @@ -12,7 +12,7 @@ struct OrtApi; namespace vaip_core { -#define VAIP_ORT_API_MAJOR (2u) +#define VAIP_ORT_API_MAJOR (3u) #define VAIP_ORT_API_MINOR (0u) #define VAIP_ORT_API_PATCH (0u) struct OrtApiForVaip { @@ -198,6 +198,31 @@ struct OrtApiForVaip { DllSafe (*get_lib_name)(); // [81] /** new API after 2.0 */ void (*graph_add_initialized_tensor)(Graph& graph, const TensorProto& tensor); // [82] + /** new API after 3.0 */ + TensorProto* (*tensor_proto_new_doubles)( + const std::string& name, const std::vector& shape, + const std::vector& data); // [83] + TensorProto* (*tensor_proto_new_i16)( + const std::string& name, const std::vector& shape, + const std::vector& data); // [84 + TensorProto* (*tensor_proto_new_u16)( + const std::string& name, const std::vector& shape, + const std::vector& data); // [84] + TensorProto* (*tensor_proto_new_u32)( + const std::string& name, const std::vector& shape, + const std::vector& data); // [85] + TensorProto* (*tensor_proto_new_u8)(const std::string& name, + const std::vector& shape, + const std::vector& data); // [86] + TensorProto* (*tensor_proto_new_u64)( + const std::string& name, const std::vector& shape, + const std::vector& data); // [87] + TensorProto* (*tensor_proto_new_fp16)( + const std::string& name, const std::vector& shape, + const std::vector& data); // [88] + TensorProto* (*tensor_proto_new_bf16)( + const std::string& name, const std::vector& shape, + const std::vector& data); // [89] }; #ifndef USE_VITISAI diff --git a/onnxruntime/python/onnxruntime_pybind_state.cc b/onnxruntime/python/onnxruntime_pybind_state.cc index 7f9a6e13d7864..b1784f700d1fa 100644 --- a/onnxruntime/python/onnxruntime_pybind_state.cc +++ 
b/onnxruntime/python/onnxruntime_pybind_state.cc @@ -1066,13 +1066,12 @@ std::unique_ptr CreateExecutionProviderInstance( #endif } else if (type == kVitisAIExecutionProvider) { #ifdef USE_VITISAI + ProviderOptions info{}; const auto it = provider_options_map.find(type); - if (it == provider_options_map.end()) { - LOGS_DEFAULT(FATAL) << "cannot find provider options for VitisAIExecutionProvider"; + if (it != provider_options_map.end()) { + info = it->second; } - const auto& vitis_option_map = it->second; - return onnxruntime::VitisAIProviderFactoryCreator::Create(vitis_option_map) - ->CreateProvider(); + return onnxruntime::VitisAIProviderFactoryCreator::Create(info)->CreateProvider(); #endif } else if (type == kAclExecutionProvider) { #ifdef USE_ACL From 63c13a4811cdf6d65922b7e6c21fe51e2befcc61 Mon Sep 17 00:00:00 2001 From: Yufeng Li Date: Wed, 5 Jun 2024 10:19:26 -0700 Subject: [PATCH 17/26] fix integer overflow in Attention (#20921) ### Description The offset used in Attention has data type int, which can overflow for large sequence lengths. ### Motivation and Context --- .../contrib_ops/cpu/bert/attention_cpu_base.h | 106 ++++++++-------- .../contrib_ops/cpu/bert/gqa_attention_base.h | 114 +++++++++--------- .../test/python/transformers/test_gqa_cpu.py | 1 + 3 files changed, 113 insertions(+), 108 deletions(-) diff --git a/onnxruntime/contrib_ops/cpu/bert/attention_cpu_base.h b/onnxruntime/contrib_ops/cpu/bert/attention_cpu_base.h index 34f57c1655cc2..8ae7b4589d677 100644 --- a/onnxruntime/contrib_ops/cpu/bert/attention_cpu_base.h +++ b/onnxruntime/contrib_ops/cpu/bert/attention_cpu_base.h @@ -69,9 +69,8 @@ class AttentionCPUBase : public AttentionBase { BufferUniquePtr mask_data_buffer(mask_data, BufferDeleter(allocator)); const int32_t* mask_index_data = mask_index != nullptr ? mask_index->Data() : nullptr; - gsl::span mask_index_dims = mask_index != nullptr - ? mask_index->Shape().GetDims() - : gsl::span{}; + gsl::span mask_index_dims = + mask_index != nullptr ? mask_index->Shape().GetDims() : gsl::span{}; const T* past_data = past != nullptr ? past->Data() : nullptr; T* present_data = present != nullptr ? present->MutableData() : nullptr; const T* past_key_data = past_key != nullptr ? past_key->Data() : nullptr; @@ -84,22 +83,19 @@ class AttentionCPUBase : public AttentionBase { relative_position_bias_data = relative_position_bias->Data(); } - ComputeAttentionProbs(static_cast(attention_probs), Q, K, - mask_index_data, mask_index_dims, static_cast(mask_data), causal, - batch_size, sequence_length, kv_sequence_length, past_sequence_length, - qk_head_size == 0 ? v_head_size : qk_head_size, past_data, past_key_data, - present_data, present_key_data, tp, relative_position_bias_data); + ComputeAttentionProbs(static_cast(attention_probs), Q, K, mask_index_data, mask_index_dims, + static_cast(mask_data), causal, batch_size, sequence_length, kv_sequence_length, + past_sequence_length, qk_head_size == 0 ? 
v_head_size : qk_head_size, past_data, + past_key_data, present_data, present_key_data, tp, relative_position_bias_data); // Compute the attentionScore * Value: out_tmp(B, N, S, H_v) = attention_probs(B, N, S, T) x V(B, N, T, H_v) auto out_tmp_data = allocator->Alloc(SafeInt(batch_size) * num_heads_ * sequence_length * v_head_size * sizeof(T)); BufferUniquePtr out_tmp_buffer(out_tmp_data, BufferDeleter(std::move(allocator))); - ComputeVxAttentionScore(output->MutableData(), static_cast(out_tmp_data), - static_cast(attention_probs), V, - batch_size, sequence_length, kv_sequence_length, past_sequence_length, - v_head_size, v_hidden_size, past_data, past_value_data, - present_data, present_value_data, tp); + ComputeVxAttentionScore(output->MutableData(), static_cast(out_tmp_data), static_cast(attention_probs), + V, batch_size, sequence_length, kv_sequence_length, past_sequence_length, v_head_size, + v_hidden_size, past_data, past_value_data, present_data, present_value_data, tp); return Status::OK(); } @@ -138,16 +134,17 @@ class AttentionCPUBase : public AttentionBase { { // mask_data is nullptr when mask_index is nullptr and not unidirectional, otherwise its shape is BxSxT if (mask_data != nullptr) { - PrepareMask(mask_index, mask_index_dims, mask_data, - causal, batch_size, sequence_length, past_sequence_length, mask_filter_value_); + PrepareMask(mask_index, mask_index_dims, mask_data, causal, batch_size, sequence_length, past_sequence_length, + mask_filter_value_); } const int loop_len = batch_size * num_heads_; const float alpha = scale_ == 0.0f ? 1.0f / sqrt(static_cast(head_size)) : scale_; TensorOpCost unit_cost; - const size_t probs_matrix_bytes = SafeInt(sequence_length) * total_sequence_length * sizeof(T); - unit_cost.compute_cycles = static_cast(2 * sequence_length * head_size * total_sequence_length); + const ptrdiff_t probs_matrix_bytes = SafeInt(sequence_length) * total_sequence_length * sizeof(T); + unit_cost.compute_cycles = + static_cast(SafeInt(2) * sequence_length * head_size * total_sequence_length); unit_cost.bytes_loaded = static_cast((sequence_length + total_sequence_length) * head_size * sizeof(T)); unit_cost.bytes_stored = static_cast(probs_matrix_bytes); @@ -172,15 +169,13 @@ class AttentionCPUBase : public AttentionBase { for (std::ptrdiff_t i = begin; i != end; ++i) { const int batch_index = static_cast(i) / num_heads_; - const int output_offset = static_cast(i) * sequence_length * total_sequence_length; - const int mask_offset = batch_index * sequence_length * total_sequence_length; + const ptrdiff_t output_offset = SafeInt(i) * sequence_length * total_sequence_length; + const ptrdiff_t mask_offset = SafeInt(batch_index) * sequence_length * total_sequence_length; T* output = attention_probs + output_offset; // Broadcast mask data: (Bx)SxT -> (BxNx)SxT if (mask_data != nullptr) { - memcpy(output, - mask_data + mask_offset, - probs_matrix_bytes); + memcpy(output, mask_data + mask_offset, probs_matrix_bytes); } const T* k = K + kv_input_chunk_length * i; @@ -197,8 +192,8 @@ class AttentionCPUBase : public AttentionBase { // B: K' (B x N x) T x H (B x N x) H x T H x T // C: attention_probs (B x N x) S x T (B x N x) S x T S x T math::Gemm(CblasNoTrans, CblasTrans, sequence_length, total_sequence_length, head_size, alpha, - Q + q_input_chunk_length * i, k, mask_data != nullptr ? 1.0f : 0.0f, - output, nullptr); + Q + q_input_chunk_length * i, k, mask_data != nullptr ? 
1.0f : 0.0f, output, + nullptr); if (relative_position_bias_data != nullptr) { for (int j = 0; j < sequence_length * total_sequence_length; j++) { @@ -249,8 +244,10 @@ class AttentionCPUBase : public AttentionBase { // The cost of Gemm TensorOpCost unit_cost; - unit_cost.compute_cycles = static_cast(2 * sequence_length * v_head_size * total_sequence_length); - unit_cost.bytes_loaded = static_cast((sequence_length + v_head_size) * total_sequence_length * sizeof(T)); + unit_cost.compute_cycles = + static_cast(SafeInt(2) * sequence_length * v_head_size * total_sequence_length); + unit_cost.bytes_loaded = + static_cast(SafeInt(sequence_length + v_head_size) * total_sequence_length * sizeof(T)); unit_cost.bytes_stored = static_cast(sequence_length * v_head_size * sizeof(T)); if (present || present_value) { @@ -264,35 +261,36 @@ class AttentionCPUBase : public AttentionBase { unit_cost.bytes_loaded += bytes_to_copy_trans_all; unit_cost.bytes_stored += bytes_to_copy_trans_all; - ThreadPool::TryParallelFor(tp, SafeInt(batch_size) * num_heads_, unit_cost, [&](std::ptrdiff_t begin, std::ptrdiff_t end) { - for (std::ptrdiff_t i = begin; i != end; ++i) { - const T* v = V + kv_input_chunk_length * i; - if (nullptr != present) { - // Concatenate past_V and V: (BxNx)PxH_v, (BxNx)LxH_v -> (BxNx)TxH_v - v = ConcatStateChunk(past, v, present, past_chunk_length, present_chunk_length, i); - } else if (nullptr != present_value) { - v = ConcatStateChunk(past_value, v, present_value, past_chunk_length, present_chunk_length, i); - } + ThreadPool::TryParallelFor( + tp, SafeInt(batch_size) * num_heads_, unit_cost, [&](std::ptrdiff_t begin, std::ptrdiff_t end) { + for (std::ptrdiff_t i = begin; i != end; ++i) { + const T* v = V + kv_input_chunk_length * i; + if (nullptr != present) { + // Concatenate past_V and V: (BxNx)PxH_v, (BxNx)LxH_v -> (BxNx)TxH_v + v = ConcatStateChunk(past, v, present, past_chunk_length, present_chunk_length, i); + } else if (nullptr != present_value) { + v = ConcatStateChunk(past_value, v, present_value, past_chunk_length, present_chunk_length, i); + } - T* current_tmp_data = reinterpret_cast(tmp_buffer) + q_input_chunk_length * i; - ptrdiff_t attention_probs_offset = SafeInt(sequence_length) * total_sequence_length * i; - math::MatMul(sequence_length, v_head_size, total_sequence_length, - attention_probs + attention_probs_offset, - v, current_tmp_data, nullptr); - - // Transpose: out(B, S, N, H_v) -> out_tmp(B, N, S, H_v) - const int batch_index = static_cast(i / num_heads_); - const int head_index = static_cast(i % num_heads_); - T* src = current_tmp_data; - ptrdiff_t dest_offset = (SafeInt(batch_index) * sequence_length * num_heads_ + head_index) * v_head_size; - T* dest = output + dest_offset; - for (int j = 0; j < sequence_length; j++) { - memcpy(dest, src, bytes_to_copy_trans); - src += v_head_size; - dest += v_hidden_size; - } - } - }); + T* current_tmp_data = reinterpret_cast(tmp_buffer) + q_input_chunk_length * i; + ptrdiff_t attention_probs_offset = SafeInt(sequence_length) * total_sequence_length * i; + math::MatMul(sequence_length, v_head_size, total_sequence_length, + attention_probs + attention_probs_offset, v, current_tmp_data, nullptr); + + // Transpose: out(B, S, N, H_v) -> out_tmp(B, N, S, H_v) + const int batch_index = static_cast(i / num_heads_); + const int head_index = static_cast(i % num_heads_); + T* src = current_tmp_data; + ptrdiff_t dest_offset = + (SafeInt(batch_index) * sequence_length * num_heads_ + head_index) * v_head_size; + T* dest = output + dest_offset; + 
for (int j = 0; j < sequence_length; j++) { + memcpy(dest, src, bytes_to_copy_trans); + src += v_head_size; + dest += v_hidden_size; + } + } + }); } }; diff --git a/onnxruntime/contrib_ops/cpu/bert/gqa_attention_base.h b/onnxruntime/contrib_ops/cpu/bert/gqa_attention_base.h index fa80efffc9ea1..6b0c5f395cab0 100644 --- a/onnxruntime/contrib_ops/cpu/bert/gqa_attention_base.h +++ b/onnxruntime/contrib_ops/cpu/bert/gqa_attention_base.h @@ -63,17 +63,16 @@ class GQAAttentionBase : public AttentionBase { bool past_present_share_buffer = past_key_data == present_key_data && past_value_data == present_value_data; const T* k = packed_qkv ? Q + num_heads_ * sequence_length * head_size : K; - ComputeAttentionProbs(static_cast(attention_probs), Q, k, - seqlens_k->Data(), - batch_size, sequence_length, seqlen_past_kv_cache, seqlen_present_kv_cache, - head_size, past_key_data, present_key_data, past_present_share_buffer, packed_qkv, tp); + ComputeAttentionProbs(static_cast(attention_probs), Q, k, seqlens_k->Data(), batch_size, + sequence_length, seqlen_past_kv_cache, seqlen_present_kv_cache, head_size, past_key_data, + present_key_data, past_present_share_buffer, packed_qkv, tp); // Compute the attentionScore * Value: out(B, N, S, H_v) = attention_probs(B, N, S, T) x V(B, N, T, H_v) const T* v = packed_qkv ? Q + (num_heads_ + kv_num_heads_) * sequence_length * head_size : V; - ComputeVxAttentionScore(output->MutableData(), static_cast(attention_probs), - v, seqlens_k->Data(), batch_size, sequence_length, seqlen_past_kv_cache, - seqlen_present_kv_cache, head_size, hidden_size, past_value_data, present_value_data, - past_present_share_buffer, packed_qkv, tp); + ComputeVxAttentionScore(output->MutableData(), static_cast(attention_probs), v, seqlens_k->Data(), + batch_size, sequence_length, seqlen_past_kv_cache, seqlen_present_kv_cache, head_size, + hidden_size, past_value_data, present_value_data, past_present_share_buffer, packed_qkv, + tp); return Status::OK(); } @@ -98,7 +97,9 @@ class GQAAttentionBase : public AttentionBase { bool packed_qkv, // whether Q, K, V are packed ThreadPool* tp) const { // thread pool const bool is_prompt = sequence_length != 1; - const int packed_batch_stride = packed_qkv ? (num_heads_ + 2 * kv_num_heads_) * sequence_length * head_size : 0; + const ptrdiff_t packed_batch_stride = + packed_qkv ? SafeInt(num_heads_ + 2 * kv_num_heads_) * sequence_length * head_size + : SafeInt(0); const int kv_num_heads_factor = num_heads_ / kv_num_heads_; const size_t q_input_chunk_length = static_cast(sequence_length) * head_size; // S x H const size_t kv_input_chunk_length = static_cast(sequence_length) * head_size; // L x H @@ -113,9 +114,12 @@ class GQAAttentionBase : public AttentionBase { const float alpha = scale_ == 0.0f ? 
1.0f / sqrt(static_cast(head_size)) : scale_; TensorOpCost unit_cost; - const size_t probs_matrix_bytes = SafeInt(sequence_length) * present_buffer_sequence_length * sizeof(T); - unit_cost.compute_cycles = static_cast(2 * sequence_length * head_size * present_buffer_sequence_length); - unit_cost.bytes_loaded = static_cast((sequence_length + present_buffer_sequence_length) * head_size * sizeof(T)); + const ptrdiff_t probs_matrix_bytes = + SafeInt(sequence_length) * present_buffer_sequence_length * sizeof(T); + unit_cost.compute_cycles = + static_cast(SafeInt(2) * sequence_length * head_size * present_buffer_sequence_length); + unit_cost.bytes_loaded = + static_cast((sequence_length + present_buffer_sequence_length) * head_size * sizeof(T)); unit_cost.bytes_stored = static_cast(probs_matrix_bytes); unit_cost.bytes_loaded += static_cast(probs_matrix_bytes); @@ -131,11 +135,12 @@ class GQAAttentionBase : public AttentionBase { for (std::ptrdiff_t i = begin; i != end; ++i) { const int batch_index = static_cast(i) / num_heads_; const int head_index = static_cast(i) % num_heads_; - const int past_seqlen = sequence_length == 1 ? static_cast(seqlens_k[batch_index]) : past_buffer_sequence_length; + const int past_seqlen = + sequence_length == 1 ? static_cast(seqlens_k[batch_index]) : past_buffer_sequence_length; const size_t past_chunk_length = static_cast(past_seqlen) * head_size; const int total_seqlen = seqlens_k[batch_index] + 1; - const int output_offset = static_cast(i) * sequence_length * present_buffer_sequence_length; + const ptrdiff_t output_offset = SafeInt(i) * sequence_length * present_buffer_sequence_length; T* output = attention_probs + output_offset; const T* k; @@ -161,11 +166,9 @@ class GQAAttentionBase : public AttentionBase { } else { q = Q + q_input_chunk_length * i; } - math::GemmEx(CblasNoTrans, CblasTrans, - sequence_length, total_seqlen, head_size, alpha, - q, head_size, k, head_size, - 0.0f /*bata*/, - output, present_buffer_sequence_length, nullptr); + math::GemmEx(CblasNoTrans, CblasTrans, sequence_length, total_seqlen, head_size, alpha, q, + head_size, k, head_size, 0.0f /*bata*/, output, present_buffer_sequence_length, + nullptr); // compute Softmax T* output_softmax = output; @@ -175,7 +178,8 @@ class GQAAttentionBase : public AttentionBase { for (int total_seq_id = 0; total_seq_id < seq_causal_length - local_window_size_ - 1; total_seq_id++) { output_softmax[total_seq_id] = 0.f; } - ComputeAttentionSoftmaxInplace(output_softmax + seq_causal_length - local_window_size_ - 1, 1, local_window_size_ + 1, nullptr); + ComputeAttentionSoftmaxInplace(output_softmax + seq_causal_length - local_window_size_ - 1, 1, + local_window_size_ + 1, nullptr); } else { ComputeAttentionSoftmaxInplace(output_softmax, 1, seq_causal_length, nullptr); } @@ -208,7 +212,9 @@ class GQAAttentionBase : public AttentionBase { bool packed_qkv, // whether Q, K, V are packed ThreadPool* tp) const { const bool is_prompt = sequence_length != 1; - const int packed_batch_stride = packed_qkv ? (num_heads_ + 2 * kv_num_heads_) * sequence_length * head_size : 0; + const ptrdiff_t packed_batch_stride = + packed_qkv ? 
SafeInt(num_heads_ + 2 * kv_num_heads_) * sequence_length * head_size + : SafeInt(0); const int kv_num_heads_factor = num_heads_ / kv_num_heads_; const int kv_input_chunk_length = sequence_length * head_size; // L x H const size_t past_buff_chunk_length = static_cast(past_buffer_sequence_length) * head_size; // L x H @@ -220,8 +226,10 @@ class GQAAttentionBase : public AttentionBase { // The cost of Gemm TensorOpCost unit_cost; - unit_cost.compute_cycles = static_cast(2 * sequence_length * head_size * present_buffer_sequence_length); - unit_cost.bytes_loaded = static_cast((sequence_length + head_size) * present_buffer_sequence_length * sizeof(T)); + unit_cost.compute_cycles = + static_cast(SafeInt(2) * sequence_length * head_size * present_buffer_sequence_length); + unit_cost.bytes_loaded = static_cast(SafeInt(sequence_length + head_size) * + present_buffer_sequence_length * sizeof(T)); unit_cost.bytes_stored = static_cast(sequence_length * head_size * sizeof(T)); if (present_value) { @@ -235,39 +243,37 @@ class GQAAttentionBase : public AttentionBase { unit_cost.bytes_loaded += bytes_to_copy_trans_all; unit_cost.bytes_stored += bytes_to_copy_trans_all; - ThreadPool::TryParallelFor(tp, SafeInt(batch_size) * num_heads_, unit_cost, [&](std::ptrdiff_t begin, std::ptrdiff_t end) { - for (std::ptrdiff_t i = begin; i != end; ++i) { - const int batch_index = static_cast(i / num_heads_); - const int head_index = static_cast(i % num_heads_); - const int past_seqlen = sequence_length == 1 ? static_cast(seqlens_k[batch_index]) : past_buffer_sequence_length; - const size_t past_chunk_length = static_cast(past_seqlen) * head_size; - const int total_seqlen = seqlens_k[batch_index] + 1; + ThreadPool::TryParallelFor( + tp, SafeInt(batch_size) * num_heads_, unit_cost, [&](std::ptrdiff_t begin, std::ptrdiff_t end) { + for (std::ptrdiff_t i = begin; i != end; ++i) { + const int batch_index = static_cast(i / num_heads_); + const int head_index = static_cast(i % num_heads_); + const int past_seqlen = + sequence_length == 1 ? 
static_cast(seqlens_k[batch_index]) : past_buffer_sequence_length; + const size_t past_chunk_length = static_cast(past_seqlen) * head_size; + const int total_seqlen = seqlens_k[batch_index] + 1; + + const T* v; + if (packed_qkv) { + v = V + packed_batch_stride * batch_index + kv_input_chunk_length * (head_index / kv_num_heads_factor); + } else { + v = V + kv_input_chunk_length * (i / kv_num_heads_factor); + } + if (nullptr != present_value) { + v = ConcatStateChunkGQA(past_value, v, present_value, present_buff_chunk_length, past_buff_chunk_length, + past_chunk_length, kv_input_chunk_length, is_prompt, past_present_share_buffer, + i / kv_num_heads_factor); + } - const T* v; - if (packed_qkv) { - v = V + packed_batch_stride * batch_index + kv_input_chunk_length * (head_index / kv_num_heads_factor); - } else { - v = V + kv_input_chunk_length * (i / kv_num_heads_factor); - } - if (nullptr != present_value) { - v = ConcatStateChunkGQA(past_value, v, present_value, present_buff_chunk_length, past_buff_chunk_length, - past_chunk_length, kv_input_chunk_length, is_prompt, past_present_share_buffer, - i / kv_num_heads_factor); - } + T* output_current = output + (batch_index * sequence_length * num_heads_ + head_index) * head_size; + ptrdiff_t attention_probs_offset = SafeInt(sequence_length) * present_buffer_sequence_length * i; - T* output_current = output + (batch_index * sequence_length * num_heads_ + head_index) * head_size; - ptrdiff_t attention_probs_offset = SafeInt(sequence_length) * present_buffer_sequence_length * i; - - math::GemmEx(CblasNoTrans, - CblasNoTrans, - sequence_length, head_size, total_seqlen, - 1.f, /*alpha*/ - attention_probs + attention_probs_offset, present_buffer_sequence_length, - v, head_size, - 0.0f /*beta*/, - output_current, hidden_size, nullptr); - } - }); + math::GemmEx(CblasNoTrans, CblasNoTrans, sequence_length, head_size, total_seqlen, + 1.f, /*alpha*/ + attention_probs + attention_probs_offset, present_buffer_sequence_length, v, + head_size, 0.0f /*beta*/, output_current, hidden_size, nullptr); + } + }); } }; diff --git a/onnxruntime/test/python/transformers/test_gqa_cpu.py b/onnxruntime/test/python/transformers/test_gqa_cpu.py index 4df1ac1cc2b7e..b6b8aee15852f 100644 --- a/onnxruntime/test/python/transformers/test_gqa_cpu.py +++ b/onnxruntime/test/python/transformers/test_gqa_cpu.py @@ -1775,6 +1775,7 @@ def test_gqa_no_past(self): (2000, 2000), (200, 200), (240, 240), + (8000, 8000), ] ) num_h = [(32, 8), (9, 3), (4, 4)] if pipeline_mode else [(6, 6), (6, 3), (9, 9), (9, 3)] From 4cb23b020c87c0577a6672ef4775d36113a8a6b1 Mon Sep 17 00:00:00 2001 From: Chip Kerchner <49959681+ChipKerchner@users.noreply.github.com> Date: Wed, 5 Jun 2024 17:24:22 -0400 Subject: [PATCH 18/26] Improvements to the INT8 GEMM portion of the code for Power (#20595) These are changes to improve the GEMM portion of the code for Power. There are 3 main code changes: 1) Change the signedness flag from a runtime function parameter to a template parameter so that operations that add/subtract zero are eliminated at compile time, and reuse a vector that holds the mask instead of rebuilding it each time (see the sketch below). 2) Process 16 columns at a time in MlasGemmQuantCopyPackB8x8 - this should reduce potential page faults by a factor of 4 and also be faster. 3) Unroll MlasQgemmStoreVectorMMA and vectorize other variables. 
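To make change (1) concrete, the following is a minimal scalar sketch (not the actual MLAS vector code) of why hoisting the signedness flag into a template parameter removes the bias adjustment at compile time; the helper names and the plain loop are hypothetical stand-ins for the vectorized packing routines in this patch:

```cpp
// Scalar analogue of templating MlasGemmQuantCopyPackA8x8 on AIsSigned:
// when AIsSigned is true, Flip is 0 and the subtraction folds away entirely
// instead of being guarded by a runtime bool inside the packing loop.
#include <cstddef>
#include <cstdint>

template <bool AIsSigned>
void PackWithOptionalFlip(const uint8_t* src, uint8_t* dst, size_t n) {
    constexpr uint8_t Flip = AIsSigned ? 0 : 0x80;  // known at compile time
    for (size_t i = 0; i < n; ++i) {
        dst[i] = static_cast<uint8_t>(src[i] - Flip);  // no-op when Flip == 0
    }
}

// The flag is resolved once at the dispatch point, much as MlasGemmQuantCopyPackA
// does when it selects the signed or unsigned instantiation.
void Dispatch(const uint8_t* src, uint8_t* dst, size_t n, bool a_is_signed) {
    if (a_is_signed) {
        PackWithOptionalFlip<true>(src, dst, n);
    } else {
        PackWithOptionalFlip<false>(src, dst, n);
    }
}
```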
--- .../mlas/lib/power/qgemm_kernel_power10.cpp | 590 +++++++++++------- 1 file changed, 381 insertions(+), 209 deletions(-) diff --git a/onnxruntime/core/mlas/lib/power/qgemm_kernel_power10.cpp b/onnxruntime/core/mlas/lib/power/qgemm_kernel_power10.cpp index 633349e800875..a67be1dbfa710 100644 --- a/onnxruntime/core/mlas/lib/power/qgemm_kernel_power10.cpp +++ b/onnxruntime/core/mlas/lib/power/qgemm_kernel_power10.cpp @@ -67,7 +67,7 @@ MlasGemmQuantFixupZeroPointB( } -template +template void MlasGemmQuantCopyPackA8x8( MLAS_GEMM_QUANT_KERNEL_POWER10::PackedAType* D, @@ -75,11 +75,10 @@ MlasGemmQuantCopyPackA8x8( size_t lda, size_t CountM, size_t CountK, - int32_t* RowSumBuffer, - bool AIsSigned + int32_t* RowSumBuffer ) { - const uint8_t Flip = (AIsSigned ? 0 : 0x80); + constexpr uint8_t Flip = (AIsSigned ? 0 : 0x80); Vtype vmask = reinterpret_cast(vec_splats(Flip)); typedef __vector signed char vec_t; @@ -106,66 +105,74 @@ MlasGemmQuantCopyPackA8x8( Vtype a3 = *reinterpret_cast(&a[lda * 2]); Vtype a4 = *reinterpret_cast(&a[lda * 3]); Vtype vx = - reinterpret_cast(vec_mergee (reinterpret_cast<__vector int>(a1), + reinterpret_cast(vec_mergee(reinterpret_cast<__vector int>(a1), reinterpret_cast<__vector int>(a2))); Vtype vx1 = - reinterpret_cast(vec_mergee (reinterpret_cast<__vector int>(a3), + reinterpret_cast(vec_mergee(reinterpret_cast<__vector int>(a3), reinterpret_cast<__vector int>(a4))); Vtype vx2 = - reinterpret_cast(vec_mergeo (reinterpret_cast<__vector int>(a1), + reinterpret_cast(vec_mergeo(reinterpret_cast<__vector int>(a1), reinterpret_cast<__vector int>(a2))); Vtype vx3 = - reinterpret_cast(vec_mergeo (reinterpret_cast<__vector int>(a3), + reinterpret_cast(vec_mergeo(reinterpret_cast<__vector int>(a3), reinterpret_cast<__vector int>(a4))); - Vtype vx4 = vec_xxpermdi (vx, vx1, 0); - Vtype vx5 = vec_xxpermdi (vx2, vx3, 0); - Vtype vx6 = vec_xxpermdi (vx, vx1, 3); - Vtype vx7 = vec_xxpermdi (vx2, vx3, 3); + Vtype vx4 = vec_xxpermdi(vx, vx1, 0); + Vtype vx5 = vec_xxpermdi(vx2, vx3, 0); + Vtype vx6 = vec_xxpermdi(vx, vx1, 3); + Vtype vx7 = vec_xxpermdi(vx2, vx3, 3); a1 = *reinterpret_cast(&a[lda*4]); a2 = *reinterpret_cast(&a[lda*5]); a3 = *reinterpret_cast(&a[lda*6]); a4 = *reinterpret_cast(&a[lda*7]); vx = - reinterpret_cast(vec_mergee (reinterpret_cast<__vector int>(a1), + reinterpret_cast(vec_mergee(reinterpret_cast<__vector int>(a1), reinterpret_cast<__vector int>(a2))); vx1 = - reinterpret_cast(vec_mergee (reinterpret_cast<__vector int>(a3), + reinterpret_cast(vec_mergee(reinterpret_cast<__vector int>(a3), reinterpret_cast<__vector int>(a4))); vx2 = - reinterpret_cast(vec_mergeo (reinterpret_cast<__vector int>(a1), + reinterpret_cast(vec_mergeo(reinterpret_cast<__vector int>(a1), reinterpret_cast<__vector int>(a2))); vx3 = - reinterpret_cast(vec_mergeo (reinterpret_cast<__vector int>(a3), + reinterpret_cast(vec_mergeo(reinterpret_cast<__vector int>(a3), reinterpret_cast<__vector int>(a4))); - Vtype vx8 = vec_xxpermdi (vx, vx1, 0); - Vtype vx9 = vec_xxpermdi (vx2, vx3, 0); - Vtype vx10 = vec_xxpermdi (vx, vx1, 3); - Vtype vx11 = vec_xxpermdi (vx2, vx3, 3); + Vtype vx8 = vec_xxpermdi(vx, vx1, 0); + Vtype vx9 = vec_xxpermdi(vx2, vx3, 0); + Vtype vx10 = vec_xxpermdi(vx, vx1, 3); + Vtype vx11 = vec_xxpermdi(vx2, vx3, 3); vec_t vxx = - reinterpret_cast(vec_sub (vx4, vmask)); - vsum = vec_sum4s (vxx, vsum); + AIsSigned ? 
reinterpret_cast(vx4) : + reinterpret_cast(vec_sub(vx4, vmask)); + vsum = vec_sum4s(vxx, vsum); *reinterpret_cast(&D[0]) = vxx; - vxx = reinterpret_cast(vec_sub (vx5, vmask)); - vsum = vec_sum4s (vxx, vsum); + vxx = AIsSigned ? reinterpret_cast(vx5) : + reinterpret_cast(vec_sub(vx5, vmask)); + vsum = vec_sum4s(vxx, vsum); *reinterpret_cast(&D[16]) = vxx; - vxx = reinterpret_cast(vec_sub (vx6, vmask)); - vsum = vec_sum4s (vxx, vsum); + vxx = AIsSigned ? reinterpret_cast(vx6) : + reinterpret_cast(vec_sub(vx6, vmask)); + vsum = vec_sum4s(vxx, vsum); *reinterpret_cast(&D[32]) = vxx; - vxx = reinterpret_cast(vec_sub (vx7, vmask)); - vsum = vec_sum4s (vxx, vsum); + vxx = AIsSigned ? reinterpret_cast(vx7) : + reinterpret_cast(vec_sub(vx7, vmask)); + vsum = vec_sum4s(vxx, vsum); *reinterpret_cast(&D[48]) = vxx; - vxx = reinterpret_cast(vec_sub (vx8, vmask)); + vxx = AIsSigned ? reinterpret_cast(vx8) : + reinterpret_cast(vec_sub(vx8, vmask)); *reinterpret_cast(&D[64]) = vxx; - vsum2 = vec_sum4s (vxx, vsum2); - vxx = reinterpret_cast(vec_sub (vx9, vmask)); + vsum2 = vec_sum4s(vxx, vsum2); + vxx = AIsSigned ? reinterpret_cast(vx9) : + reinterpret_cast(vec_sub(vx9, vmask)); *reinterpret_cast(&D[80]) = vxx; - vsum2 = vec_sum4s (vxx, vsum2); - vxx = reinterpret_cast(vec_sub (vx10, vmask)); + vsum2 = vec_sum4s(vxx, vsum2); + vxx = AIsSigned ? reinterpret_cast(vx10) : + reinterpret_cast(vec_sub(vx10, vmask)); *reinterpret_cast(&D[96]) = vxx; - vsum2 = vec_sum4s (vxx, vsum2); - vxx = reinterpret_cast(vec_sub (vx11, vmask)); + vsum2 = vec_sum4s(vxx, vsum2); + vxx = AIsSigned ? reinterpret_cast(vx11) : + reinterpret_cast(vec_sub(vx11, vmask)); *reinterpret_cast(&D[112]) = vxx; - vsum2 = vec_sum4s (vxx, vsum2); + vsum2 = vec_sum4s(vxx, vsum2); D += 16 * 8; a += 16; y -= 16; @@ -179,16 +186,18 @@ MlasGemmQuantCopyPackA8x8( int a4 = *reinterpret_cast(&a[lda*3]); __vector int vx1 = { a1, a2, a3, a4}; vec_t vx = - reinterpret_cast(vec_sub (reinterpret_cast(vx1), vmask)); - vsum = vec_sum4s (vx, vsum); + AIsSigned ? reinterpret_cast(vx1) : + reinterpret_cast(vec_sub(reinterpret_cast(vx1), vmask)); + vsum = vec_sum4s(vx, vsum); *reinterpret_cast(&D[0]) = vx; a1 = *reinterpret_cast(&a[lda*4]); a2 = *reinterpret_cast(&a[lda*5]); a3 = *reinterpret_cast(&a[lda*6]); a4 = *reinterpret_cast(&a[lda*7]); __vector int vx2 = { a1, a2, a3, a4}; - vx = reinterpret_cast(vec_sub (reinterpret_cast(vx2), vmask)); - vsum2 = vec_sum4s (vx, vsum2); + vx = AIsSigned ? 
reinterpret_cast(vx2) : + reinterpret_cast(vec_sub(reinterpret_cast(vx2), vmask)); + vsum2 = vec_sum4s(vx, vsum2); if (CountK & 3) { if (yval >= 12) { *reinterpret_cast(&D[64]) = vx; @@ -225,10 +234,10 @@ MlasGemmQuantCopyPackA8x8( } if (y >= 1) { - Vtype a1 = reinterpret_cast(vec_splats(Flip)); - Vtype a2 = reinterpret_cast(vec_splats(Flip)); - Vtype a3 = reinterpret_cast(vec_splats(Flip)); - Vtype a4 = reinterpret_cast(vec_splats(Flip)); + Vtype a1 = vmask; + Vtype a2 = vmask; + Vtype a3 = vmask; + Vtype a4 = vmask; a1[0] = a[0]; a2[0] = a[lda]; a3[0] = a[lda * 2]; @@ -246,20 +255,21 @@ MlasGemmQuantCopyPackA8x8( a4[2] = a[lda * 3 + 2]; } Vtype vx = - reinterpret_cast(vec_mergee (reinterpret_cast<__vector int>(a1), + reinterpret_cast(vec_mergee(reinterpret_cast<__vector int>(a1), reinterpret_cast<__vector int>(a2))); Vtype vx1 = - reinterpret_cast(vec_mergee (reinterpret_cast<__vector int>(a3), + reinterpret_cast(vec_mergee(reinterpret_cast<__vector int>(a3), reinterpret_cast<__vector int>(a4))); - Vtype vx2 = vec_xxpermdi (vx, vx1, 0); + Vtype vx2 = vec_xxpermdi(vx, vx1, 0); vec_t vx3 = - reinterpret_cast(vec_sub (vx2, vmask)); - vsum = vec_sum4s (vx3, vsum); + AIsSigned ? reinterpret_cast(vx2) : + reinterpret_cast(vec_sub(vx2, vmask)); + vsum = vec_sum4s(vx3, vsum); *reinterpret_cast(&D[0]) = vx3; - a1 = reinterpret_cast(vec_splats(Flip)); - a2 = reinterpret_cast(vec_splats(Flip)); - a3 = reinterpret_cast(vec_splats(Flip)); - a4 = reinterpret_cast(vec_splats(Flip)); + a1 = vmask; + a2 = vmask; + a3 = vmask; + a4 = vmask; a1[0] = a[lda * 4]; a2[0] = a[lda * 5]; a3[0] = a[lda * 6]; @@ -277,14 +287,15 @@ MlasGemmQuantCopyPackA8x8( a4[2] = a[lda * 7 + 2]; } vx = - reinterpret_cast(vec_mergee (reinterpret_cast<__vector int>(a1), + reinterpret_cast(vec_mergee(reinterpret_cast<__vector int>(a1), reinterpret_cast<__vector int>(a2))); vx1 = - reinterpret_cast(vec_mergee (reinterpret_cast<__vector int>(a3), + reinterpret_cast(vec_mergee(reinterpret_cast<__vector int>(a3), reinterpret_cast<__vector int>(a4))); - vx2 = vec_xxpermdi (vx, vx1, 0); - vx3 = reinterpret_cast(vec_sub (vx2, vmask)); - vsum2 = vec_sum4s (vx3, vsum2); + vx2 = vec_xxpermdi(vx, vx1, 0); + vx3 = AIsSigned ? 
reinterpret_cast(vx2) : + reinterpret_cast(vec_sub(vx2, vmask)); + vsum2 = vec_sum4s(vx3, vsum2); if (CountK % 16 >= 12) { *reinterpret_cast(&D[64]) = vx3; D += 80; @@ -327,34 +338,38 @@ MlasGemmQuantCopyPackA8x8( Vtype a3 = *reinterpret_cast(&a[lda * 2]); Vtype a4 = *reinterpret_cast(&a[lda * 3]); Vtype vx = - reinterpret_cast(vec_mergee (reinterpret_cast<__vector int>(a1), + reinterpret_cast(vec_mergee(reinterpret_cast<__vector int>(a1), reinterpret_cast<__vector int>(a2))); Vtype vx1 = - reinterpret_cast(vec_mergee (reinterpret_cast<__vector int>(a3), + reinterpret_cast(vec_mergee(reinterpret_cast<__vector int>(a3), reinterpret_cast<__vector int>(a4))); Vtype vx2 = - reinterpret_cast(vec_mergeo (reinterpret_cast<__vector int>(a1), + reinterpret_cast(vec_mergeo(reinterpret_cast<__vector int>(a1), reinterpret_cast<__vector int>(a2))); Vtype vx3 = - reinterpret_cast(vec_mergeo (reinterpret_cast<__vector int>(a3), + reinterpret_cast(vec_mergeo(reinterpret_cast<__vector int>(a3), reinterpret_cast<__vector int>(a4))); - Vtype vx4 = vec_xxpermdi (vx, vx1, 0); - Vtype vx5 = vec_xxpermdi (vx2, vx3, 0); - Vtype vx6 = vec_xxpermdi (vx, vx1, 3); - Vtype vx7 = vec_xxpermdi (vx2, vx3, 3); + Vtype vx4 = vec_xxpermdi(vx, vx1, 0); + Vtype vx5 = vec_xxpermdi(vx2, vx3, 0); + Vtype vx6 = vec_xxpermdi(vx, vx1, 3); + Vtype vx7 = vec_xxpermdi(vx2, vx3, 3); vec_t vx0 = - reinterpret_cast(vec_sub (vx4, vmask)); + AIsSigned ? reinterpret_cast(vx4) : + reinterpret_cast(vec_sub(vx4, vmask)); *reinterpret_cast(&D[0]) = vx0; - vsum = vec_sum4s (vx0, vsum); - vx0 = reinterpret_cast(vec_sub (vx5, vmask)); + vsum = vec_sum4s(vx0, vsum); + vx0 = AIsSigned ? reinterpret_cast(vx5) : + reinterpret_cast(vec_sub(vx5, vmask)); *reinterpret_cast(&D[16]) = vx0; - vsum = vec_sum4s (vx0, vsum); - vx0 = reinterpret_cast(vec_sub (vx6, vmask)); + vsum = vec_sum4s(vx0, vsum); + vx0 = AIsSigned ? reinterpret_cast(vx6) : + reinterpret_cast(vec_sub(vx6, vmask)); *reinterpret_cast(&D[32]) = vx0; - vsum = vec_sum4s (vx0, vsum); - vx0 = reinterpret_cast(vec_sub (vx7, vmask)); + vsum = vec_sum4s(vx0, vsum); + vx0 = AIsSigned ? reinterpret_cast(vx7) : + reinterpret_cast(vec_sub(vx7, vmask)); *reinterpret_cast(&D[48]) = vx0; - vsum = vec_sum4s (vx0, vsum); + vsum = vec_sum4s(vx0, vsum); D += 16 * 4; a += 16; y -= 16; @@ -367,16 +382,17 @@ MlasGemmQuantCopyPackA8x8( int a4 = *reinterpret_cast(&a[lda*3]); __vector int vx1 = { a1, a2, a3, a4}; vec_t vx = - reinterpret_cast(vec_sub (reinterpret_cast(vx1), vmask)); + AIsSigned ? reinterpret_cast(vx1) : + reinterpret_cast(vec_sub(reinterpret_cast(vx1), vmask)); *reinterpret_cast(&D[0]) = vx; - vsum = vec_sum4s (vx, vsum); + vsum = vec_sum4s(vx, vsum); D += 16; a += 4; y -= 4; } if (y >= 1) { - Vtype vx = reinterpret_cast(vec_splats(Flip)); + Vtype vx = vmask; vx[0] = a[0]; vx[4] = a[lda]; vx[8] = a[lda * 2]; @@ -394,9 +410,10 @@ MlasGemmQuantCopyPackA8x8( vx[14] = a[lda * 3 + 2]; } vec_t vx1 = - reinterpret_cast(vec_sub (vx, vmask)); + AIsSigned ? 
reinterpret_cast(vx) : + reinterpret_cast(vec_sub(vx, vmask)); *reinterpret_cast(&D[0]) = vx1; - vsum = vec_sum4s (vx1, vsum); + vsum = vec_sum4s(vx1, vsum); D += 16; a += 16; } @@ -416,9 +433,9 @@ MlasGemmQuantCopyPackA8x8( __vector signed int vsum = { 0 }; while (y >= 16) { - Vtype a4 = reinterpret_cast(vec_splats(Flip)); - Vtype a2 = reinterpret_cast(vec_splats(Flip)); - Vtype a3 = reinterpret_cast(vec_splats(Flip)); + Vtype a4 = vmask; + Vtype a2 = vmask; + Vtype a3 = vmask; Vtype a1 = *reinterpret_cast(&a[0]); if (CountM == 3) { a3 = *reinterpret_cast(&a[lda * 2]); @@ -427,53 +444,58 @@ MlasGemmQuantCopyPackA8x8( a2 = *reinterpret_cast(&a[lda]); } Vtype vx = - reinterpret_cast(vec_mergee (reinterpret_cast<__vector int>(a1), + reinterpret_cast(vec_mergee(reinterpret_cast<__vector int>(a1), reinterpret_cast<__vector int>(a2))); Vtype vx1 = - reinterpret_cast(vec_mergee (reinterpret_cast<__vector int>(a3), + reinterpret_cast(vec_mergee(reinterpret_cast<__vector int>(a3), reinterpret_cast<__vector int>(a4))); Vtype vx2 = - reinterpret_cast(vec_mergeo (reinterpret_cast<__vector int>(a1), + reinterpret_cast(vec_mergeo(reinterpret_cast<__vector int>(a1), reinterpret_cast<__vector int>(a2))); Vtype vx3 = - reinterpret_cast(vec_mergeo (reinterpret_cast<__vector int>(a3), + reinterpret_cast(vec_mergeo(reinterpret_cast<__vector int>(a3), reinterpret_cast<__vector int>(a4))); - Vtype vx4 = vec_xxpermdi (vx, vx1, 0); - Vtype vx5 = vec_xxpermdi (vx2, vx3, 0); - Vtype vx6 = vec_xxpermdi (vx, vx1, 3); - Vtype vx7 = vec_xxpermdi (vx2, vx3, 3); + Vtype vx4 = vec_xxpermdi(vx, vx1, 0); + Vtype vx5 = vec_xxpermdi(vx2, vx3, 0); + Vtype vx6 = vec_xxpermdi(vx, vx1, 3); + Vtype vx7 = vec_xxpermdi(vx2, vx3, 3); vec_t vx0 = - reinterpret_cast(vec_sub (vx4, vmask)); + AIsSigned ? reinterpret_cast(vx4) : + reinterpret_cast(vec_sub(vx4, vmask)); *reinterpret_cast(&D[0]) = vx0; - vsum = vec_sum4s (vx0, vsum); - vx0 = reinterpret_cast(vec_sub (vx5, vmask)); + vsum = vec_sum4s(vx0, vsum); + vx0 = AIsSigned ? reinterpret_cast(vx5) : + reinterpret_cast(vec_sub(vx5, vmask)); *reinterpret_cast(&D[16]) = vx0; - vsum = vec_sum4s (vx0, vsum); - vx0 = reinterpret_cast(vec_sub (vx6, vmask)); + vsum = vec_sum4s(vx0, vsum); + vx0 = AIsSigned ? reinterpret_cast(vx6) : + reinterpret_cast(vec_sub(vx6, vmask)); *reinterpret_cast(&D[32]) = vx0; - vsum = vec_sum4s (vx0, vsum); - vx0 = reinterpret_cast(vec_sub (vx7, vmask)); + vsum = vec_sum4s(vx0, vsum); + vx0 = AIsSigned ? reinterpret_cast(vx7) : + reinterpret_cast(vec_sub(vx7, vmask)); *reinterpret_cast(&D[48]) = vx0; - vsum = vec_sum4s (vx0, vsum); + vsum = vec_sum4s(vx0, vsum); D += 16 * 4; a += 16; y -= 16; } while (y >= 4) { - Vtype vb = reinterpret_cast(vec_splats(Flip)); + Vtype vb = vmask; __vector int vx1 = reinterpret_cast<__vector int>(vb); vx1[0] = *reinterpret_cast(&a[0]); - if(CountM >= 2) { + if (CountM >= 2) { vx1[1] = *reinterpret_cast(&a[lda]); } - if(CountM >= 3) { + if (CountM >= 3) { vx1[2] = *reinterpret_cast(&a[lda*2]); } vec_t vx = - reinterpret_cast(vec_sub (reinterpret_cast(vx1), vmask)); + AIsSigned ? 
reinterpret_cast(vx1) : + reinterpret_cast(vec_sub(reinterpret_cast(vx1), vmask)); *reinterpret_cast(&D[0]) = vx; - vsum = vec_sum4s (vx, vsum); + vsum = vec_sum4s(vx, vsum); D += 16; a += 4; y -= 4; @@ -508,7 +530,7 @@ MlasGemmQuantCopyPackA8x8( } } *reinterpret_cast(&D[0]) = vx; - vsum = vec_sum4s (vx, vsum); + vsum = vec_sum4s(vx, vsum); D += 16; } *RowSumBuffer++ = vsum[0]; @@ -521,7 +543,7 @@ MlasGemmQuantCopyPackA8x8( } } -template +template void MlasGemmQuantCopyPackB8x8( MLAS_GEMM_QUANT_KERNEL_POWER10::PackedBType* D, @@ -529,29 +551,128 @@ MlasGemmQuantCopyPackB8x8( size_t ldb, size_t CountN, size_t CountK, - int32_t* ColumnSumBuffer, - bool BIsSigned + int32_t* ColumnSumBuffer ) { - const uint8_t BitFlipValue = (BIsSigned ? 0x80 : 0); + [[maybe_unused]] constexpr uint8_t BitFlipValue = (BIsSigned ? 0x80 : 0); typedef __vector unsigned char vec_t; Vtype vmask = reinterpret_cast(vec_splats(BitFlipValue)); vec_t mask = {0,4,8,12,1,5,9,13,2,6,10,14,3,7,11,15}; - const int8_t Flip = (BIsSigned ? -128 : 0); - // Process 4 columns of matrix B in a loop. - // // Copy columns from matrix B to the packed buffer. Signed buffers are // converted to unsigned buffers in order to share a common kernel. // // If CountK is not aligned to a multiple of four, then the packed buffer // is padded with zero vectors. - while (CountN >= 4) { + // Process 16 columns of matrix B in a loop. + // + size_t PackedK = ((CountK + 4 - 1) / 4) * 16; + size_t k2 = PackedK; + size_t k3 = PackedK*2; + size_t k4 = PackedK*3; + + while (CountN >= 16) { const uint8_t* b = B; __vector unsigned int vsum = {0}; + __vector unsigned int vsum2 = {0}; + __vector unsigned int vsum3 = {0}; + __vector unsigned int vsum4 = {0}; size_t y = CountK; - if(y >= 4) { + if (y >= 4) { + do { + Vtype b1 = *reinterpret_cast(&b[0]); + Vtype b2 = *reinterpret_cast(&b[ldb]); + Vtype b3 = *reinterpret_cast(&b[ldb*2]); + Vtype b4 = *reinterpret_cast(&b[ldb*3]); + Vtype t1 = vec_mergeh(b1, b3); + Vtype t2 = vec_mergel(b1, b3); + Vtype t3 = vec_mergeh(b2, b4); + Vtype t4 = vec_mergel(b2, b4); + b1 = vec_mergeh(t1, t3); + b2 = vec_mergel(t1, t3); + b3 = vec_mergeh(t2, t4); + b4 = vec_mergel(t2, t4); + vec_t vx1 = BIsSigned ? reinterpret_cast(vec_add(b1, vmask)) : + reinterpret_cast(b1); + vec_t vx2 = BIsSigned ? reinterpret_cast(vec_add(b2, vmask)) : + reinterpret_cast(b2); + vec_t vx3 = BIsSigned ? reinterpret_cast(vec_add(b3, vmask)) : + reinterpret_cast(b3); + vec_t vx4 = BIsSigned ? reinterpret_cast(vec_add(b4, vmask)) : + reinterpret_cast(b4); + *reinterpret_cast(&D[0]) = vx1; + *reinterpret_cast(&D[k2]) = vx2; + *reinterpret_cast(&D[k3]) = vx3; + *reinterpret_cast(&D[k4]) = vx4; + vsum = vec_sum4s(vx1, vsum); + vsum2 = vec_sum4s(vx2, vsum2); + vsum3 = vec_sum4s(vx3, vsum3); + vsum4 = vec_sum4s(vx4, vsum4); + D += 16; + b += ldb*4; + y -= 4; + } while (y >= 4); + } + if (y >= 1) { + Vtype b1 = *reinterpret_cast(&b[0]); + Vtype b2 = (y >= 2) ? *reinterpret_cast(&b[ldb]) : vmask; + Vtype b3 = (y >= 3) ? *reinterpret_cast(&b[ldb*2]) : vmask; + Vtype b4 = vmask; + Vtype t1 = vec_mergeh(b1, b3); + Vtype t2 = vec_mergel(b1, b3); + Vtype t3 = vec_mergeh(b2, b4); + Vtype t4 = vec_mergel(b2, b4); + b1 = vec_mergeh(t1, t3); + b2 = vec_mergel(t1, t3); + b3 = vec_mergeh(t2, t4); + b4 = vec_mergel(t2, t4); + vec_t vx1 = BIsSigned ? reinterpret_cast(vec_add(b1, vmask)) : + reinterpret_cast(b1); + vec_t vx2 = BIsSigned ? reinterpret_cast(vec_add(b2, vmask)) : + reinterpret_cast(b2); + vec_t vx3 = BIsSigned ? 
reinterpret_cast(vec_add(b3, vmask)) : + reinterpret_cast(b3); + vec_t vx4 = BIsSigned ? reinterpret_cast(vec_add(b4, vmask)) : + reinterpret_cast(b4); + *reinterpret_cast(&D[0]) = vx1; + *reinterpret_cast(&D[k2]) = vx2; + *reinterpret_cast(&D[k3]) = vx3; + *reinterpret_cast(&D[k4]) = vx4; + vsum = vec_sum4s(vx1, vsum); + vsum2 = vec_sum4s(vx2, vsum2); + vsum3 = vec_sum4s(vx3, vsum3); + vsum4 = vec_sum4s(vx4, vsum4); + D += 16; + } + *ColumnSumBuffer++ = vsum[0]; + *ColumnSumBuffer++ = vsum[1]; + *ColumnSumBuffer++ = vsum[2]; + *ColumnSumBuffer++ = vsum[3]; + *ColumnSumBuffer++ = vsum2[0]; + *ColumnSumBuffer++ = vsum2[1]; + *ColumnSumBuffer++ = vsum2[2]; + *ColumnSumBuffer++ = vsum2[3]; + *ColumnSumBuffer++ = vsum3[0]; + *ColumnSumBuffer++ = vsum3[1]; + *ColumnSumBuffer++ = vsum3[2]; + *ColumnSumBuffer++ = vsum3[3]; + *ColumnSumBuffer++ = vsum4[0]; + *ColumnSumBuffer++ = vsum4[1]; + *ColumnSumBuffer++ = vsum4[2]; + *ColumnSumBuffer++ = vsum4[3]; + B += 16; + CountN -= 16; + D += k4; + } + + // Process four columns of matrix B in a loop. + // + while (CountN >= 4) { + const uint8_t* b = B; + __vector unsigned int vsum = {0}; + size_t y = CountK; + if (y >= 4) { do { int b1 = *reinterpret_cast(&b[0]); int b2 = *reinterpret_cast(&b[ldb]); @@ -559,28 +680,30 @@ MlasGemmQuantCopyPackB8x8( int b4 = *reinterpret_cast(&b[ldb*3]); __vector int vb = {b1, b2, b3, b4}; Vtype vx = vec_perm(reinterpret_cast(vb), reinterpret_cast(vb), mask); - vec_t vx1 = reinterpret_cast(vec_add (vx, vmask)); + vec_t vx1 = BIsSigned ? reinterpret_cast(vec_add(vx, vmask)) : + reinterpret_cast(vx); *reinterpret_cast(&D[0]) = vx1; - vsum = vec_sum4s (vx1, vsum); + vsum = vec_sum4s(vx1, vsum); D += 16; b += ldb*4; y -= 4; } while (y >= 4); } if (y >= 1) { - Vtype vb = reinterpret_cast(vec_splats(Flip)); + Vtype vb = vmask; __vector int vb1 = reinterpret_cast<__vector int>(vb); vb1[0] = *reinterpret_cast(&b[0]); - if( y >= 2) { + if (y >= 2) { vb1[1] = *reinterpret_cast(&b[ldb]); } - if( y >= 3) { + if (y >= 3) { vb1[2] = *reinterpret_cast(&b[ldb*2]); } Vtype vx = vec_perm(reinterpret_cast(vb1), reinterpret_cast(vb1), mask); - vec_t vx1 = reinterpret_cast(vec_add (vx, vmask)); + vec_t vx1 = BIsSigned ? reinterpret_cast(vec_add(vx, vmask)) : + reinterpret_cast(vx); *reinterpret_cast(&D[0]) = vx1; - vsum = vec_sum4s (vx1, vsum); + vsum = vec_sum4s(vx1, vsum); D += 16; } *ColumnSumBuffer++ = vsum[0]; @@ -600,7 +723,7 @@ MlasGemmQuantCopyPackB8x8( size_t y = CountK; if (y >= 4) { do { - Vtype vb = reinterpret_cast(vec_splats(Flip)); + Vtype vb = vmask; if (CountN == 1) { vb[0] = b[0]; vb[4] = b[ldb]; @@ -632,16 +755,17 @@ MlasGemmQuantCopyPackB8x8( vb[14] = b[ldb*3+2]; } Vtype vx = vec_perm(reinterpret_cast(vb), reinterpret_cast(vb), mask); - vec_t vx1 = reinterpret_cast(vec_add (vx, vmask)); + vec_t vx1 = BIsSigned ? reinterpret_cast(vec_add(vx, vmask)) : + reinterpret_cast(vx); *reinterpret_cast(&D[0]) = vx1; - vsum = vec_sum4s (vx1, vsum); + vsum = vec_sum4s(vx1, vsum); D += 16; b += ldb*4; y -= 4; } while (y >= 4); } if (y >= 1) { - Vtype vb = reinterpret_cast(vec_splats(Flip)); + Vtype vb = vmask; if (CountN == 1) { vb[0]= b[0]; if (y >= 2) { @@ -679,9 +803,10 @@ MlasGemmQuantCopyPackB8x8( } } Vtype vx = vec_perm(reinterpret_cast(vb), reinterpret_cast(vb), mask); - vec_t vx1 = reinterpret_cast(vec_add (vx, vmask)); + vec_t vx1 = BIsSigned ? 
reinterpret_cast(vec_add(vx, vmask)) : + reinterpret_cast(vx); *reinterpret_cast(&D[0]) = vx1; - vsum = vec_sum4s (vx1, vsum); + vsum = vec_sum4s(vx1, vsum); D += 16; } *ColumnSumBuffer++ = vsum[0]; @@ -707,9 +832,9 @@ MlasGemmQuantCopyPackA( ) { if (AIsSigned) { - MlasGemmQuantCopyPackA8x8<__vector signed char>(D, A, lda, CountM, CountK, RowSumBuffer, AIsSigned); + MlasGemmQuantCopyPackA8x8<__vector signed char, true>(D, A, lda, CountM, CountK, RowSumBuffer); } else { - MlasGemmQuantCopyPackA8x8<__vector unsigned char>(D, A, lda, CountM, CountK, RowSumBuffer, AIsSigned); + MlasGemmQuantCopyPackA8x8<__vector unsigned char, false>(D, A, lda, CountM, CountK, RowSumBuffer); } } template<> @@ -725,9 +850,9 @@ MlasGemmQuantCopyPackB( ) { if (BIsSigned) { - MlasGemmQuantCopyPackB8x8<__vector signed char>(D, B, ldb, CountN, CountK, ColumnSumBuffer, BIsSigned); + MlasGemmQuantCopyPackB8x8<__vector signed char, true>(D, B, ldb, CountN, CountK, ColumnSumBuffer); } else { - MlasGemmQuantCopyPackB8x8< __vector unsigned char>(D, B, ldb, CountN, CountK, ColumnSumBuffer, BIsSigned); + MlasGemmQuantCopyPackB8x8< __vector unsigned char, false>(D, B, ldb, CountN, CountK, ColumnSumBuffer); } } @@ -747,46 +872,93 @@ MlasQgemmStoreVectorMMA int pos ) { - __vector int *rowC; - __vector signed int vsum = {0}; + size_t RowCount; + __vector signed int vsum0, vsum1, vsum2, vsum3; + __vector signed int columnsum = *reinterpret_cast(&ColumnSumBuffer[pos]); + C += VectorCount; if (ZeroPointB != nullptr) { + __vector signed int zeropoint = *reinterpret_cast(&ZeroPointB[pos]); if (ZeroMode) { - for (size_t RowCount = 0;RowCount < row; RowCount++){ - vsum[0] = RowSumBuffer[RowCount] * ZeroPointB[pos] + ColumnSumBuffer[pos]; - vsum[1] = RowSumBuffer[RowCount] * ZeroPointB[pos+1] + ColumnSumBuffer[pos+1]; - vsum[2] = RowSumBuffer[RowCount] * ZeroPointB[pos+2] + ColumnSumBuffer[pos+2]; - vsum[3] = RowSumBuffer[RowCount] * ZeroPointB[pos+3] + ColumnSumBuffer[pos+3]; - rowC = reinterpret_cast<__vector int *>(&C[ldc * RowCount + VectorCount]); - rowC[0] = *reinterpret_cast<__vector int *>(&result[RowCount]) + vsum; + for (RowCount = 0; RowCount + 4 <= row; RowCount += 4, C += ldc*4) { + vsum0 = vec_splats(RowSumBuffer[RowCount + 0]) * zeropoint + columnsum; + vsum1 = vec_splats(RowSumBuffer[RowCount + 1]) * zeropoint + columnsum; + vsum2 = vec_splats(RowSumBuffer[RowCount + 2]) * zeropoint + columnsum; + vsum3 = vec_splats(RowSumBuffer[RowCount + 3]) * zeropoint + columnsum; + *reinterpret_cast<__vector int *>(&C[0]) = + *reinterpret_cast<__vector int *>(&result[RowCount + 0]) + vsum0; + *reinterpret_cast<__vector int *>(&C[ldc]) = + *reinterpret_cast<__vector int *>(&result[RowCount + 1]) + vsum1; + *reinterpret_cast<__vector int *>(&C[ldc*2]) = + *reinterpret_cast<__vector int *>(&result[RowCount + 2]) + vsum2; + *reinterpret_cast<__vector int *>(&C[ldc*3]) = + *reinterpret_cast<__vector int *>(&result[RowCount + 3]) + vsum3; + } + for (; RowCount < row; RowCount++, C += ldc) { + vsum0 = vec_splats(RowSumBuffer[RowCount]) * zeropoint + columnsum; + *reinterpret_cast<__vector int *>(&C[0]) = + *reinterpret_cast<__vector int *>(&result[RowCount + 0]) + vsum0; } } else { - for (size_t RowCount = 0;RowCount < row; RowCount++){ - vsum[0] = RowSumBuffer[RowCount] * ZeroPointB[pos] + ColumnSumBuffer[pos]; - vsum[1] = RowSumBuffer[RowCount] * ZeroPointB[pos+1] + ColumnSumBuffer[pos+1]; - vsum[2] = RowSumBuffer[RowCount] * ZeroPointB[pos+2] + ColumnSumBuffer[pos+2]; - vsum[3] = RowSumBuffer[RowCount] * ZeroPointB[pos+3] + 
ColumnSumBuffer[pos+3]; - rowC = reinterpret_cast<__vector int *>(&C[ldc * RowCount + VectorCount]); - rowC[0] += *reinterpret_cast<__vector int *>(&result[RowCount]) + vsum; + for (RowCount = 0; RowCount + 4 <= row; RowCount += 4, C += ldc*4) { + vsum0 = vec_splats(RowSumBuffer[RowCount + 0]) * zeropoint + columnsum; + vsum1 = vec_splats(RowSumBuffer[RowCount + 1]) * zeropoint + columnsum; + vsum2 = vec_splats(RowSumBuffer[RowCount + 2]) * zeropoint + columnsum; + vsum3 = vec_splats(RowSumBuffer[RowCount + 3]) * zeropoint + columnsum; + *reinterpret_cast<__vector int *>(&C[0]) += + *reinterpret_cast<__vector int *>(&result[RowCount + 0]) + vsum0; + *reinterpret_cast<__vector int *>(&C[ldc]) += + *reinterpret_cast<__vector int *>(&result[RowCount + 1]) + vsum1; + *reinterpret_cast<__vector int *>(&C[ldc*2]) += + *reinterpret_cast<__vector int *>(&result[RowCount + 2]) + vsum2; + *reinterpret_cast<__vector int *>(&C[ldc*3]) += + *reinterpret_cast<__vector int *>(&result[RowCount + 3]) + vsum3; + } + for (; RowCount < row; RowCount++, C += ldc) { + vsum0 = vec_splats(RowSumBuffer[RowCount]) * zeropoint + columnsum; + *reinterpret_cast<__vector int *>(&C[0]) += + *reinterpret_cast<__vector int *>(&result[RowCount + 0]) + vsum0; } } } else { if (ZeroMode) { - for (size_t RowCount = 0;RowCount < row; RowCount++){ - vsum[0] = RowSumBuffer[RowCount] + ColumnSumBuffer[pos]; - vsum[1] = RowSumBuffer[RowCount] + ColumnSumBuffer[pos+1]; - vsum[2] = RowSumBuffer[RowCount] + ColumnSumBuffer[pos+2]; - vsum[3] = RowSumBuffer[RowCount] + ColumnSumBuffer[pos+3]; - rowC = reinterpret_cast<__vector int *>(&C[ldc * RowCount + VectorCount]); - rowC[0] = *reinterpret_cast<__vector int *>(&result[RowCount]) + vsum; + for (RowCount = 0; RowCount + 4 <= row; RowCount += 4, C += ldc*4) { + vsum0 = vec_splats(RowSumBuffer[RowCount + 0]) + columnsum; + vsum1 = vec_splats(RowSumBuffer[RowCount + 1]) + columnsum; + vsum2 = vec_splats(RowSumBuffer[RowCount + 2]) + columnsum; + vsum3 = vec_splats(RowSumBuffer[RowCount + 3]) + columnsum; + *reinterpret_cast<__vector int *>(&C[0]) = + *reinterpret_cast<__vector int *>(&result[RowCount + 0]) + vsum0; + *reinterpret_cast<__vector int *>(&C[ldc]) = + *reinterpret_cast<__vector int *>(&result[RowCount + 1]) + vsum1; + *reinterpret_cast<__vector int *>(&C[ldc*2]) = + *reinterpret_cast<__vector int *>(&result[RowCount + 2]) + vsum2; + *reinterpret_cast<__vector int *>(&C[ldc*3]) = + *reinterpret_cast<__vector int *>(&result[RowCount + 3]) + vsum3; + } + for (; RowCount < row; RowCount++, C += ldc) { + vsum0 = vec_splats(RowSumBuffer[RowCount]) + columnsum; + *reinterpret_cast<__vector int *>(&C[0]) = + *reinterpret_cast<__vector int *>(&result[RowCount + 0]) + vsum0; } } else { - for (size_t RowCount = 0;RowCount < row; RowCount++){ - vsum[0] = RowSumBuffer[RowCount] + ColumnSumBuffer[pos]; - vsum[1] = RowSumBuffer[RowCount] + ColumnSumBuffer[pos+1]; - vsum[2] = RowSumBuffer[RowCount] + ColumnSumBuffer[pos+2]; - vsum[3] = RowSumBuffer[RowCount] + ColumnSumBuffer[pos+3]; - rowC = reinterpret_cast<__vector int *>(&C[ldc * RowCount + VectorCount]); - rowC[0] += *reinterpret_cast<__vector int *>(&result[RowCount]) + vsum; + for (RowCount = 0; RowCount + 4 <= row; RowCount += 4, C += ldc*4) { + vsum0 = vec_splats(RowSumBuffer[RowCount + 0]) + columnsum; + vsum1 = vec_splats(RowSumBuffer[RowCount + 1]) + columnsum; + vsum2 = vec_splats(RowSumBuffer[RowCount + 2]) + columnsum; + vsum3 = vec_splats(RowSumBuffer[RowCount + 3]) + columnsum; + *reinterpret_cast<__vector int *>(&C[0]) += + 
*reinterpret_cast<__vector int *>(&result[RowCount + 0]) + vsum0; + *reinterpret_cast<__vector int *>(&C[ldc]) += + *reinterpret_cast<__vector int *>(&result[RowCount + 1]) + vsum1; + *reinterpret_cast<__vector int *>(&C[ldc*2]) += + *reinterpret_cast<__vector int *>(&result[RowCount + 2]) + vsum2; + *reinterpret_cast<__vector int *>(&C[ldc*3]) += + *reinterpret_cast<__vector int *>(&result[RowCount + 3]) + vsum3; + } + for (; RowCount < row; RowCount++, C += ldc) { + vsum0 = vec_splats(RowSumBuffer[RowCount]) + columnsum; + *reinterpret_cast<__vector int *>(&C[0]) += + *reinterpret_cast<__vector int *>(&result[RowCount + 0]) + vsum0; } } } @@ -846,36 +1018,36 @@ MlasQgemmComputeMMA( ) { if (CountK == 16) { - __builtin_mma_xvi8ger4pp (acc0, va[0], vb[0]); - __builtin_mma_xvi8ger4pp (acc0, va[1], vb[1]); - __builtin_mma_xvi8ger4pp (acc0, va[2], vb[2]); - __builtin_mma_xvi8ger4pp (acc0, va[3], vb[3]); + __builtin_mma_xvi8ger4pp(acc0, va[0], vb[0]); + __builtin_mma_xvi8ger4pp(acc0, va[1], vb[1]); + __builtin_mma_xvi8ger4pp(acc0, va[2], vb[2]); + __builtin_mma_xvi8ger4pp(acc0, va[3], vb[3]); if (CountM) { - __builtin_mma_xvi8ger4pp (acc1, va[4], vb[0]); - __builtin_mma_xvi8ger4pp (acc1, va[5], vb[1]); - __builtin_mma_xvi8ger4pp (acc1, va[6], vb[2]); - __builtin_mma_xvi8ger4pp (acc1, va[7], vb[3]); + __builtin_mma_xvi8ger4pp(acc1, va[4], vb[0]); + __builtin_mma_xvi8ger4pp(acc1, va[5], vb[1]); + __builtin_mma_xvi8ger4pp(acc1, va[6], vb[2]); + __builtin_mma_xvi8ger4pp(acc1, va[7], vb[3]); } } else if (CountK == 12) { - __builtin_mma_xvi8ger4pp (acc0, va[0], vb[0]); - __builtin_mma_xvi8ger4pp (acc0, va[1], vb[1]); - __builtin_mma_xvi8ger4pp (acc0, va[2], vb[2]); + __builtin_mma_xvi8ger4pp(acc0, va[0], vb[0]); + __builtin_mma_xvi8ger4pp(acc0, va[1], vb[1]); + __builtin_mma_xvi8ger4pp(acc0, va[2], vb[2]); if (CountM) { - __builtin_mma_xvi8ger4pp (acc1, va[3], vb[0]); - __builtin_mma_xvi8ger4pp (acc1, va[4], vb[1]); - __builtin_mma_xvi8ger4pp (acc1, va[5], vb[2]); + __builtin_mma_xvi8ger4pp(acc1, va[3], vb[0]); + __builtin_mma_xvi8ger4pp(acc1, va[4], vb[1]); + __builtin_mma_xvi8ger4pp(acc1, va[5], vb[2]); } } else if (CountK == 8) { - __builtin_mma_xvi8ger4pp (acc0, va[0], vb[0]); - __builtin_mma_xvi8ger4pp (acc0, va[1], vb[1]); + __builtin_mma_xvi8ger4pp(acc0, va[0], vb[0]); + __builtin_mma_xvi8ger4pp(acc0, va[1], vb[1]); if (CountM) { - __builtin_mma_xvi8ger4pp (acc1, va[2], vb[0]); - __builtin_mma_xvi8ger4pp (acc1, va[3], vb[1]); + __builtin_mma_xvi8ger4pp(acc1, va[2], vb[0]); + __builtin_mma_xvi8ger4pp(acc1, va[3], vb[1]); } } else { - __builtin_mma_xvi8ger4pp (acc0, va[0], vb[0]); + __builtin_mma_xvi8ger4pp(acc0, va[0], vb[0]); if (CountM) { - __builtin_mma_xvi8ger4pp (acc1, va[1], vb[0]); + __builtin_mma_xvi8ger4pp(acc1, va[1], vb[0]); } } }; @@ -902,7 +1074,7 @@ MlasGemmQuantKernel( if (Mval >= 8) { Mval = 4; } - while(CountN > 0) { + while (CountN > 0) { const int8_t *a = A; typedef __vector unsigned char vec_t; const uint8_t *b = B; @@ -1057,23 +1229,23 @@ MlasGemmQuantKernel( } // Store matrix C with accumulator result. 
if (CountN >=16) { - __builtin_mma_disassemble_acc (reinterpret_cast(result), &acc0); + __builtin_mma_disassemble_acc(reinterpret_cast(result), &acc0); MlasQgemmStoreVectorMMA<0>(result, C, ldc, Mval, ZeroMode, RowSumBuffer, ColumnSumBuffer, ZeroPointB, 0); - __builtin_mma_disassemble_acc (reinterpret_cast(result), &acc1); + __builtin_mma_disassemble_acc(reinterpret_cast(result), &acc1); MlasQgemmStoreVectorMMA<4>(result, C, ldc, Mval, ZeroMode, RowSumBuffer, ColumnSumBuffer, ZeroPointB, 4); - __builtin_mma_disassemble_acc (reinterpret_cast(result), &acc2); + __builtin_mma_disassemble_acc(reinterpret_cast(result), &acc2); MlasQgemmStoreVectorMMA<8>(result, C, ldc, Mval, ZeroMode, RowSumBuffer, ColumnSumBuffer, ZeroPointB, 8); - __builtin_mma_disassemble_acc (reinterpret_cast(result), &acc3); + __builtin_mma_disassemble_acc(reinterpret_cast(result), &acc3); MlasQgemmStoreVectorMMA<12>(result, C, ldc, Mval, ZeroMode, RowSumBuffer, ColumnSumBuffer, ZeroPointB, 12); if (CountM >= 8) { C1 = C+ldc*4; - __builtin_mma_disassemble_acc (reinterpret_cast(result), &acc4); + __builtin_mma_disassemble_acc(reinterpret_cast(result), &acc4); MlasQgemmStoreVectorMMA<0>(result, C1, ldc, 4, ZeroMode, RowSumBuffer+4, ColumnSumBuffer, ZeroPointB, 0); - __builtin_mma_disassemble_acc (reinterpret_cast(result), &acc5); + __builtin_mma_disassemble_acc(reinterpret_cast(result), &acc5); MlasQgemmStoreVectorMMA<4>(result, C1, ldc, 4, ZeroMode, RowSumBuffer+4, ColumnSumBuffer, ZeroPointB, 4); - __builtin_mma_disassemble_acc (reinterpret_cast(result), &acc6); + __builtin_mma_disassemble_acc(reinterpret_cast(result), &acc6); MlasQgemmStoreVectorMMA<8>(result, C1, ldc, 4, ZeroMode, RowSumBuffer+4, ColumnSumBuffer, ZeroPointB, 8); - __builtin_mma_disassemble_acc (reinterpret_cast(result), &acc7); + __builtin_mma_disassemble_acc(reinterpret_cast(result), &acc7); MlasQgemmStoreVectorMMA<12>(result, C1, ldc, 4, ZeroMode, RowSumBuffer+4, ColumnSumBuffer, ZeroPointB, 12); } INC_BUFFER(16); @@ -1082,72 +1254,72 @@ MlasGemmQuantKernel( C += 16; } else { if (CountN >=12 ) { - __builtin_mma_disassemble_acc (reinterpret_cast(result), &acc0); + __builtin_mma_disassemble_acc(reinterpret_cast(result), &acc0); MlasQgemmStoreVectorMMA<0>(result, C, ldc, Mval, ZeroMode, RowSumBuffer, ColumnSumBuffer, ZeroPointB, 0); - __builtin_mma_disassemble_acc (reinterpret_cast(result), &acc1); + __builtin_mma_disassemble_acc(reinterpret_cast(result), &acc1); MlasQgemmStoreVectorMMA<4>(result, C, ldc, Mval, ZeroMode, RowSumBuffer, ColumnSumBuffer, ZeroPointB, 4); - __builtin_mma_disassemble_acc (reinterpret_cast(result), &acc2); + __builtin_mma_disassemble_acc(reinterpret_cast(result), &acc2); MlasQgemmStoreVectorMMA<8>(result, C, ldc, Mval, ZeroMode, RowSumBuffer, ColumnSumBuffer, ZeroPointB, 8); if (CountM >= 8) { C1 = C+ldc*4; - __builtin_mma_disassemble_acc (reinterpret_cast(result), &acc4); + __builtin_mma_disassemble_acc(reinterpret_cast(result), &acc4); MlasQgemmStoreVectorMMA<0>(result, C1, ldc, 4, ZeroMode, RowSumBuffer+4, ColumnSumBuffer, ZeroPointB, 0); - __builtin_mma_disassemble_acc (reinterpret_cast(result), &acc5); + __builtin_mma_disassemble_acc(reinterpret_cast(result), &acc5); MlasQgemmStoreVectorMMA<4>(result, C1, ldc, 4, ZeroMode, RowSumBuffer+4, ColumnSumBuffer, ZeroPointB, 4); - __builtin_mma_disassemble_acc (reinterpret_cast(result), &acc6); + __builtin_mma_disassemble_acc(reinterpret_cast(result), &acc6); MlasQgemmStoreVectorMMA<8>(result, C1, ldc, 4, ZeroMode, RowSumBuffer+4, ColumnSumBuffer, ZeroPointB, 8); } INC_BUFFER(12); 
if (CountN - 12 > 0) { - __builtin_mma_disassemble_acc (reinterpret_cast(result), &acc3); + __builtin_mma_disassemble_acc(reinterpret_cast(result), &acc3); if (CountM >= 8) { - __builtin_mma_disassemble_acc (reinterpret_cast(result1), &acc7); + __builtin_mma_disassemble_acc(reinterpret_cast(result1), &acc7); } } CountN -= 12; C += 12; } else if (CountN >= 8) { - __builtin_mma_disassemble_acc (reinterpret_cast(result), &acc0); + __builtin_mma_disassemble_acc(reinterpret_cast(result), &acc0); MlasQgemmStoreVectorMMA<0>(result, C, ldc, Mval, ZeroMode, RowSumBuffer, ColumnSumBuffer, ZeroPointB, 0); - __builtin_mma_disassemble_acc (reinterpret_cast(result), &acc1); + __builtin_mma_disassemble_acc(reinterpret_cast(result), &acc1); MlasQgemmStoreVectorMMA<4>(result, C, ldc, Mval, ZeroMode, RowSumBuffer, ColumnSumBuffer, ZeroPointB, 4); if (CountM >= 8) { C1 = C+ldc*4; - __builtin_mma_disassemble_acc (reinterpret_cast(result), &acc4); + __builtin_mma_disassemble_acc(reinterpret_cast(result), &acc4); MlasQgemmStoreVectorMMA<0>(result, C1, ldc, 4, ZeroMode, RowSumBuffer+4, ColumnSumBuffer, ZeroPointB, 0); - __builtin_mma_disassemble_acc (reinterpret_cast(result), &acc5); + __builtin_mma_disassemble_acc(reinterpret_cast(result), &acc5); MlasQgemmStoreVectorMMA<4>(result, C1, ldc, 4, ZeroMode, RowSumBuffer+4, ColumnSumBuffer, ZeroPointB, 4); } INC_BUFFER(8); if (CountN - 8 > 0) { - __builtin_mma_disassemble_acc (reinterpret_cast(result), &acc2); + __builtin_mma_disassemble_acc(reinterpret_cast(result), &acc2); if (CountM >= 8) { - __builtin_mma_disassemble_acc (reinterpret_cast(result1), &acc6); + __builtin_mma_disassemble_acc(reinterpret_cast(result1), &acc6); } } CountN -= 8; C += 8; } else if (CountN >= 4) { - __builtin_mma_disassemble_acc (reinterpret_cast(result), &acc0); + __builtin_mma_disassemble_acc(reinterpret_cast(result), &acc0); MlasQgemmStoreVectorMMA<0>(result, C, ldc, Mval, ZeroMode, RowSumBuffer, ColumnSumBuffer, ZeroPointB, 0); if (CountM >= 8) { C1 = C+ldc*4; - __builtin_mma_disassemble_acc (reinterpret_cast(result), &acc4); + __builtin_mma_disassemble_acc(reinterpret_cast(result), &acc4); MlasQgemmStoreVectorMMA<0>(result, C1, ldc, 4, ZeroMode, RowSumBuffer+4, ColumnSumBuffer, ZeroPointB, 0); if (CountN - 4 > 0) { - __builtin_mma_disassemble_acc (reinterpret_cast(result1), &acc5); + __builtin_mma_disassemble_acc(reinterpret_cast(result1), &acc5); } } INC_BUFFER(4); if (CountN - 4 > 0) { - __builtin_mma_disassemble_acc (reinterpret_cast(result), &acc1); + __builtin_mma_disassemble_acc(reinterpret_cast(result), &acc1); } CountN -= 4; C += 4; } else { - __builtin_mma_disassemble_acc (reinterpret_cast(result), &acc0); + __builtin_mma_disassemble_acc(reinterpret_cast(result), &acc0); if (CountM >= 8) { - __builtin_mma_disassemble_acc (reinterpret_cast(result1), &acc4); + __builtin_mma_disassemble_acc(reinterpret_cast(result1), &acc4); } } CountN &= 3; From df28c7d73b72440f115ccf80f3840ea0ca5bb3a9 Mon Sep 17 00:00:00 2001 From: Adrian Lizarraga Date: Wed, 5 Jun 2024 16:48:40 -0700 Subject: [PATCH 19/26] [Quant tool] Improve performance of int4 weight quantization (#20935) ### Description - Uses our own quantization functions instead of the ONNX reference implementation of QuantizeLinear when quantizing weights to int4. - Uses a custom function that packs bytes into 4-bit elements. ### Motivation and Context Running the quantization tool to create QDQ models with int4 weights could take up to 7x longer. This PR uses our own quantization and byte packing utilities to improve performance. 
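For reference, the packing layout used by the new helper is two 4-bit elements per output byte, low nibble first. Below is a minimal standalone sketch of that idea (illustrative only; the helper this PR actually adds is `pack_bytes_to_4bit()` in `quant_utils.py`, and the function name below is made up):

```python
# Illustrative sketch of low-nibble-first 4-bit packing; not the ORT implementation.
def pack_low_nibbles(values_8bit: bytes) -> bytearray:
    packed = bytearray((len(values_8bit) + 1) // 2)
    for i, v in enumerate(values_8bit):
        nibble = v & 0xF                   # keep only the low 4 bits
        if i % 2 == 0:
            packed[i // 2] = nibble        # even index -> low nibble
        else:
            packed[i // 2] |= nibble << 4  # odd index -> high nibble
    return packed

# Two elements per output byte, e.g. 5 inputs pack into 3 bytes.
assert len(pack_low_nibbles(bytes([1, 2, 3, 4, 5]))) == 3
assert pack_low_nibbles(bytes([0x1, 0x2])) == bytearray([0x21])
```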
#### Measurements Model with ~5M parameters to quantize to int4. - Current implementation: **84.5s** - Only replace ONNX QuantizeLinear implementation: **50.3s** (1.68x speedup) - This PR (replace onnx Q impl, custom packing func): **13.5s** (6.26x speedup) --------- Signed-off-by: adrianlizarraga --- .../tools/quantization/base_quantizer.py | 39 ++++++---- .../python/tools/quantization/quant_utils.py | 78 +++++++++++-------- .../python/quantization/test_quant_util.py | 69 +++++++++++++++- 3 files changed, 137 insertions(+), 49 deletions(-) diff --git a/onnxruntime/python/tools/quantization/base_quantizer.py b/onnxruntime/python/tools/quantization/base_quantizer.py index 74e213fa61362..06d2ce30b9b37 100644 --- a/onnxruntime/python/tools/quantization/base_quantizer.py +++ b/onnxruntime/python/tools/quantization/base_quantizer.py @@ -25,6 +25,7 @@ find_by_name, model_has_infer_metadata, normalize_axis, + pack_bytes_to_4bit, quantize_data, quantize_nparray, save_and_reload_model_with_shape_infer, @@ -340,13 +341,17 @@ def quantize_initializer_impl(self, weight, qType, reduce_range=False, keep_floa f"\nraw={str(q_weight_initializer)[:200]}." ) elif qType in (onnx.TensorProto.INT4, onnx.TensorProto.UINT4): - # TODO: Use simpler make_tensor call when ONNX bug that does not store negative weights packed - # within int32_data is fixed. - # q_weight_initializer = onnx.helper.make_tensor(q_weight_name, qType, weight.dims, q_weight_data) - packed_data = onnx.helper.pack_float32_to_4bit(q_weight_data.flatten(), qType == onnx.TensorProto.INT4) - q_weight_initializer = onnx.helper.make_tensor( - q_weight_name, qType, weight.dims, packed_data.tobytes(), raw=True - ) + if q_weight_data.dtype not in (np.int8, np.uint8): + raise RuntimeError( + f"Quantized weights for {q_weight_name} must be 8-bit before packing as 4-bit values." + ) + + # We do not use onnx.helper.pack_float32_to_4bit() due to performance. + # This can be the difference between a large model taking 30 minutes to quantize vs 5 minutes. + packed_data = bytes(pack_bytes_to_4bit(q_weight_data.tobytes())) + + # We only use onnx.helper.make_tensor with raw data due to bug: https://github.com/onnx/onnx/pull/6161 + q_weight_initializer = onnx.helper.make_tensor(q_weight_name, qType, weight.dims, packed_data, raw=True) else: q_weight_data = np.asarray(q_weight_data, dtype=onnx.helper.tensor_dtype_to_np_dtype(qType)).reshape( weight.dims @@ -483,16 +488,18 @@ def quantize_weight_per_channel_impl( if not keep_float_weight: if weight_qType in (onnx.TensorProto.INT4, onnx.TensorProto.UINT4): - # TODO: Use simpler make_tensor call when ONNX bug that does not store negative weights packed - # within int32_data is fixed. - # q_weight_initializer = onnx.helper.make_tensor( - # q_weight_name, weight_qType, weights_shape, quantized_weights - # ) - packed_data = onnx.helper.pack_float32_to_4bit( - quantized_weights.flatten(), weight_qType == onnx.TensorProto.INT4 - ) + if quantized_weights.dtype not in (np.int8, np.uint8): + raise RuntimeError( + f"Quantized weights for {q_weight_name} must be 8-bit before packing as 4-bit values." + ) + + # We do not use onnx.helper.pack_float32_to_4bit() due to performance. + # This can be the difference between a large model taking 30 minutes to quantize vs 5 minutes. 
+ packed_data = bytes(pack_bytes_to_4bit(quantized_weights.tobytes())) + + # We only use onnx.helper.make_tensor with raw data due to bug: https://github.com/onnx/onnx/pull/6161 q_weight_initializer = onnx.helper.make_tensor( - q_weight_name, weight_qType, weights_shape, packed_data.tobytes(), raw=True + q_weight_name, weight_qType, weights_shape, packed_data, raw=True ) self.model.initializer_extend([q_weight_initializer]) else: diff --git a/onnxruntime/python/tools/quantization/quant_utils.py b/onnxruntime/python/tools/quantization/quant_utils.py index bdf6d5a355206..53d2eaeaba70b 100644 --- a/onnxruntime/python/tools/quantization/quant_utils.py +++ b/onnxruntime/python/tools/quantization/quant_utils.py @@ -21,10 +21,18 @@ from onnxruntime import GraphOptimizationLevel, InferenceSession, SessionOptions try: - from onnx.reference.custom_element_types import float8e4m3fn, int4, uint4 + from onnx.reference.custom_element_types import float8e4m3fn except ImportError: float8e4m3fn = None +# INT4 np.dtypes added in ONNX 1.16. These map to np.int8/np.uint8 because numpy +# does not support sub-byte types. +try: + from onnx.reference.custom_element_types import int4, uint4 +except ImportError: + int4 = None + uint4 = None + __producer__ = "onnx.quantize" __version__ = "0.1.0" @@ -134,8 +142,8 @@ def from_string(format): onnx_proto.TensorProto.INT16: numpy.dtype("int16"), onnx_proto.TensorProto.UINT16: numpy.dtype("uint16"), onnx_proto.TensorProto.FLOAT8E4M3FN: float8e4m3fn, - onnx_proto.TensorProto.INT4: int4, - onnx_proto.TensorProto.UINT4: uint4, + onnx_proto.TensorProto.INT4: int4, # base_dtype is np.int8 + onnx_proto.TensorProto.UINT4: uint4, # base_dtype is np.uint8 } ONNX_INT_TYPE_RANGE = { @@ -212,36 +220,12 @@ def quantize_nparray(qType, arr, scale, zero_point, low=None, high=None): ) ref = ReferenceEvaluator(onnx_model) return _check_type(ref.run(None, {"X": arr, "scale": scale})[0]) - elif qType in ( - onnx_proto.TensorProto.INT4, - onnx_proto.TensorProto.UINT4, - ): - if arr.dtype == numpy.float32: - onnx_type = TensorProto.FLOAT - elif arr.dtype == numpy.float16: - onnx_type = TensorProto.FLOAT16 - else: - raise ValueError(f"Unexpected dtype {arr.dtype}.") - onnx_model = make_model( - make_graph( - [ - make_node("QuantizeLinear", ["X", "scale", "zero_point"], ["Y"]), - ], - "qu", - [ - make_tensor_value_info("X", onnx_type, None), - make_tensor_value_info("scale", onnx_type, None), - make_tensor_value_info("zero_point", qType, None), - ], - [make_tensor_value_info("Y", qType, None)], - ) - ) - # The reference ONNX implementation of QuantizeLinear returns "unpacked" int8 numpy values - # because numpy cannot represent 4bit values (although ONNX TensorProto has no problem with this). - # These "unpacked" int8 values are correctly re-packed when passed to onnx.make_tensor(). - ref = ReferenceEvaluator(onnx_model) - return _check_type(ref.run(None, {"X": arr, "scale": scale, "zero_point": zero_point})[0]) else: + # Quantizes data for all integer types. + # + # For int4 types, the quantized data is returned as either np.int8 or np.uint8, + # which matches the python reference ONNX implementation of QuantizeLinear. + # This data can be packed into 4-bit elements by using pack_bytes_to_4bit(). 
dtype = ONNX_TYPE_TO_NP_TYPE[qType] (qmin, qmax) = get_qmin_qmax_for_qType(qType, reduce_range=False, symmetric=True) @@ -482,6 +466,36 @@ def normalize_axis(axis: int, rank: int) -> tuple[bool, int]: return is_valid, axis_norm +def pack_bytes_to_4bit(src_8bit: bytes) -> bytearray: + """ + Copies a source array of 8-bit values into a destination bytearray of packed 4-bit values. + Assumes that the source values are already in the appropriate int4 range. + :parameter src_8bit: The 8-bit element values to pack. + :return A bytearray with every two 8-bit src elements packed into a single byte. + """ + num_elems = len(src_8bit) + if num_elems == 0: + return bytearray() + + dst_size = (num_elems + 1) // 2 # Ex: 5 8-bit elems packed into 3 bytes + dst = bytearray(dst_size) + + src_i: int = 0 + dst_i: int = 0 + + # Pack two 8-bit elements into a single byte in each iteration. + while src_i < num_elems - 1: + dst[dst_i] = ((src_8bit[src_i + 1] & 0xF) << 4) | (src_8bit[src_i] & 0xF) + dst_i += 1 + src_i += 2 + + if src_i < num_elems: + # Odd number of elements. + dst[dst_i] = src_8bit[src_i] & 0xF + + return dst + + class QuantizedInitializer: """ Represents a linearly quantized weight input from ONNX operators diff --git a/onnxruntime/test/python/quantization/test_quant_util.py b/onnxruntime/test/python/quantization/test_quant_util.py index 848857ceb279d..7b3fc08982ac1 100644 --- a/onnxruntime/test/python/quantization/test_quant_util.py +++ b/onnxruntime/test/python/quantization/test_quant_util.py @@ -13,7 +13,13 @@ import onnx from onnx import TensorProto, helper, numpy_helper -from onnxruntime.quantization.quant_utils import compute_scale_zp, load_model_with_shape_infer, model_has_infer_metadata +from onnxruntime.quantization.quant_utils import ( + compute_scale_zp, + load_model_with_shape_infer, + model_has_infer_metadata, + pack_bytes_to_4bit, + quantize_data, +) class TestQuantUtil(unittest.TestCase): @@ -101,6 +107,67 @@ def test_load_external_model(self): model_reloaded = load_model_with_shape_infer(Path(model_file_path)) self.assertTrue(model_has_infer_metadata(model_reloaded)) + def test_pack_bytes_to_4bit(self): + """ + Tests the pack_bytes_to_4bit() utility. + """ + subtest_configs = [ + (-8, 6, True), # Odd num elems, signed + (-8, 7, True), # Even num elems, signed + (0, 14, False), # Odd num elems, unsigned + (0, 15, False), # Even num elems, unsigned + ] + for min_val, max_val, signed in subtest_configs: + with self.subTest(min_val=min_val, max_val=max_val, signed=signed): + src_float = numpy.arange(min_val, max_val + 1).astype(numpy.float32) + src_int = src_float.astype(numpy.int8 if signed else numpy.uint8) + + actual_packed_vals = bytes(pack_bytes_to_4bit(src_int.tobytes())) + expected_packed_vals = onnx.helper.pack_float32_to_4bit(src_float, signed).tobytes() + self.assertEqual(actual_packed_vals, expected_packed_vals) + + def test_quantize_data_4bit(self): + """ + Test that calling quantize_data for int4 quantization returns data of the correct type and range. 
+ """ + data_float = numpy.arange(-20, 17).astype(numpy.float32) + + subtest_configs = [ + (onnx.TensorProto.INT4, True), # int4, symmetric quant + (onnx.TensorProto.INT4, False), # int4, symmetric quant + (onnx.TensorProto.UINT4, True), # uint4, symmetric quant + (onnx.TensorProto.UINT4, False), # uint4, symmetric quant + ] + + for onnx_type, symmetric in subtest_configs: + with self.subTest(onnx_type=onnx_type, symmetric=symmetric): + _, _, zero_point, scale, data_quant = quantize_data(data_float, onnx_type, symmetric) + is_signed = onnx_type == onnx.TensorProto.INT4 + np_int_type = numpy.int8 if is_signed else numpy.uint8 + qmin = numpy.array(-8 if is_signed else 0, dtype=np_int_type) + qmax = numpy.array(7 if is_signed else 15, dtype=np_int_type) + + self.assertEqual(zero_point.dtype, np_int_type) + self.assertEqual(scale.dtype, data_float.dtype) + + expected_zp, expected_scale = compute_scale_zp( + data_float.min(), data_float.max(), qmin, qmax, symmetric=symmetric + ) + self.assertEqual(zero_point, expected_zp) + self.assertEqual(scale, expected_scale) + + # Even int4 quantization generates 8-bit numpy values. + self.assertEqual(data_quant.dtype, np_int_type) + for index, actual_quant_val in enumerate(data_quant.flatten()): + self.assertTrue(actual_quant_val >= qmin and actual_quant_val <= qmax) + + expected_quant_val = numpy.asarray((data_float[index] / scale).round() + zero_point).astype( + np_int_type + ) + numpy.clip(expected_quant_val, qmin, qmax, out=expected_quant_val) + + self.assertEqual(numpy.array(actual_quant_val), expected_quant_val) + if __name__ == "__main__": unittest.main() From b5eb9e8a8aeca7187f98706ec423d2e007ae604a Mon Sep 17 00:00:00 2001 From: Adrian Lizarraga Date: Wed, 5 Jun 2024 18:25:23 -0700 Subject: [PATCH 20/26] [QNN EP] Update to QNN SDK 2.22 (#20628) ### Description - Updates pipelines to use QNN SDK 2.22 by default. - Linux QNN pipeline now uses an Ubuntu 22.04 image (required by QNN SDK) - Android QNN pipeline still uses the current Ubuntu 20.04 image. Will update in a separate PR. - Disables QDQ LayerNorm test that triggers QNN's graph finalization error on QNN 2.22 - Increases accuracy tolerance for various HTP tests so that they pass on Windows arm64. ### Motivation and Context Test QNN EP with latest QNN SDK version by default. 
--------- Signed-off-by: adrianlizarraga --- onnxruntime/test/onnx/TestCase.cc | 5 +++++ .../test/providers/cpu/math/matmul_test.cc | 15 ++++--------- .../test/providers/qnn/batch_norm_htp_test.cc | 10 ++++++--- onnxruntime/test/providers/qnn/conv_test.cc | 4 ++-- .../test/providers/qnn/gemm_op_test.cc | 9 +++++--- .../test/providers/qnn/layer_norm_test.cc | 15 ++++++++++++- onnxruntime/test/providers/qnn/lrn_op_test.cc | 8 +++---- .../test/providers/qnn/matmul_test.cpp | 10 +++------ ...arm64-v8a-QNN-crosscompile-ci-pipeline.yml | 2 +- .../c-api-noopenmp-packaging-pipelines.yml | 4 ++-- .../azure-pipelines/linux-qnn-ci-pipeline.yml | 4 ++-- .../azure-pipelines/py-packaging-pipeline.yml | 2 +- .../qnn-ep-nuget-packaging-pipeline.yml | 2 +- .../templates/jobs/download_linux_qnn_sdk.yml | 2 +- .../templates/jobs/download_win_qnn_sdk.yml | 2 +- .../templates/py-packaging-stage.yml | 2 +- .../templates/py-win-arm64-qnn.yml | 2 +- .../templates/py-win-x64-qnn.yml | 2 +- .../azure-pipelines/templates/qnn-ep-win.yml | 2 +- .../win-qnn-arm64-ci-pipeline.yml | 2 +- .../azure-pipelines/win-qnn-ci-pipeline.yml | 22 ++++++++++--------- 21 files changed, 71 insertions(+), 55 deletions(-) diff --git a/onnxruntime/test/onnx/TestCase.cc b/onnxruntime/test/onnx/TestCase.cc index 1d54a3cfae9bf..6d3e9c2cb7865 100644 --- a/onnxruntime/test/onnx/TestCase.cc +++ b/onnxruntime/test/onnx/TestCase.cc @@ -1381,6 +1381,11 @@ std::unique_ptr> GetBrokenTests(const std::string& provider // expected 13.5 (41580000), got 0 (0), diff: 13.5, tol=0.0145 idx=3. 3 of 4 differ broken_tests->insert({"averagepool_2d_ceil", "result differs"}); #endif + // These next 3 Resize tests fail on CPU backend with QNN SDK 2.22.0 due to inaccuracy. + // output=Y:expected 1 (3f800000), got 3 (40400000), diff: 2, tol=0.002 idx=24. 8 of 56 differ + broken_tests->insert({"resize_upsample_sizes_nearest", "result differs"}); + broken_tests->insert({"resize_upsample_sizes_nearest_axes_2_3", "result differs"}); + broken_tests->insert({"resize_upsample_sizes_nearest_axes_3_2", "result differs"}); } #ifdef DISABLE_CONTRIB_OPS diff --git a/onnxruntime/test/providers/cpu/math/matmul_test.cc b/onnxruntime/test/providers/cpu/math/matmul_test.cc index 24340e69c13c2..82f6914d08199 100644 --- a/onnxruntime/test/providers/cpu/math/matmul_test.cc +++ b/onnxruntime/test/providers/cpu/math/matmul_test.cc @@ -163,22 +163,15 @@ void RunMatMulTest(int32_t opset_version, bool is_a_constant, bool is_b_constant // OpenVINO EP: Disabled temporarily matmul broadcasting not fully supported // Disable TensorRT because of unsupported data type - std::unordered_set excluded_providers{kTensorrtExecutionProvider, kOpenVINOExecutionProvider}; + // QNN EP: Crash during graph execution for QNN's CPU backend on QNN SDK 2.22. Not a problem for QNN's HTP backend. + std::unordered_set excluded_providers{kTensorrtExecutionProvider, + kOpenVINOExecutionProvider, + kQnnExecutionProvider}; if (t.name == "test 2D empty input") { // NNAPI: currently fails for the "test 2D empty input" case excluded_providers.insert(kNnapiExecutionProvider); } - if ("test padding and broadcast A > B" == t.name || "test 2D empty input" == t.name) { - // QNN can't handle 0 shap - excluded_providers.insert(kQnnExecutionProvider); - } -#if defined(__linux__) - if (t.name == "test padding and broadcast B > A") { - // Accuracy error with QNN SDK 2.17.0 on CPU backend. 
- excluded_providers.insert(kQnnExecutionProvider); - } -#endif test.ConfigExcludeEps(excluded_providers) .Config(run_with_tunable_op) .RunWithConfig(); diff --git a/onnxruntime/test/providers/qnn/batch_norm_htp_test.cc b/onnxruntime/test/providers/qnn/batch_norm_htp_test.cc index 023a6078ff94d..036c5760ed560 100644 --- a/onnxruntime/test/providers/qnn/batch_norm_htp_test.cc +++ b/onnxruntime/test/providers/qnn/batch_norm_htp_test.cc @@ -158,7 +158,8 @@ GetTestQDQModelFn BuildQDQBatchNormTestCase(const TestInputDef& input_def, const TestInputDef& scale_def, const TestInputDef& bias_def, - ExpectedEPNodeAssignment expected_ep_assignment) { + ExpectedEPNodeAssignment expected_ep_assignment, + QDQTolerance tolerance = QDQTolerance()) { ProviderOptions provider_options; #if defined(_WIN32) provider_options["backend_path"] = "QnnHtp.dll"; @@ -171,7 +172,8 @@ static void RunBatchNormQDQTest(const TestInputDef& input_def, BuildQDQBatchNormTestCase(input_def, scale_def, bias_def), provider_options, 11, - expected_ep_assignment); + expected_ep_assignment, + tolerance); } static void RunBatchNormFP16Test(const TestInputDef& input_def, @@ -219,7 +221,9 @@ TEST_F(QnnHTPBackendTests, BatchNorm2D) { RunBatchNormQDQTest(TestInputDef({2, num_channels, 2, 2}, false, input_data), // Input data TestInputDef({num_channels}, true, {1.0f, 2.0f}), // Scale initializer TestInputDef({num_channels}, true, {1.1f, 2.1f}), // Bias initializer - ExpectedEPNodeAssignment::All); + ExpectedEPNodeAssignment::All, + // Require a slightly increased tolerance on Windows ARM64 (from 0.4% to 0.6%). + QDQTolerance(0.006f)); } // Test FP16 BatchNormalization on the HTP backend. diff --git a/onnxruntime/test/providers/qnn/conv_test.cc b/onnxruntime/test/providers/qnn/conv_test.cc index a469cccbbd447..b88578a915204 100644 --- a/onnxruntime/test/providers/qnn/conv_test.cc +++ b/onnxruntime/test/providers/qnn/conv_test.cc @@ -1626,8 +1626,8 @@ TEST_F(QnnHTPBackendTests, ConvU8U8S32_large_input1_padding_bias_initializer) { ExpectedEPNodeAssignment::All, false, // use_qdq_contrib_ops 13, // opset - // Need tolerance of 0.73% of output range after QNN SDK 2.17 - QDQTolerance(0.00730f)); + // Need tolerance of 0.76% of output range after QNN SDK 2.19.2 + QDQTolerance(0.0076f)); } TEST_F(QnnHTPBackendTests, ConvU8U8S32_large_input2_bias_initializer) { diff --git a/onnxruntime/test/providers/qnn/gemm_op_test.cc b/onnxruntime/test/providers/qnn/gemm_op_test.cc index 959d637753623..33c868694c9c0 100644 --- a/onnxruntime/test/providers/qnn/gemm_op_test.cc +++ b/onnxruntime/test/providers/qnn/gemm_op_test.cc @@ -285,7 +285,8 @@ TEST_F(QnnHTPBackendTests, Gemm_Broadcast_Bias_DynamicInputs) { ExpectedEPNodeAssignment::All, 13, false, - QDQTolerance(0.00410f)); + // Require tolerance of 0.74% on Windows ARM64. + QDQTolerance(0.0074f)); } TEST_F(QnnHTPBackendTests, Gemm_Broadcast_Bias_DynamicA_StaticB_DynamicC) { @@ -304,7 +305,8 @@ TEST_F(QnnHTPBackendTests, Gemm_Broadcast_Bias_DynamicA_StaticB_DynamicC) { ExpectedEPNodeAssignment::All, 13, false, - QDQTolerance(0.00410f)); + // Require tolerance of 0.74% on Windows ARM64. + QDQTolerance(0.0074f)); } TEST_F(QnnHTPBackendTests, Gemm_Broadcast_Bias_DynamicA_StaticB_StaticC) { @@ -323,7 +325,8 @@ TEST_F(QnnHTPBackendTests, Gemm_Broadcast_Bias_DynamicA_StaticB_StaticC) { ExpectedEPNodeAssignment::All, 13, false, - QDQTolerance(0.00410f)); + // Require tolerance of 0.74% on Windows ARM64. + QDQTolerance(0.0074f)); } // Test 16-bit QDQ Gemm with dynamic inputs A and Bias. The B input is an initializer. 
diff --git a/onnxruntime/test/providers/qnn/layer_norm_test.cc b/onnxruntime/test/providers/qnn/layer_norm_test.cc index 8cebdd813dacd..7d129dceca582 100644 --- a/onnxruntime/test/providers/qnn/layer_norm_test.cc +++ b/onnxruntime/test/providers/qnn/layer_norm_test.cc @@ -158,7 +158,20 @@ TEST_F(QnnHTPBackendTests, LayerNorm1D_LastAxis_StaticScale_AU16_WU8) { } // Test accuracy of 8-bit QDQ LayerNorm with a dynamic scale input. -TEST_F(QnnHTPBackendTests, LayerNorm1D_LastAxis_DynamicScale) { +// +// TODO(adrianlizarraga): Fails to finalize with QNN SDK 2.22. +// Verbose logs: +// Starting stage: Graph Transformations and Optimizations +// C:\...\QNN\HTP\HTP\src\hexagon\prepare\graph_prepare.cc:203:ERROR:could not create op: q::flat_to_vtcm +// C:\...\QNN\HTP\HTP\src\hexagon\prepare\graph_prepare.cc:1187:ERROR:Op 0x102800000013 preparation failed with err:-1 +// Completed stage: Graph Transformations and Optimizations (6247 us) +// QnnDsp "node_token_15" generated: could not create op +// QnnDsp RouterWindows graph prepare failed 12 +// QnnDsp Failed to finalize graph (id: 1) with err 1002 +// QnnDsp Wake up free backend 1 thread(s) +// QnnDsp QnnGraph_finalize done. status 0x3ea +// Failed to finalize QNN graph. +TEST_F(QnnHTPBackendTests, DISABLED_LayerNorm1D_LastAxis_DynamicScale) { RunLayerNormQDQTest(TestInputDef({1, 2, 3}, false, GetFloatDataInRange(0.0f, 10.0f, 6)), TestInputDef({3}, false, GetFloatDataInRange(0.0f, 1.0f, 3)), // Dynamic {utils::MakeAttribute("axis", static_cast(-1))}, // Last axis diff --git a/onnxruntime/test/providers/qnn/lrn_op_test.cc b/onnxruntime/test/providers/qnn/lrn_op_test.cc index 751db5049f6b9..a99cba66bf167 100644 --- a/onnxruntime/test/providers/qnn/lrn_op_test.cc +++ b/onnxruntime/test/providers/qnn/lrn_op_test.cc @@ -135,8 +135,8 @@ TEST_F(QnnHTPBackendTests, LRNSize3) { 0.75f, // beta 1.0f, // bias 13, // opset - // Need to use tolerance of 0.405% of output range after QNN SDK 2.17 - QDQTolerance(0.00405f)); + // Need to use tolerance of 0.8% of output range after QNN SDK 2.22 + QDQTolerance(0.008f)); } TEST_F(QnnHTPBackendTests, LRNSize5) { @@ -147,8 +147,8 @@ TEST_F(QnnHTPBackendTests, LRNSize5) { 0.75f, // beta 1.0f, // bias 13, // opset - // Need to use tolerance of 0.407% of output range after QNN SDK 2.17 - QDQTolerance(0.00407f)); + // Need to use tolerance of 0.8% of output range after QNN SDK 2.22 + QDQTolerance(0.008f)); } TEST_F(QnnHTPBackendTests, LRN_size_larger_than_channel) { diff --git a/onnxruntime/test/providers/qnn/matmul_test.cpp b/onnxruntime/test/providers/qnn/matmul_test.cpp index f26af7c79fdd9..dba60b1041696 100644 --- a/onnxruntime/test/providers/qnn/matmul_test.cpp +++ b/onnxruntime/test/providers/qnn/matmul_test.cpp @@ -103,7 +103,8 @@ static void RunQDQMatMulOpOpTest(const TestInputDef& input1_def, // CPU tests: // -TEST_F(QnnCPUBackendTests, MatMulOp) { +// TODO: Crashes during QNN CPU execution (QNN SDK 2.22) +TEST_F(QnnCPUBackendTests, DISABLED_MatMulOp) { RunMatMulOpOpTest(TestInputDef({2, 3}, false, {-10.0f, -4.0f, -2.0f, 0.0f, 5.0f, 10.0f}), TestInputDef({3, 2}, false, {-10.0f, -6.0f, -1.0f, 0.0f, 3.0f, 10.0f}), ExpectedEPNodeAssignment::All, 18); @@ -126,13 +127,8 @@ TEST_F(QnnCPUBackendTests, DISABLED_MatMulOp_Broadcast) { ExpectedEPNodeAssignment::All, 18, 0.0004f); } -#if defined(__linux__) +// TODO: Crashes during QNN CPU execution (QNN SDK 2.22) TEST_F(QnnCPUBackendTests, DISABLED_MatMulOp_PaddingAndBroadcast_BLargerThanA) { -#else -// TODO: When fixed, enable MathOpTest.MatMulFloatType from 
cpu/mat/matmul_test.cc -// QNN SDK 2.17: Accuracy errors -TEST_F(QnnCPUBackendTests, MatMulOp_PaddingAndBroadcast_BLargerThanA) { -#endif std::vector input0_shape = {2, 3, 2}; std::vector input1_shape = {3, 2, 2, 1}; RunMatMulOpOpTest(TestInputDef(input0_shape, false, GetSequentialFloatData(input0_shape)), diff --git a/tools/ci_build/github/azure-pipelines/android-arm64-v8a-QNN-crosscompile-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/android-arm64-v8a-QNN-crosscompile-ci-pipeline.yml index f488398293b7f..1703490992fb4 100644 --- a/tools/ci_build/github/azure-pipelines/android-arm64-v8a-QNN-crosscompile-ci-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/android-arm64-v8a-QNN-crosscompile-ci-pipeline.yml @@ -31,7 +31,7 @@ parameters: - name: QnnSdk displayName: QNN SDK version type: string - default: 2.21.0.240401 + default: 2.22.0.240425 jobs: - job: Build_QNN_EP diff --git a/tools/ci_build/github/azure-pipelines/c-api-noopenmp-packaging-pipelines.yml b/tools/ci_build/github/azure-pipelines/c-api-noopenmp-packaging-pipelines.yml index 3dce851d0e2cd..1dd0b3a5b2b97 100644 --- a/tools/ci_build/github/azure-pipelines/c-api-noopenmp-packaging-pipelines.yml +++ b/tools/ci_build/github/azure-pipelines/c-api-noopenmp-packaging-pipelines.yml @@ -71,7 +71,7 @@ parameters: - name: QnnSdk displayName: QNN SDK Version type: string - default: 2.21.0.240401 + default: 2.22.0.240425 resources: repositories: @@ -743,4 +743,4 @@ stages: displayName: 'Publish Pipeline NuGet Artifact' inputs: artifactName: 'drop-signed-nuget-qnn' - targetPath: '$(Build.ArtifactStagingDirectory)/nuget-artifact-merged' \ No newline at end of file + targetPath: '$(Build.ArtifactStagingDirectory)/nuget-artifact-merged' diff --git a/tools/ci_build/github/azure-pipelines/linux-qnn-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/linux-qnn-ci-pipeline.yml index 5fb3107ce5de7..a1339652a9495 100644 --- a/tools/ci_build/github/azure-pipelines/linux-qnn-ci-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/linux-qnn-ci-pipeline.yml @@ -32,11 +32,11 @@ parameters: - name: QnnSdk displayName: QNN SDK version type: string - default: 2.21.0.240401 + default: 2.22.0.240425 jobs: - job: Build_QNN_EP - pool: onnxruntime-qnn-ubuntu-2004-cpu + pool: onnxruntime-qnn-ubuntu-2204-cpu timeoutInMinutes: 60 workspace: clean: all diff --git a/tools/ci_build/github/azure-pipelines/py-packaging-pipeline.yml b/tools/ci_build/github/azure-pipelines/py-packaging-pipeline.yml index 1273194753ce2..c1fde9eff69b0 100644 --- a/tools/ci_build/github/azure-pipelines/py-packaging-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/py-packaging-pipeline.yml @@ -59,7 +59,7 @@ parameters: - name: qnn_sdk_version type: string displayName: 'QNN SDK version. Only for QNN packages.' 
- default: 2.21.0.240401 + default: 2.22.0.240425 trigger: none diff --git a/tools/ci_build/github/azure-pipelines/qnn-ep-nuget-packaging-pipeline.yml b/tools/ci_build/github/azure-pipelines/qnn-ep-nuget-packaging-pipeline.yml index 22169ea5463f5..e27a3bcda16c3 100644 --- a/tools/ci_build/github/azure-pipelines/qnn-ep-nuget-packaging-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/qnn-ep-nuget-packaging-pipeline.yml @@ -2,7 +2,7 @@ parameters: - name: QnnSdk displayName: QNN SDK Version type: string - default: 2.21.0.240401 + default: 2.22.0.240425 - name: build_config displayName: Build Configuration diff --git a/tools/ci_build/github/azure-pipelines/templates/jobs/download_linux_qnn_sdk.yml b/tools/ci_build/github/azure-pipelines/templates/jobs/download_linux_qnn_sdk.yml index 232ba23c7bebb..236998407ad16 100644 --- a/tools/ci_build/github/azure-pipelines/templates/jobs/download_linux_qnn_sdk.yml +++ b/tools/ci_build/github/azure-pipelines/templates/jobs/download_linux_qnn_sdk.yml @@ -1,7 +1,7 @@ parameters: - name: QnnSDKVersion type: string - default: '2.21.0.240401' + default: '2.22.0.240425' steps: - script: | diff --git a/tools/ci_build/github/azure-pipelines/templates/jobs/download_win_qnn_sdk.yml b/tools/ci_build/github/azure-pipelines/templates/jobs/download_win_qnn_sdk.yml index c6db7bdb449e2..0f43dfc497dff 100644 --- a/tools/ci_build/github/azure-pipelines/templates/jobs/download_win_qnn_sdk.yml +++ b/tools/ci_build/github/azure-pipelines/templates/jobs/download_win_qnn_sdk.yml @@ -1,7 +1,7 @@ parameters: - name: QnnSDKVersion type: string - default: '2.21.0.240401' + default: '2.22.0.240425' steps: - powershell: | diff --git a/tools/ci_build/github/azure-pipelines/templates/py-packaging-stage.yml b/tools/ci_build/github/azure-pipelines/templates/py-packaging-stage.yml index 8ec1cff19e423..f2bd0e6f169e9 100644 --- a/tools/ci_build/github/azure-pipelines/templates/py-packaging-stage.yml +++ b/tools/ci_build/github/azure-pipelines/templates/py-packaging-stage.yml @@ -60,7 +60,7 @@ parameters: - name: qnn_sdk_version type: string displayName: 'QNN SDK version. Only for QNN packages.' 
- default: 2.21.0.240401 + default: 2.22.0.240425 stages: - ${{ if eq(parameters.enable_windows_cpu, true) }}: diff --git a/tools/ci_build/github/azure-pipelines/templates/py-win-arm64-qnn.yml b/tools/ci_build/github/azure-pipelines/templates/py-win-arm64-qnn.yml index 4a695e1f3c43d..32fdf4819bd88 100644 --- a/tools/ci_build/github/azure-pipelines/templates/py-win-arm64-qnn.yml +++ b/tools/ci_build/github/azure-pipelines/templates/py-win-arm64-qnn.yml @@ -7,7 +7,7 @@ parameters: - name: QNN_SDK displayName: QNN SDK Version type: string - default: 2.21.0.240401 + default: 2.22.0.240425 - name: PYTHON_VERSION type: string diff --git a/tools/ci_build/github/azure-pipelines/templates/py-win-x64-qnn.yml b/tools/ci_build/github/azure-pipelines/templates/py-win-x64-qnn.yml index dfebf17d95aa2..668e51c828dcd 100644 --- a/tools/ci_build/github/azure-pipelines/templates/py-win-x64-qnn.yml +++ b/tools/ci_build/github/azure-pipelines/templates/py-win-x64-qnn.yml @@ -7,7 +7,7 @@ parameters: - name: QNN_SDK displayName: QNN SDK Version type: string - default: 2.21.0.240401 + default: 2.22.0.240425 - name: ENV_SETUP_SCRIPT type: string diff --git a/tools/ci_build/github/azure-pipelines/templates/qnn-ep-win.yml b/tools/ci_build/github/azure-pipelines/templates/qnn-ep-win.yml index e30a3f5ba2d8d..f75bb89b9ad48 100644 --- a/tools/ci_build/github/azure-pipelines/templates/qnn-ep-win.yml +++ b/tools/ci_build/github/azure-pipelines/templates/qnn-ep-win.yml @@ -1,5 +1,5 @@ parameters: - QnnSdk: '2.21.0.240401' + QnnSdk: '2.22.0.240425' build_config: 'RelWithDebInfo' IsReleaseBuild: false DoEsrp: false diff --git a/tools/ci_build/github/azure-pipelines/win-qnn-arm64-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/win-qnn-arm64-ci-pipeline.yml index a32f2a8a27660..0053a4a64ee02 100644 --- a/tools/ci_build/github/azure-pipelines/win-qnn-arm64-ci-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/win-qnn-arm64-ci-pipeline.yml @@ -32,7 +32,7 @@ parameters: - name: QnnSdk displayName: QNN SDK version type: string - default: 2.21.0.240401 + default: 2.22.0.240425 jobs: - job: 'build' diff --git a/tools/ci_build/github/azure-pipelines/win-qnn-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/win-qnn-ci-pipeline.yml index 165c01767964f..ede7b3d336768 100644 --- a/tools/ci_build/github/azure-pipelines/win-qnn-ci-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/win-qnn-ci-pipeline.yml @@ -32,7 +32,7 @@ parameters: - name: QnnSdk displayName: QNN SDK version type: string - default: 2.21.0.240401 + default: 2.22.0.240425 jobs: - job: 'build' @@ -90,12 +90,14 @@ jobs: workingDirectory: '$(Build.BinariesDirectory)\$(BuildConfig)\$(BuildConfig)' displayName: 'Run unit tests' - - script: | - .\$(BuildConfig)\onnx_test_runner -j 1 -v -e qnn -i "backend_path|$(QnnSDKRootDir)\lib\x86_64-windows-msvc\QnnCpu.dll" $(Build.SourcesDirectory)\cmake\external\onnx\onnx\backend\test\data\node - workingDirectory: '$(Build.BinariesDirectory)\$(BuildConfig)' - displayName: 'Run ONNX Tests' - - - script: | - .\$(BuildConfig)\onnx_test_runner -j 1 -v -e qnn -i "backend_path|$(QnnSDKRootDir)\lib\x86_64-windows-msvc\QnnCpu.dll" C:\data\float32_models - workingDirectory: '$(Build.BinariesDirectory)\$(BuildConfig)' - displayName: 'Run float32 model tests' + # Comment out QnnCpu tests because QNN SDK 2.22 CPU backend crashes when executing MatMuls. + # Does not happen with HTP backend. 
+ # - script: | + # .\$(BuildConfig)\onnx_test_runner -j 1 -v -e qnn -i "backend_path|$(QnnSDKRootDir)\lib\x86_64-windows-msvc\QnnCpu.dll" $(Build.SourcesDirectory)\cmake\external\onnx\onnx\backend\test\data\node + # workingDirectory: '$(Build.BinariesDirectory)\$(BuildConfig)' + # displayName: 'Run ONNX Tests' + # + # - script: | + # .\$(BuildConfig)\onnx_test_runner -j 1 -v -e qnn -i "backend_path|$(QnnSDKRootDir)\lib\x86_64-windows-msvc\QnnCpu.dll" C:\data\float32_models + # workingDirectory: '$(Build.BinariesDirectory)\$(BuildConfig)' + # displayName: 'Run float32 model tests' From eb2ec667166a4b4a202cd30ebdb5e147b2013350 Mon Sep 17 00:00:00 2001 From: Chester Liu <4710575+skyline75489@users.noreply.github.com> Date: Thu, 6 Jun 2024 11:19:09 +0800 Subject: [PATCH 21/26] Initialize device_id in cuda_call & rocm_call (#20933) ### Description Initialize `device_id` with `-1` in `cuda_call` and `rocm_call`. ### Motivation and Context From PyTorch code: https://github.com/pytorch/pytorch/blob/bb2de3b10120f91afce8da6233094076713f673d/c10/cuda/CUDAFunctions.cpp#L217-L324 If `cudaGetDevice` or `hipGetDevice` failed, an uninitialized `int` would produce a random number that changes during each run: ```text [with ERRTYPE = hipError_t; bool THRW = true; std::conditional_t = void] HIP failure 101: invalid device ordinal ; GPU=32741 ; hostname=e6724be2a31a ; file=/onnxruntime_src/onnxruntime/core/providers/rocm/rocm_common.h ; line=66 ; expr=hipGetDeviceProperties(&deviceProp, 0); ``` Notice the `GPU` value above. Using `-1` would clearly indicate such failure and avoid confusion. --- onnxruntime/core/providers/cuda/cuda_call.cc | 2 +- onnxruntime/core/providers/rocm/rocm_call.cc | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/onnxruntime/core/providers/cuda/cuda_call.cc b/onnxruntime/core/providers/cuda/cuda_call.cc index f60684795a4bc..c73b23f3762ed 100644 --- a/onnxruntime/core/providers/cuda/cuda_call.cc +++ b/onnxruntime/core/providers/cuda/cuda_call.cc @@ -103,7 +103,7 @@ std::conditional_t CudaCall( if (gethostname(hostname, HOST_NAME_MAX) != 0) strcpy(hostname, "?"); #endif - int currentCudaDevice; + int currentCudaDevice = -1; cudaGetDevice(¤tCudaDevice); cudaGetLastError(); // clear last CUDA error static char str[1024]; diff --git a/onnxruntime/core/providers/rocm/rocm_call.cc b/onnxruntime/core/providers/rocm/rocm_call.cc index 484e59f4de7d8..7974053c32497 100644 --- a/onnxruntime/core/providers/rocm/rocm_call.cc +++ b/onnxruntime/core/providers/rocm/rocm_call.cc @@ -104,7 +104,7 @@ std::conditional_t RocmCall( if (gethostname(hostname, HOST_NAME_MAX) != 0) strcpy(hostname, "?"); #endif - int currentHipDevice; + int currentHipDevice = -1; ORT_IGNORE_RETURN_VALUE(hipGetDevice(¤tHipDevice)); // void to silence nodiscard ORT_IGNORE_RETURN_VALUE(hipGetLastError()); // clear last ROCM error; void to silence nodiscard static char str[1024]; From 3ecf48e3b5ea63a0a7a24e13fc5da98edd5b0b68 Mon Sep 17 00:00:00 2001 From: Scott McKay Date: Thu, 6 Jun 2024 15:21:34 +1000 Subject: [PATCH 22/26] Add support for Trilu. (#20917) ### Description Trilu is used by phi-3 when exported with torch.onnx.export. 
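For context, Trilu returns the upper or lower triangular part of its input: the `upper` attribute picks the half to keep and the optional `k` input shifts which diagonal is retained. This change additionally registers a bool kernel for the CPU provider. A quick numpy illustration of the expected semantics (illustrative only, not the ORT test harness; numpy's `triu`/`tril` behave the same way and match the new `two_by_two_bool_upper` test below):

```python
import numpy as np

x = np.ones((2, 2), dtype=bool)

# upper=1, k=0: keep the main diagonal and everything above it.
print(np.triu(x, k=0))  # [[ True  True]
                        #  [False  True]]

# upper=0, k=0: keep the main diagonal and everything below it.
print(np.tril(x, k=0))  # [[ True False]
                        #  [ True  True]]
```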
### Motivation and Context --- docs/OperatorKernels.md | 2 +- .../core/providers/cpu/tensor/trilu.cc | 5 +- .../providers/cpu/tensor/trilu_op_test.cc | 425 +++++------------- 3 files changed, 118 insertions(+), 314 deletions(-) diff --git a/docs/OperatorKernels.md b/docs/OperatorKernels.md index 8092c26da651a..67bfe48327e14 100644 --- a/docs/OperatorKernels.md +++ b/docs/OperatorKernels.md @@ -421,7 +421,7 @@ Do not modify directly.* |Transpose|*in* data:**T**
*out* transposed:**T**|21+|**T** = tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(float8e4m3fn), tensor(float8e4m3fnuz), tensor(float8e5m2), tensor(float8e5m2fnuz), tensor(int16), tensor(int32), tensor(int4), tensor(int64), tensor(int8), tensor(string), tensor(uint16), tensor(uint32), tensor(uint4), tensor(uint64), tensor(uint8)| |||[13, 20]|**T** = tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(string), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| |||[1, 12]|**T** = tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(string), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| -|Trilu|*in* input:**T**<br/> *in* k:**tensor(int64)**<br/> *out* output:**T**|14+|**T** = tensor(double), tensor(float), tensor(int64)| +|Trilu|*in* input:**T**<br/> *in* k:**tensor(int64)**<br/> *out* output:**T**|14+|**T** = tensor(bool), tensor(double), tensor(float), tensor(int64)| |Unique|*in* X:**T**<br/> *out* Y:**T**<br/> *out* indices:**tensor(int64)**<br/> *out* inverse_indices:**tensor(int64)**<br/> *out* counts:**tensor(int64)**|11+|**T** = tensor(double), tensor(float), tensor(int64), tensor(int8), tensor(string)| |Unsqueeze|*in* data:**T**<br/> *in* axes:**tensor(int64)**<br/> *out* expanded:**T**<br/><br/>or<br/><br/>*in* data:**T**<br/>
*out* expanded:**T**|21+|**T** = tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(string), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| |||[13, 20]|**T** = tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(string), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| diff --git a/onnxruntime/core/providers/cpu/tensor/trilu.cc b/onnxruntime/core/providers/cpu/tensor/trilu.cc index 91e429ef60d91..017bbcd44904e 100644 --- a/onnxruntime/core/providers/cpu/tensor/trilu.cc +++ b/onnxruntime/core/providers/cpu/tensor/trilu.cc @@ -31,7 +31,7 @@ ONNX_OPERATOR_KERNEL_EX( kOnnxDomain, 14, kCpuExecutionProvider, - KernelDefBuilder().MayInplace(0, 0).TypeConstraint("T", BuildKernelDefConstraints()), + KernelDefBuilder().MayInplace(0, 0).TypeConstraint("T", BuildKernelDefConstraints()), Trilu); template @@ -110,6 +110,9 @@ Status Trilu::Compute(OpKernelContext* ctx) const { case sizeof(double): status = TriluImpl(X, Y, k_val, up); break; + case sizeof(bool): + status = TriluImpl(X, Y, k_val, up); + break; default: ORT_THROW("Unsupported input data type of ", data_type); } diff --git a/onnxruntime/test/providers/cpu/tensor/trilu_op_test.cc b/onnxruntime/test/providers/cpu/tensor/trilu_op_test.cc index f0b5d6afa9c7b..f1d1d94343e6f 100644 --- a/onnxruntime/test/providers/cpu/tensor/trilu_op_test.cc +++ b/onnxruntime/test/providers/cpu/tensor/trilu_op_test.cc @@ -62,63 +62,54 @@ TEST(TriluOpTest, two_by_two_long_lower) { test.Run(); } +TEST(TriluOpTest, two_by_two_bool_upper) { + OpTester test("Trilu", 14, kOnnxDomain); + int64_t up = 1; + test.AddAttribute("upper", up); + test.AddInput("X", {2, 2}, + {true, true, + true, true}); + test.AddOutput("Y", {2, 2}, + {true, true, + false, true}); + test.Run(); +} + +TEST(TriluOpTest, three_by_three_bool_lower) { + OpTester test("Trilu", 14, kOnnxDomain); + int64_t up = 0; + test.AddAttribute("upper", up); + test.AddInput("X", {3, 3}, + // include a couple of false values to check they are copied + {true, true, true, + true, false, true, + true, true, false}); + test.AddOutput("Y", {3, 3}, + {true, false, false, + true, false, false, + true, true, false}); + test.Run(); +} + TEST(TriluOpTest, three_dim_float_upper) { OpTester test("Trilu", 14, kOnnxDomain); test.AddInput("X", {2, 3, 4}, - { - 4.f, - 1.f, - 5.f, - 8.f, - 4.f, - 3.f, - 2.f, - 4.f, - 6.f, - 1.f, - 2.f, - 3.f, - 1.f, - 6.f, - 2.f, - 1.f, - 4.f, - 1.f, - 5.f, - 8.f, - 4.f, - 3.f, - 2.f, - 4.f, - }); + {4.f, 1.f, 5.f, 8.f, + 4.f, 3.f, 2.f, 4.f, + 6.f, 1.f, 2.f, 3.f, + + 1.f, 6.f, 2.f, 1.f, + 4.f, 1.f, 5.f, 8.f, + 4.f, 3.f, 2.f, 4.f}); test.AddInput("k", {1}, {1}); test.AddOutput("Y", {2, 3, 4}, - { - 0.f, - 1.f, - 5.f, - 8.f, - 0.f, - 0.f, - 2.f, - 4.f, - 0.f, - 0.f, - 0.f, - 3.f, - 0.f, - 6.f, - 2.f, - 1.f, - 0.f, - 0.f, - 5.f, - 8.f, - 0.f, - 0.f, - 0.f, - 4.f, - }); + {0.f, 1.f, 5.f, 8.f, + 0.f, 0.f, 2.f, 4.f, + 0.f, 0.f, 0.f, 3.f, + + 0.f, 6.f, 2.f, 1.f, + 0.f, 0.f, 5.f, 8.f, + 0.f, 0.f, 0.f, 4.f}); test.Run(); } @@ -127,60 +118,22 @@ TEST(TriluOpTest, three_dim_float_lower) { int64_t up = 0; test.AddAttribute("upper", up); test.AddInput("X", {2, 3, 4}, - { - 4.f, - 1.f, - 5.f, - 8.f, - 4.f, - 3.f, - 2.f, - 4.f, - 6.f, - 1.f, - 2.f, - 3.f, - 1.f, - 6.f, - 2.f, - 1.f, - 4.f, - 1.f, - 5.f, - 8.f, - 4.f, - 3.f, - 2.f, - 4.f, - }); + {4.f, 1.f, 5.f, 8.f, + 4.f, 3.f, 2.f, 4.f, + 6.f, 
1.f, 2.f, 3.f, + + 1.f, 6.f, 2.f, 1.f, + 4.f, 1.f, 5.f, 8.f, + 4.f, 3.f, 2.f, 4.f}); test.AddInput("k", {1}, {1}); test.AddOutput("Y", {2, 3, 4}, - { - 4.f, - 1.f, - 0.f, - 0.f, - 4.f, - 3.f, - 2.f, - 0.f, - 6.f, - 1.f, - 2.f, - 3.f, - 1.f, - 6.f, - 0.f, - 0.f, - 4.f, - 1.f, - 5.f, - 0.f, - 4.f, - 3.f, - 2.f, - 4.f, - }); + {4.f, 1.f, 0.f, 0.f, + 4.f, 3.f, 2.f, 0.f, + 6.f, 1.f, 2.f, 3.f, + + 1.f, 6.f, 0.f, 0.f, + 4.f, 1.f, 5.f, 0.f, + 4.f, 3.f, 2.f, 4.f}); test.Run(); } @@ -189,60 +142,22 @@ TEST(TriluOpTest, neg_k_float_upper) { int64_t up = 1; test.AddAttribute("upper", up); test.AddInput("X", {2, 3, 4}, - { - 4.f, - 1.f, - 5.f, - 8.f, - 4.f, - 3.f, - 2.f, - 4.f, - 6.f, - 1.f, - 2.f, - 3.f, - 1.f, - 6.f, - 2.f, - 1.f, - 4.f, - 1.f, - 5.f, - 8.f, - 4.f, - 3.f, - 2.f, - 4.f, - }); + {4.f, 1.f, 5.f, 8.f, + 4.f, 3.f, 2.f, 4.f, + 6.f, 1.f, 2.f, 3.f, + + 1.f, 6.f, 2.f, 1.f, + 4.f, 1.f, 5.f, 8.f, + 4.f, 3.f, 2.f, 4.f}); test.AddInput("k", {1}, {-1}); test.AddOutput("Y", {2, 3, 4}, - { - 4.f, - 1.f, - 5.f, - 8.f, - 4.f, - 3.f, - 2.f, - 4.f, - 0.f, - 1.f, - 2.f, - 3.f, - 1.f, - 6.f, - 2.f, - 1.f, - 4.f, - 1.f, - 5.f, - 8.f, - 0.f, - 3.f, - 2.f, - 4.f, - }); + {4.f, 1.f, 5.f, 8.f, + 4.f, 3.f, 2.f, 4.f, + 0.f, 1.f, 2.f, 3.f, + + 1.f, 6.f, 2.f, 1.f, + 4.f, 1.f, 5.f, 8.f, + 0.f, 3.f, 2.f, 4.f}); test.Run(); } @@ -251,120 +166,44 @@ TEST(TriluOpTest, neg_k_float_lower) { int64_t up = 0; test.AddAttribute("upper", up); test.AddInput("X", {2, 3, 4}, - { - 4.f, - 1.f, - 5.f, - 8.f, - 4.f, - 3.f, - 2.f, - 4.f, - 6.f, - 1.f, - 2.f, - 3.f, - 1.f, - 6.f, - 2.f, - 1.f, - 4.f, - 1.f, - 5.f, - 8.f, - 4.f, - 3.f, - 2.f, - 4.f, - }); + {4.f, 1.f, 5.f, 8.f, + 4.f, 3.f, 2.f, 4.f, + 6.f, 1.f, 2.f, 3.f, + + 1.f, 6.f, 2.f, 1.f, + 4.f, 1.f, 5.f, 8.f, + 4.f, 3.f, 2.f, 4.f}); test.AddInput("k", {1}, {-1}); test.AddOutput("Y", {2, 3, 4}, - { - 0.f, - 0.f, - 0.f, - 0.f, - 4.f, - 0.f, - 0.f, - 0.f, - 6.f, - 1.f, - 0.f, - 0.f, - 0.f, - 0.f, - 0.f, - 0.f, - 4.f, - 0.f, - 0.f, - 0.f, - 4.f, - 3.f, - 0.f, - 0.f, - }); + {0.f, 0.f, 0.f, 0.f, + 4.f, 0.f, 0.f, 0.f, + 6.f, 1.f, 0.f, 0.f, + + 0.f, 0.f, 0.f, 0.f, + 4.f, 0.f, 0.f, 0.f, + 4.f, 3.f, 0.f, 0.f}); test.Run(); } TEST(TriluTest, small_k_float_upper) { OpTester test("Trilu", 14, kOnnxDomain); test.AddInput("X", {2, 3, 4}, - { - 4.f, - 1.f, - 5.f, - 8.f, - 4.f, - 3.f, - 2.f, - 4.f, - 6.f, - 1.f, - 2.f, - 3.f, - 1.f, - 6.f, - 2.f, - 1.f, - 4.f, - 1.f, - 5.f, - 8.f, - 4.f, - 3.f, - 2.f, - 4.f, - }); + {4.f, 1.f, 5.f, 8.f, + 4.f, 3.f, 2.f, 4.f, + 6.f, 1.f, 2.f, 3.f, + + 1.f, 6.f, 2.f, 1.f, + 4.f, 1.f, 5.f, 8.f, + 4.f, 3.f, 2.f, 4.f}); test.AddInput("k", {1}, {-5}); test.AddOutput("Y", {2, 3, 4}, - { - 4.f, - 1.f, - 5.f, - 8.f, - 4.f, - 3.f, - 2.f, - 4.f, - 6.f, - 1.f, - 2.f, - 3.f, - 1.f, - 6.f, - 2.f, - 1.f, - 4.f, - 1.f, - 5.f, - 8.f, - 4.f, - 3.f, - 2.f, - 4.f, - }); + {4.f, 1.f, 5.f, 8.f, + 4.f, 3.f, 2.f, 4.f, + 6.f, 1.f, 2.f, 3.f, + + 1.f, 6.f, 2.f, 1.f, + 4.f, 1.f, 5.f, 8.f, + 4.f, 3.f, 2.f, 4.f}); test.Run(); } @@ -373,60 +212,22 @@ TEST(TriluOpTest, small_k_float_lower) { int64_t up = 0; test.AddAttribute("upper", up); test.AddInput("X", {2, 3, 4}, - { - 4.f, - 1.f, - 5.f, - 8.f, - 4.f, - 3.f, - 2.f, - 4.f, - 6.f, - 1.f, - 2.f, - 3.f, - 1.f, - 6.f, - 2.f, - 1.f, - 4.f, - 1.f, - 5.f, - 8.f, - 4.f, - 3.f, - 2.f, - 4.f, - }); + {4.f, 1.f, 5.f, 8.f, + 4.f, 3.f, 2.f, 4.f, + 6.f, 1.f, 2.f, 3.f, + + 1.f, 6.f, 2.f, 1.f, + 4.f, 1.f, 5.f, 8.f, + 4.f, 3.f, 2.f, 4.f}); test.AddInput("k", {1}, {-5}); test.AddOutput("Y", {2, 3, 4}, - { - 0.f, - 0.f, - 0.f, - 0.f, - 0.f, - 0.f, - 0.f, - 
0.f, - 0.f, - 0.f, - 0.f, - 0.f, - 0.f, - 0.f, - 0.f, - 0.f, - 0.f, - 0.f, - 0.f, - 0.f, - 0.f, - 0.f, - 0.f, - 0.f, - }); + {0.f, 0.f, 0.f, 0.f, + 0.f, 0.f, 0.f, 0.f, + 0.f, 0.f, 0.f, 0.f, + + 0.f, 0.f, 0.f, 0.f, + 0.f, 0.f, 0.f, 0.f, + 0.f, 0.f, 0.f, 0.f}); test.Run(); } From 5b87544aab7fecd2801f7858ea227fab35162e4d Mon Sep 17 00:00:00 2001 From: Chester Liu <4710575+skyline75489@users.noreply.github.com> Date: Thu, 6 Jun 2024 17:10:14 +0800 Subject: [PATCH 23/26] Add conditional check in Get/Set current GPU device id (#20932) ### Description Add conditional check in Get/Set current GPU device id ### Motivation and Context Currently with ROCm build, calling `GetCurrentGpuDeviceId` will still try to find CUDA libraries and log the following error message: ```text [E:onnxruntime:, provider_bridge_ort.cc:1836 TryGetProviderInfo_CUDA] /onnxruntime_src/onnxruntime/core/session/provider_bridge_ort.cc:1511 onnxruntime::Provider& onnxruntime::ProviderLibrary::Get() [ONNXRuntimeError] : 1 : FAIL : Failed to load library libonnxruntime_providers_cuda.so with error: libonnxruntime_providers_cuda.so: cannot open shared object file: No such file or directory ``` This is unnecessary and confusing. --- .../core/session/provider_bridge_ort.cc | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/onnxruntime/core/session/provider_bridge_ort.cc b/onnxruntime/core/session/provider_bridge_ort.cc index d18b3ac40d489..7f7ed5e436afe 100644 --- a/onnxruntime/core/session/provider_bridge_ort.cc +++ b/onnxruntime/core/session/provider_bridge_ort.cc @@ -2099,22 +2099,36 @@ ORT_API_STATUS_IMPL(OrtSessionOptionsAppendExecutionProvider_CUDA, _In_ OrtSessi return OrtApis::SessionOptionsAppendExecutionProvider_CUDA(options, &provider_options); } -ORT_API_STATUS_IMPL(OrtApis::SetCurrentGpuDeviceId, _In_ int device_id) { +ORT_API_STATUS_IMPL(OrtApis::SetCurrentGpuDeviceId, [[maybe_unused]] _In_ int device_id) { API_IMPL_BEGIN + +#ifdef USE_CUDA if (auto* info = onnxruntime::TryGetProviderInfo_CUDA()) return info->SetCurrentGpuDeviceId(device_id); +#endif + +#ifdef USE_ROCM if (auto* info = onnxruntime::TryGetProviderInfo_ROCM()) return info->SetCurrentGpuDeviceId(device_id); +#endif + return CreateStatus(ORT_FAIL, "CUDA and/or ROCM execution provider is either not enabled or not available."); API_IMPL_END } -ORT_API_STATUS_IMPL(OrtApis::GetCurrentGpuDeviceId, _In_ int* device_id) { +ORT_API_STATUS_IMPL(OrtApis::GetCurrentGpuDeviceId, [[maybe_unused]] _In_ int* device_id) { API_IMPL_BEGIN + +#ifdef USE_CUDA if (auto* info = onnxruntime::TryGetProviderInfo_CUDA()) return info->GetCurrentGpuDeviceId(device_id); +#endif + +#ifdef USE_ROCM if (auto* info = onnxruntime::TryGetProviderInfo_ROCM()) return info->GetCurrentGpuDeviceId(device_id); +#endif + return CreateStatus(ORT_FAIL, "CUDA and/or ROCM execution provider is either not enabled or not available."); API_IMPL_END } From c749bd997a02c7b49cbdb9569f0286041d19db08 Mon Sep 17 00:00:00 2001 From: Guenther Schmuelling Date: Thu, 6 Jun 2024 08:21:33 -0700 Subject: [PATCH 24/26] webgpu quickgelu (#20939) --- js/web/docs/webgpu-operators.md | 1 + .../lib/wasm/jsep/webgpu/op-resolve-rules.ts | 1 + js/web/lib/wasm/jsep/webgpu/ops/unary-op.ts | 28 +++++++++++ js/web/test/data/ops/quick-gelu.jsonc | 46 +++++++++++++++++++ .../contrib_ops/js/js_contrib_kernels.cc | 2 + onnxruntime/contrib_ops/js/quick_gelu.cc | 23 ++++++++++ onnxruntime/contrib_ops/js/quick_gelu.h | 24 ++++++++++ 7 files changed, 125 insertions(+) create mode 100644 
js/web/test/data/ops/quick-gelu.jsonc create mode 100644 onnxruntime/contrib_ops/js/quick_gelu.cc create mode 100644 onnxruntime/contrib_ops/js/quick_gelu.h diff --git a/js/web/docs/webgpu-operators.md b/js/web/docs/webgpu-operators.md index 3af4942c2e4aa..919b005ec4c21 100644 --- a/js/web/docs/webgpu-operators.md +++ b/js/web/docs/webgpu-operators.md @@ -74,6 +74,7 @@ Do not modify directly.* | Not | ai.onnx(1+) | | | Pad | ai.onnx(2-10,11-12,13-17,18,19+) | | | Pow | ai.onnx(7-11,12,13-14,15+) | | +| QuickGelu | com.microsoft(1+) | | | Range | ai.onnx(11+) | | | Reciprocal | ai.onnx(6-12,13+) | | | ReduceL1 | ai.onnx(1-10,11-12,13-17,18+) | | diff --git a/js/web/lib/wasm/jsep/webgpu/op-resolve-rules.ts b/js/web/lib/wasm/jsep/webgpu/op-resolve-rules.ts index 2d2f345d0c273..ce5b4455fde60 100644 --- a/js/web/lib/wasm/jsep/webgpu/op-resolve-rules.ts +++ b/js/web/lib/wasm/jsep/webgpu/op-resolve-rules.ts @@ -107,6 +107,7 @@ export const WEBGPU_OP_RESOLVE_RULES: Map = new ['Not', [unaryOps.not]], ['Pad', [pad]], ['Pow', [binaryOps.pow]], + ['QuickGelu', [unaryOps.quickgelu, unaryOps.parseAlphaAttributes]], ['Range', [range]], ['Reciprocal', [unaryOps.reciprocal]], ['ReduceMin', [reduceMin]], diff --git a/js/web/lib/wasm/jsep/webgpu/ops/unary-op.ts b/js/web/lib/wasm/jsep/webgpu/ops/unary-op.ts index 5f105c745739e..12ba2a10cdf9f 100644 --- a/js/web/lib/wasm/jsep/webgpu/ops/unary-op.ts +++ b/js/web/lib/wasm/jsep/webgpu/ops/unary-op.ts @@ -314,3 +314,31 @@ export const thresholdedRelu = (context: ComputeContext, attributes: AlphaAttrib export const log = (context: ComputeContext): void => { context.compute(createElementwiseProgramInfo(context.inputs[0], 'Log', 'log')); }; + +export const quickGeluImpl = (varType: string, alpha: number) => ` +const alpha = vec4<${varType}>(${alpha}); +const one = ${varType}(1.0); +const zero = ${varType}(0.0); + +fn quick_gelu_impl(x: vec4<${varType}>) -> vec4<${varType}> { + let v = x *alpha; + var x1 : vec4<${varType}>; + for (var i = 0; i < 4; i = i + 1) { + if (v[i] >= zero) { + x1[i] = one / (one + exp(-v[i])); + } else { + x1[i] = one - one / (one + exp(v[i])); + } + } + return x * x1; +} +`; + +export const quickGeluExpression = (x: string) => `quick_gelu_impl(${x})`; + +export const quickgelu = (context: ComputeContext, attributes: AlphaAttributes): void => { + const dType = tensorTypeToWsglValueType(context.inputs[0].dataType); + context.compute(createElementwiseProgramInfo( + context.inputs[0], 'QuickGelu', quickGeluExpression, quickGeluImpl(dType, attributes.alpha), attributes.cacheKey, + context.inputs[0].dataType)); +}; diff --git a/js/web/test/data/ops/quick-gelu.jsonc b/js/web/test/data/ops/quick-gelu.jsonc new file mode 100644 index 0000000000000..a6e618fe34796 --- /dev/null +++ b/js/web/test/data/ops/quick-gelu.jsonc @@ -0,0 +1,46 @@ +[ + { + "name": "QuickGelu test", + "operator": "QuickGelu", + "opset": { "domain": "com.microsoft", "version": 1 }, + "cases": [ + { + "name": "[2x4]", + "inputs": [ + { + "data": [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, -0.8], + "dims": [2, 4], + "type": "float32" + } + ], + "outputs": [ + { + "data": [0.0542447, 0.116857, 0.187484, 0.265566, 0.350388, 0.441123, 0.53689, 0.636815], + "dims": [2, 4], + "type": "float32" + } + ] + }, + { + "name": "[3x5]", + "inputs": [ + { + "data": [0.1, 0.2, 0.3, 0.4, 0.5, 1, 2, 3, 4, 5, 1.1, 1.2, 1.3, 1.4, -1.5], + "dims": [3, 5], + "type": "float32" + } + ], + "outputs": [ + { + "data": [ + 0.0542447, 0.116857, 0.187484, 0.265566, 0.350388, 0.845795, 1.9356, 2.98192, 3.99558, 4.99899, 
0.953383, + 1.0622, 1.17178, 1.2817, 1.39166 + ], + "dims": [3, 5], + "type": "float32" + } + ] + } + ] + } +] diff --git a/onnxruntime/contrib_ops/js/js_contrib_kernels.cc b/onnxruntime/contrib_ops/js/js_contrib_kernels.cc index 9d8f79c67d8a4..7bc3414c89978 100644 --- a/onnxruntime/contrib_ops/js/js_contrib_kernels.cc +++ b/onnxruntime/contrib_ops/js/js_contrib_kernels.cc @@ -16,6 +16,7 @@ class ONNX_OPERATOR_KERNEL_CLASS_NAME(kJsExecutionProvider, kMSDomain, 1, Gelu); class ONNX_OPERATOR_KERNEL_CLASS_NAME(kJsExecutionProvider, kMSDomain, 1, GroupQueryAttention); class ONNX_OPERATOR_KERNEL_CLASS_NAME(kJsExecutionProvider, kMSDomain, 1, MatMulNBits); class ONNX_OPERATOR_KERNEL_CLASS_NAME(kJsExecutionProvider, kMSDomain, 1, MultiHeadAttention); +class ONNX_OPERATOR_KERNEL_CLASS_NAME(kJsExecutionProvider, kMSDomain, 1, QuickGelu); class ONNX_OPERATOR_KERNEL_CLASS_NAME(kJsExecutionProvider, kMSDomain, 1, RotaryEmbedding); class ONNX_OPERATOR_KERNEL_CLASS_NAME(kJsExecutionProvider, kMSDomain, 1, SkipLayerNormalization); class ONNX_OPERATOR_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 1, SimplifiedLayerNormalization); @@ -38,6 +39,7 @@ Status RegisterJsContribKernels(KernelRegistry& kernel_registry) { BuildKernelCreateInfo, BuildKernelCreateInfo, BuildKernelCreateInfo, + BuildKernelCreateInfo, BuildKernelCreateInfo, BuildKernelCreateInfo, diff --git a/onnxruntime/contrib_ops/js/quick_gelu.cc b/onnxruntime/contrib_ops/js/quick_gelu.cc new file mode 100644 index 0000000000000..4bb4d5afd4109 --- /dev/null +++ b/onnxruntime/contrib_ops/js/quick_gelu.cc @@ -0,0 +1,23 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#include "quick_gelu.h" + +namespace onnxruntime { +namespace contrib { +namespace js { + +using onnxruntime::js::JsepSupportedFloatTypes; + +ONNX_OPERATOR_KERNEL_EX( + QuickGelu, + kMSDomain, + 1, + kJsExecutionProvider, + (*KernelDefBuilder::Create()) + .TypeConstraint("T", JsepSupportedFloatTypes()), + QuickGelu); + +} // namespace js +} // namespace contrib +} // namespace onnxruntime diff --git a/onnxruntime/contrib_ops/js/quick_gelu.h b/onnxruntime/contrib_ops/js/quick_gelu.h new file mode 100644 index 0000000000000..51e39e2718d51 --- /dev/null +++ b/onnxruntime/contrib_ops/js/quick_gelu.h @@ -0,0 +1,24 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. 
+ +#pragma once + +#include "core/providers/js/js_kernel.h" + +namespace onnxruntime { +namespace contrib { +namespace js { + +using onnxruntime::js::JsKernel; + +class QuickGelu final : public JsKernel { + public: + explicit QuickGelu(const OpKernelInfo& info) : JsKernel(info) { + float alpha = info.GetAttrOrDefault("alpha", 1.0); + JSEP_INIT_KERNEL_ATTRIBUTE(QuickGelu, ({"alpha" : $1}), alpha); + } +}; + +} // namespace js +} // namespace contrib +} // namespace onnxruntime From da1f8f927484e3fb326bdc10eb2f5f8f028e07e2 Mon Sep 17 00:00:00 2001 From: Wanming Lin Date: Thu, 6 Jun 2024 23:22:18 +0800 Subject: [PATCH 25/26] [WebNN EP] TFLite backend only supports limit ranges for Clip (#20863) --- js/web/docs/webnn-operators.md | 2 +- .../webnn/builders/impl/clip_op_builder.cc | 26 ++++++++++++++++--- 2 files changed, 24 insertions(+), 4 deletions(-) diff --git a/js/web/docs/webnn-operators.md b/js/web/docs/webnn-operators.md index 1df40b71a00fa..19e1fcb8fd3af 100644 --- a/js/web/docs/webnn-operators.md +++ b/js/web/docs/webnn-operators.md @@ -19,7 +19,7 @@ operators and the supported opset domain/versions in **WebNN EP** by ONNX Runtim | BatchNormalization | ai.onnx(7-8, 9-13, 14, 15+) | batchNormalization | ✗ | ✓ | Only supports 'training_mode' value is 0, one output | | Cast | ai.onnx(7-8, 9-12, 13-18, 19-20, 21+) | cast | ✗ | ✓ | | | Ceil | ai.onnx(7-12, 13+) | ceil | ✓ | ✓ | | -| Clip | ai.onnx(7-10, 11, 12, 13+) | clamp | ✓ | ✓ | | +| Clip | ai.onnx(7-10, 11, 12, 13+) | clamp | ✓ | ✓ | WebNN CPU backend only supports 3 specific ranges: [0.0, infinity], [-1.0, 1.0], [0.0, 6.0] (Chromium issue: https://issues.chromium.org/issues/326156496) | | Concat | ai.onnx(7-10, 11-12, 13+) | concat | ✓ | ✓ | | | Conv | ai.onnx(7-10, 11+) | conv2d | ✓ | ✓ | Only supports 3-D or 4-D input and 'W' (weight). WebNN CPU requires the 'W' (weight) input to be a constant | | ConvTranspose | ai.onnx(7-10, 11+) | convTranspose2d | ✓ | ✗ | Only supports 3-D or 4-D input and 'W' (weight). | diff --git a/onnxruntime/core/providers/webnn/builders/impl/clip_op_builder.cc b/onnxruntime/core/providers/webnn/builders/impl/clip_op_builder.cc index 30848b666003d..e6403a4cd12dc 100644 --- a/onnxruntime/core/providers/webnn/builders/impl/clip_op_builder.cc +++ b/onnxruntime/core/providers/webnn/builders/impl/clip_op_builder.cc @@ -24,7 +24,7 @@ class ClipOpBuilder : public BaseOpBuilder { // Operator support related. private: bool IsOpSupportedImpl(const InitializedTensorSet& initializers, const Node& node, - const WebnnDeviceType /* device_type */, const logging::Logger& logger) const override; + const WebnnDeviceType device_type, const logging::Logger& logger) const override; bool HasSupportedInputsImpl(const Node& node, const WebnnDeviceType device_type, const logging::Logger& logger) const override; }; @@ -64,13 +64,33 @@ Status ClipOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, bool ClipOpBuilder::IsOpSupportedImpl(const InitializedTensorSet& initializers, const Node& node, - const WebnnDeviceType /* device_type */, + const WebnnDeviceType device_type, const logging::Logger& logger) const { // TODO: Update IsOpSupportedImpl to pass GraphViewer instead of InitializedTensorSet so the implementations // can ensure initializers are constant. See #19401 for details of how this update was made to the NNAPI EP. 
// GetClipMinMax(graph_viewer, node, minValue, maxValue, logger) float min, max; - return GetClipMinMax(initializers, node, min, max, logger); + if (GetClipMinMax(initializers, node, min, max, logger)) { + // WebNN CPU backend only supports 3 specific ranges: [0.0, infinity], [-1.0, 1.0], [0.0, 6.0]. + // TODO: Remove this workaround once the associated issue is resolved in Chromium: + // https://issues.chromium.org/issues/326156496. + if (device_type == WebnnDeviceType::CPU) { + if ((min == 0.0f && max == std::numeric_limits::infinity()) || + (min == -1.0f && max == 1.0f) || + (min == 0.0f && max == 6.0f)) { + return true; + } else { + LOGS(logger, VERBOSE) << "Clip min and max values (" + << min << ", " + << max << ") are not supported for WebNN CPU backend"; + return false; + } + } + + return true; + } else { + return false; + }; } bool ClipOpBuilder::HasSupportedInputsImpl(const Node& node, const WebnnDeviceType device_type, From 52874f628a14ce971470995fbe9c15512f40de5b Mon Sep 17 00:00:00 2001 From: Wanming Lin Date: Thu, 6 Jun 2024 23:22:41 +0800 Subject: [PATCH 26/26] [WebNN EP] Remove some constraints for CPU backend (#20900) Following constraints have been supported by WebNN TFLite backend: - Concat: supports up to 4 inputs - Matmul: supports broadcasting - Resize: supports nearest mode - Split: supports up to 4 outputs --- js/web/docs/webnn-operators.md | 6 +-- .../webnn/builders/impl/concat_op_builder.cc | 30 +------------- .../webnn/builders/impl/gemm_op_builder.cc | 41 +++---------------- .../webnn/builders/impl/resize_op_builder.cc | 20 +++------ .../webnn/builders/impl/split_op_builder.cc | 12 +----- .../providers/webnn/builders/model_builder.h | 2 +- 6 files changed, 20 insertions(+), 91 deletions(-) diff --git a/js/web/docs/webnn-operators.md b/js/web/docs/webnn-operators.md index 19e1fcb8fd3af..966c93a85ae2a 100644 --- a/js/web/docs/webnn-operators.md +++ b/js/web/docs/webnn-operators.md @@ -50,7 +50,7 @@ operators and the supported opset domain/versions in **WebNN EP** by ONNX Runtim | LessOrEqual | ai.onnx(12-15, 16+) | lesserOrEqual | ✗ | ✓ | | | Log | ai.onnx(7-12, 13+) | log | ✗ | ✓ | | | LpPool | ai.onnx(7-10, 11-17, 18+) | l2Pool2d | ✗ | ✓ | Only supports 4-D input, 2-D 'kernel_shape', 'p' value is 2 | -| MatMul | ai.onnx(7-8, 9-12, 13+) | matmul | ✓ | ✓ | WebNN CPU doesn't support broadcasting for MatMul | +| MatMul | ai.onnx(7-8, 9-12, 13+) | matmul | ✓ | ✓ | | | Max | ai.onnx(7, 8-11, 12, 13+) | max | ✓ | ✓ | | | MaxPool | ai.onnx(7, 8-9, 10, 11, 12+) | maxPool2d | ✓ | ✓ | Only supports 4-D input, 2-D 'kernel_shape', 'storage_order' != 1, one output | | Min | ai.onnx(7, 8-11, 12, 13+) | min | ✓ | ✓ | | @@ -73,7 +73,7 @@ operators and the supported opset domain/versions in **WebNN EP** by ONNX Runtim | ReduceSumSquare | ai.onnx(7-10, 11-12, 13-17, 18+) | reduceSumSquare | ✗ | ✓ | Input 'axes' if present should be a constant | | Relu | ai.onnx(7-12, 13, 14+) | relu | ✓ | ✓ | | | Reshape | ai.onnx(7-12, 13, 14-18, 19-20, 21+) | reshape | ✓ | ✓ | Input 'shape' should be a constant, 0 dimension value in 'shape' is not supported | -| Resize | ai.onnx(11-12, 13-17, 18, 19+) | resample2d | ✓ | ✓ | Only supports 4-D input, exclude_outside != 0, input 'scales' and 'sizes' if present must be a constant, WebNN CPU backend only supports 'linear' mode, WebNN GPU backend only supports 'linear' and 'nearest' modes | +| Resize | ai.onnx(11-12, 13-17, 18, 19+) | resample2d | ✓ | ✓ | Only supports 4-D input, exclude_outside != 0, input 'scales' and 'sizes' if present must be a 
constant, 'linear' and 'nearest' modes | | Shape | ai.onnx(7-12, 13-14, 15-18, 19-20, 21+) | slice | ✓ | ✓ | | | Sigmoid | ai.onnx(7-12, 13+) | sigmoid | ✓ | ✓ | | | Softplus | ai.onnx(7+) | softplus | ✗ | ✓ | | @@ -81,7 +81,7 @@ operators and the supported opset domain/versions in **WebNN EP** by ONNX Runtim | Sin | ai.onnx(7+) | sin | ✗ | ✓ | | | Slice | ai.onnx(7-9, 10, 11-12, 13+) | slice | ✓ | ✓ | Input 'starts', 'ends', 'axes', and 'steps' if present must be a constant, only supports 'steps' value 1 | | Softmax | ai.onnx(7-10, 11-12, 13+) | softmax | ✓ | ✓ | Only supports input rank >= 2 | -| Split | ai.onnx(7-10, 11-12, 13-17, 18+) | split | ✓ | ✓ | Input 'split' if present should be a constant, WebNN CPU backend only supports up to 4 outputs | +| Split | ai.onnx(7-10, 11-12, 13-17, 18+) | split | ✓ | ✓ | Input 'split' if present should be a constant | | Sqrt | ai.onnx(7-12, 13+) | sqrt | ✓ | ✓ | | | Squeeze | ai.onnx(7-10, 11-12, 13-20, 21+) | reshape | ✓ | ✓ | Input 'axes' if present should be a constant | | Sub | ai.onnx(7-12, 13, 14+) | sub | ✓ | ✓ | | diff --git a/onnxruntime/core/providers/webnn/builders/impl/concat_op_builder.cc b/onnxruntime/core/providers/webnn/builders/impl/concat_op_builder.cc index d3fa00e5fe32b..e4f98b09e03c5 100644 --- a/onnxruntime/core/providers/webnn/builders/impl/concat_op_builder.cc +++ b/onnxruntime/core/providers/webnn/builders/impl/concat_op_builder.cc @@ -36,40 +36,14 @@ Status ConcatOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, NodeAttrHelper helper(node); uint32_t axis = static_cast(HandleNegativeAxis(helper.Get("axis", 1), rank)); - const size_t num_inputs = input_defs.size(); std::vector inputs; for (const auto* input : input_defs) { LOGS(logger, VERBOSE) << "input name " << input->Name(); inputs.push_back(model_builder.GetOperand(input->Name())); } - emscripten::val output = emscripten::val::undefined(); - if (num_inputs <= 4 || model_builder.GetPreferredLayout() == DataLayout::NCHW) { - output = model_builder.GetBuilder().call("concat", emscripten::val::array(inputs), axis); - } else { - // WebNN XNNPack backend only supports the concat with inputs number <= 4, - // decomposing the Concat with inputs number > 4 into multiple WebNN concat ops. - size_t remaining_inputs = num_inputs; - size_t max_inputs = 4; - while (remaining_inputs > 0) { - std::vector chunk_inputs; - - // Push the last concated output to the next chunk_inputs. - if (output != emscripten::val::undefined()) { - chunk_inputs.push_back(output); - max_inputs = 3; - } - - size_t chunk_size = std::min(remaining_inputs, max_inputs); - - for (size_t i = 0; i < chunk_size; i++) { - chunk_inputs.push_back(inputs[num_inputs - remaining_inputs + i]); - } - - output = model_builder.GetBuilder().call("concat", emscripten::val::array(chunk_inputs), axis); - remaining_inputs -= chunk_size; - } - } + emscripten::val output = + model_builder.GetBuilder().call("concat", emscripten::val::array(inputs), axis); model_builder.AddOperand(node.OutputDefs()[0]->Name(), std::move(output)); return Status::OK(); diff --git a/onnxruntime/core/providers/webnn/builders/impl/gemm_op_builder.cc b/onnxruntime/core/providers/webnn/builders/impl/gemm_op_builder.cc index 248463f473b2e..53f885019ab2f 100644 --- a/onnxruntime/core/providers/webnn/builders/impl/gemm_op_builder.cc +++ b/onnxruntime/core/providers/webnn/builders/impl/gemm_op_builder.cc @@ -23,7 +23,7 @@ class GemmOpBuilder : public BaseOpBuilder { // Operator support related. 
private: - bool IsOpSupportedImpl(const InitializedTensorSet& initializers, const Node& node, + bool IsOpSupportedImpl(const InitializedTensorSet& /* initializers */, const Node& node, const WebnnDeviceType /* device_type */, const logging::Logger& logger) const override; bool HasSupportedInputsImpl(const Node& node, const WebnnDeviceType /* device_type */, const logging::Logger& logger) const override; @@ -64,13 +64,9 @@ Status GemmOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, const N b = model_builder.GetBuilder().call("reshape", b, emscripten::val::array(GetVecUint32FromVecInt64(b_shape))); } - // The inputs of MatMul must be at least 3D for WebNN CPU backend. Use GEMM for 2D case. - // TODO: Remove this workaround when it is fixed in Chromium. - if (model_builder.GetWebnnDeviceType() == WebnnDeviceType::CPU && a_shape.size() == 2) { - output = model_builder.GetBuilder().call("gemm", a, b); - } else { - output = model_builder.GetBuilder().call("matmul", a, b); - } + + output = model_builder.GetBuilder().call("matmul", a, b); + // If the inputs are both 1D, reduce the output to a scalar. if (extended_a_shape && extended_b_shape) { output = model_builder.GetBuilder().call("reshape", output, emscripten::val::array()); @@ -132,11 +128,10 @@ Status GemmOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, const N // Operator support related. -bool GemmOpBuilder::IsOpSupportedImpl(const InitializedTensorSet& initializers, +bool GemmOpBuilder::IsOpSupportedImpl(const InitializedTensorSet& /* initializers */, const Node& node, - const WebnnDeviceType device_type, + const WebnnDeviceType /* device_type */, const logging::Logger& logger) const { - (void)initializers; const auto& op_type = node.OpType(); const auto& input_defs(node.InputDefs()); const size_t a_idx = 0, b_idx = 1, c_idx = 2; // A*B+C @@ -194,30 +189,6 @@ bool GemmOpBuilder::IsOpSupportedImpl(const InitializedTensorSet& initializers, } } - if (op_type == "MatMul") { - // If the first argument is 1-D, it is promoted to a matrix by prepending a 1 to its dimensions. - // If the second argument is 1-D, it is promoted to a matrix by appending a 1 to its dimensions. - if (a_shape.size() == 1) a_shape.insert(a_shape.begin(), 1); - if (b_shape.size() == 1) b_shape.push_back(1); - - // WebNN CPU backend has two more constraints. - // https://source.chromium.org/chromium/chromium/src/+/main:third_party/blink/renderer/modules/ml/webnn/ml_graph_xnnpack.cc;l=1177 - // TODO: Remove this workaround when Chromium enables broadcast for MatMul on WebNN CPU backend. - if (device_type == WebnnDeviceType::CPU) { - if (a_shape.size() != b_shape.size()) { - LOGS(logger, VERBOSE) << "The rank of two inputs for WebNN CPU backend MatMul must be the same."; - return false; - } - - for (size_t i = 0; i < a_shape.size() - 2; i++) { - if (a_shape[i] != b_shape[i]) { - LOGS(logger, VERBOSE) << "WebNN CPU backend can't support broadcasting for MatMul."; - return false; - } - } - } - } - return true; } diff --git a/onnxruntime/core/providers/webnn/builders/impl/resize_op_builder.cc b/onnxruntime/core/providers/webnn/builders/impl/resize_op_builder.cc index ea54b70a66677..c4ca980fec715 100644 --- a/onnxruntime/core/providers/webnn/builders/impl/resize_op_builder.cc +++ b/onnxruntime/core/providers/webnn/builders/impl/resize_op_builder.cc @@ -30,7 +30,7 @@ class ResizeOpBuilder : public BaseOpBuilder { // Operator support related. 
private: bool IsOpSupportedImpl(const InitializedTensorSet& initializers, const Node& node, - const WebnnDeviceType device_type, const logging::Logger& logger) const override; + const WebnnDeviceType /* device_type */, const logging::Logger& logger) const override; // Resize opset 10- is very different than Resize opset 11+, with many key attributes missing. // We only support Resize opset 11+ here. @@ -164,7 +164,7 @@ Status ResizeOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, bool ResizeOpBuilder::IsOpSupportedImpl(const InitializedTensorSet& initializers, const Node& node, - const WebnnDeviceType device_type, + const WebnnDeviceType /* device_type */, const logging::Logger& logger) const { const auto& input_defs = node.InputDefs(); @@ -184,18 +184,10 @@ bool ResizeOpBuilder::IsOpSupportedImpl(const InitializedTensorSet& initializers const auto mode = helper.Get("mode", "nearest"); bool is_linear_resize = mode == "linear"; bool is_nearest_resize = mode == "nearest"; - // WebNN CPU backend only supports "linear" mode. - // WebNN GPU backend only supports "linear" and "nearest" modes. - if (device_type == WebnnDeviceType::CPU) { - if (!is_linear_resize) { - LOGS(logger, VERBOSE) << "Resize unsupported input mode, " << mode << " for CPU backend."; - return false; - } - } else { - if (!is_linear_resize && !is_nearest_resize) { - LOGS(logger, VERBOSE) << "Resize unsupported input mode, " << mode << " for GPU backend."; - return false; - } + // WebNN only supports "linear" and "nearest" modes. + if (!is_linear_resize && !is_nearest_resize) { + LOGS(logger, VERBOSE) << "Resize does not support input mode: " << mode; + return false; } const auto exclude_outside = helper.Get("exclude_outside", 0); diff --git a/onnxruntime/core/providers/webnn/builders/impl/split_op_builder.cc b/onnxruntime/core/providers/webnn/builders/impl/split_op_builder.cc index c50b678bf2386..ea3b8ef384ddc 100644 --- a/onnxruntime/core/providers/webnn/builders/impl/split_op_builder.cc +++ b/onnxruntime/core/providers/webnn/builders/impl/split_op_builder.cc @@ -27,7 +27,7 @@ class SplitOpBuilder : public BaseOpBuilder { // Operator support related. private: bool IsOpSupportedImpl(const InitializedTensorSet& initializers, const Node& node, - const WebnnDeviceType device_type, const logging::Logger& logger) const override; + const WebnnDeviceType /* device_type */, const logging::Logger& logger) const override; }; // Add operator related. @@ -94,7 +94,7 @@ Status SplitOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, bool SplitOpBuilder::IsOpSupportedImpl(const InitializedTensorSet& initializers, const Node& node, - const WebnnDeviceType device_type, + const WebnnDeviceType /* device_type */, const logging::Logger& logger) const { const auto& input_defs = node.InputDefs(); std::vector input_shape; @@ -126,10 +126,6 @@ bool SplitOpBuilder::IsOpSupportedImpl(const InitializedTensorSet& initializers, LOGS(logger, VERBOSE) << "Cannot get split."; return false; } - if (split.size() > 4 && device_type == WebnnDeviceType::CPU) { - LOGS(logger, VERBOSE) << "WebNN CPU backend only supports up to 4 outputs."; - return false; - } } else { if (helper.HasAttr("num_outputs")) { // Split has 'num_outputs' attribute when opset is 18. 
@@ -138,10 +134,6 @@ bool SplitOpBuilder::IsOpSupportedImpl(const InitializedTensorSet& initializers, LOGS(logger, VERBOSE) << "The 'num_outputs' must be a positive integer."; return false; } - if (num_outputs > 4 && device_type == WebnnDeviceType::CPU) { - LOGS(logger, VERBOSE) << "WebNN CPU backend only supports up to 4 outputs."; - return false; - } } else { const auto opset = node.SinceVersion(); if (opset >= 18) { diff --git a/onnxruntime/core/providers/webnn/builders/model_builder.h b/onnxruntime/core/providers/webnn/builders/model_builder.h index 8c1848eb833c1..80077b3abe56d 100644 --- a/onnxruntime/core/providers/webnn/builders/model_builder.h +++ b/onnxruntime/core/providers/webnn/builders/model_builder.h @@ -53,7 +53,7 @@ class ModelBuilder { void AddInitializerToSkip(const std::string& tensor_name); // There are some input which will not be used, add it to a list which will not - // be added to CoreML model, since CoreML does not like input unused. + // be added to WebNN model, since WebNN does not like input unused. void AddInputToSkip(const std::string& input_name); std::string GetUniqueName(const std::string& base_name);