From 038c76378fdee45261d43af45466a0797e6ad124 Mon Sep 17 00:00:00 2001
From: Pranav Sharma <prs@microsoft.com>
Date: Thu, 21 Sep 2023 00:08:10 -0700
Subject: [PATCH 01/14] Include onnxruntime_float16.h in the package. (#17637)

### Description
Include onnxruntime_float16.h in the package.

### Motivation and Context
This header was missed in the recently released 1.16 packages (except NuGet).
---
 tools/ci_build/github/linux/copy_strip_binary.sh | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tools/ci_build/github/linux/copy_strip_binary.sh b/tools/ci_build/github/linux/copy_strip_binary.sh
index b875a3937aaa9..63690b69fc91a 100755
--- a/tools/ci_build/github/linux/copy_strip_binary.sh
+++ b/tools/ci_build/github/linux/copy_strip_binary.sh
@@ -48,6 +48,7 @@ fi
 cp $SOURCE_DIR/include/onnxruntime/core/session/onnxruntime_c_api.h  $BINARY_DIR/$ARTIFACT_NAME/include
 cp $SOURCE_DIR/include/onnxruntime/core/session/onnxruntime_cxx_api.h  $BINARY_DIR/$ARTIFACT_NAME/include
 cp $SOURCE_DIR/include/onnxruntime/core/session/onnxruntime_cxx_inline.h  $BINARY_DIR/$ARTIFACT_NAME/include
+cp $SOURCE_DIR/include/onnxruntime/core/session/onnxruntime_float16.h  $BINARY_DIR/$ARTIFACT_NAME/include
 cp $SOURCE_DIR/include/onnxruntime/core/providers/cpu/cpu_provider_factory.h  $BINARY_DIR/$ARTIFACT_NAME/include
 cp $SOURCE_DIR/include/onnxruntime/core/session/onnxruntime_session_options_config_keys.h  $BINARY_DIR/$ARTIFACT_NAME/include
 cp $SOURCE_DIR/include/onnxruntime/core/session/onnxruntime_run_options_config_keys.h  $BINARY_DIR/$ARTIFACT_NAME/include

From 57dfd15d7bc9d9c5779896f6685ec473875dc6e1 Mon Sep 17 00:00:00 2001
From: Changming Sun <chasun@microsoft.com>
Date: Thu, 21 Sep 2023 07:33:29 -0700
Subject: [PATCH 02/14] Remove dnf update from docker build scripts (#17551)

### Description
1. Remove 'dnf update' from docker build scripts, because it upgrades TRT
packages from CUDA 11.x to CUDA 12.x.
To reproduce it, you can run the following commands in a CentOS CUDA
11.x docker image such as nvidia/cuda:11.8.0-cudnn8-devel-ubi8.
```
export v=8.6.1.6-1.cuda11.8
dnf  install -y libnvinfer8-${v} libnvparsers8-${v} libnvonnxparsers8-${v} libnvinfer-plugin8-${v} libnvinfer-vc-plugin8-${v}        libnvinfer-devel-${v} libnvparsers-devel-${v} libnvonnxparsers-devel-${v} libnvinfer-plugin-devel-${v} libnvinfer-vc-plugin-devel-${v} libnvinfer-headers-devel-${v}  libnvinfer-headers-plugin-devel-${v}
dnf update -y
```
The last command produces the following output:
```
========================================================================================================================
 Package                                     Architecture       Version                          Repository        Size
========================================================================================================================
Upgrading:
 libnvinfer-devel                            x86_64             8.6.1.6-1.cuda12.0               cuda             542 M
 libnvinfer-headers-devel                    x86_64             8.6.1.6-1.cuda12.0               cuda             118 k
 libnvinfer-headers-plugin-devel             x86_64             8.6.1.6-1.cuda12.0               cuda              14 k
 libnvinfer-plugin-devel                     x86_64             8.6.1.6-1.cuda12.0               cuda              13 M
 libnvinfer-plugin8                          x86_64             8.6.1.6-1.cuda12.0               cuda              13 M
 libnvinfer-vc-plugin-devel                  x86_64             8.6.1.6-1.cuda12.0               cuda             107 k
 libnvinfer-vc-plugin8                       x86_64             8.6.1.6-1.cuda12.0               cuda             251 k
 libnvinfer8                                 x86_64             8.6.1.6-1.cuda12.0               cuda             543 M
 libnvonnxparsers-devel                      x86_64             8.6.1.6-1.cuda12.0               cuda             467 k
 libnvonnxparsers8                           x86_64             8.6.1.6-1.cuda12.0               cuda             757 k
 libnvparsers-devel                          x86_64             8.6.1.6-1.cuda12.0               cuda             2.0 M
 libnvparsers8                               x86_64             8.6.1.6-1.cuda12.0               cuda             854 k
Installing dependencies:
 cuda-toolkit-12-0-config-common             noarch             12.0.146-1                       cuda             7.7 k
 cuda-toolkit-12-config-common               noarch             12.2.140-1                       cuda             7.9 k
 libcublas-12-0                              x86_64             12.0.2.224-1                     cuda             361 M
 libcublas-devel-12-0                        x86_64             12.0.2.224-1                     cuda             397 M

Transaction Summary
========================================================================================================================

```
As the output shows, the upgraded packages are CUDA 12 packages.

The problem can also be solved by locking the packages' versions with the
"dnf versionlock" command right after installing the CUDA/TRT packages.
However, going forward, to get better reproducibility, I suggest
manually pinning dnf package versions in the installation scripts, as we
already do for TRT:

```bash
v="8.6.1.6-1.cuda11.8" &&\
    yum-config-manager --add-repo https://developer.download.nvidia.com/compute/cuda/repos/rhel8/x86_64/cuda-rhel8.repo &&\
    yum -y install libnvinfer8-${v} libnvparsers8-${v} libnvonnxparsers8-${v} libnvinfer-plugin8-${v} libnvinfer-vc-plugin8-${v}\
        libnvinfer-devel-${v} libnvparsers-devel-${v} libnvonnxparsers-devel-${v} libnvinfer-plugin-devel-${v} libnvinfer-vc-plugin-devel-${v} libnvinfer-headers-devel-${v}  libnvinfer-headers-plugin-devel-${v}
```
When we need to upgrade a package because of a security alert or for some
other reason, we manually change the version string instead of relying
on "dnf update". Although this approach takes more effort, it makes our
pipelines more stable.

2. Move the Python tests into Docker.

### Motivation and Context
Right now the nightly GPU package mixes CUDA 11.x and CUDA 12.x components,
and the resulting package is unusable (it crashes every time).
---
 .../azure-pipelines/linux-ci-pipeline.yml     |   7 +-
 .../py-package-test-pipeline.yml              |  37 +++---
 .../templates/c-api-linux-cpu.yml             |   2 +-
 .../templates/py-package-smoking-test.yml     |  28 ++---
 .../templates/py-packaging-linux-test-cpu.yml | 117 ++++++++++++++++++
 .../py-packaging-linux-test-cuda.yml          |  98 +++++++++++++++
 .../templates/py-packaging-linux-test.yml     |  85 -------------
 .../linux/docker/Dockerfile.manylinux2_28_cpu |   9 +-
 .../docker/Dockerfile.manylinux2_28_cuda11    |   5 +-
 ...kerfile.manylinux2_28_cuda11_6_tensorrt8_4 |   5 +-
 ...kerfile.manylinux2_28_cuda11_6_tensorrt8_5 |   5 +-
 ...kerfile.manylinux2_28_cuda11_8_tensorrt8_6 |   5 +-
 ...Dockerfile.manylinux2_28_training_cuda11_8 |   3 -
 ...erfile.package_ubuntu_cuda11_8_tensorrt8_6 |  20 +--
 .../default/cpu/scripts/install_centos.sh     |   7 +-
 .../default/cpu/scripts/install_deps.sh       |  24 ++--
 .../inference/x64/default/cpu/Dockerfile      |   4 +-
 .../x64/default/cpu/scripts/install_centos.sh |   8 +-
 .../inference/x64/default/gpu/Dockerfile      |   2 +
 .../x64/default/gpu/scripts/install_centos.sh |   8 +-
 .../python/cpu/Dockerfile.manylinux2_28_cpu   |   3 -
 .../x64/python/cpu/scripts/install_centos.sh  |   6 +-
 .../github/linux/docker/manylinux.patch       |   9 +-
 .../linux/docker/scripts/install_dotnet.sh    |  10 +-
 .../scripts/manylinux/install_centos.sh       |   9 +-
 .../docker/scripts/manylinux/install_deps.sh  |  26 ++--
 .../scripts/manylinux/install_deps_aten.sh    |   2 +-
 .../scripts/manylinux/install_deps_eager.sh   |   2 +-
 .../github/linux/run_python_dockertest.sh     |  29 +++++
 .../ci_build/github/linux/run_python_tests.sh |  20 ++-
 tools/scripts/python_test.sh                  |   0
 tools/scripts/symbolic_shape_infer_test.sh    |   0
 32 files changed, 351 insertions(+), 244 deletions(-)
 create mode 100644 tools/ci_build/github/azure-pipelines/templates/py-packaging-linux-test-cpu.yml
 create mode 100644 tools/ci_build/github/azure-pipelines/templates/py-packaging-linux-test-cuda.yml
 delete mode 100644 tools/ci_build/github/azure-pipelines/templates/py-packaging-linux-test.yml
 create mode 100755 tools/ci_build/github/linux/run_python_dockertest.sh
 mode change 100644 => 100755 tools/scripts/python_test.sh
 mode change 100644 => 100755 tools/scripts/symbolic_shape_infer_test.sh

diff --git a/tools/ci_build/github/azure-pipelines/linux-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/linux-ci-pipeline.yml
index 21bc1c481b3e6..33fc9d94bac09 100644
--- a/tools/ci_build/github/azure-pipelines/linux-ci-pipeline.yml
+++ b/tools/ci_build/github/azure-pipelines/linux-ci-pipeline.yml
@@ -200,8 +200,11 @@ stages:
 - stage: arm64_test
   dependsOn: ['arm64_build']
   jobs:
-  - template: templates/py-packaging-linux-test.yml
+  - template: templates/py-packaging-linux-test-cpu.yml
     parameters:
       arch: 'aarch64'
       machine_pool: 'onnxruntime-linux-ARM64-CPU-2019'
-      device: 'CPU'
+      base_image: 'arm64v8/almalinux:8'
+      devtoolset_rootpath: /opt/rh/gcc-toolset-12/root
+      ld_library_path_arg: /opt/rh/gcc-toolset-12/root/usr/lib64:/opt/rh/gcc-toolset-12/root/usr/lib:/opt/rh/gcc-toolset-12/root/usr/lib64/dyninst:/opt/rh/gcc-toolset-12/root/usr/lib/dyninst:/usr/local/lib64
+      prepend_path: '/opt/rh/gcc-toolset-12/root/usr/bin:'
diff --git a/tools/ci_build/github/azure-pipelines/py-package-test-pipeline.yml b/tools/ci_build/github/azure-pipelines/py-package-test-pipeline.yml
index c684e08ba1258..2161a9205f22d 100644
--- a/tools/ci_build/github/azure-pipelines/py-package-test-pipeline.yml
+++ b/tools/ci_build/github/azure-pipelines/py-package-test-pipeline.yml
@@ -3,24 +3,38 @@ resources:
   - pipeline: build
     source: 'Python packaging pipeline'
     trigger: true
+    branch: main # branch to pick the artifact, Used only for manual triggered pipeline runs for testing the pipeline itself
+  #TODO: Remove the following dependency. Running python tests should not need to use manylinux.
+  repositories:
+  - repository: manylinux # The name used to reference this repository in the checkout step
+    type: Github
+    endpoint: Microsoft
+    name: pypa/manylinux
+    ref: 5eda9aded5462201e6310105728d33016e637ea7
 
 stages:
 - stage: Linux_Test_CPU_x86_64_stage
   jobs:
-  - template: templates/py-packaging-linux-test.yml
+  - template: templates/py-packaging-linux-test-cpu.yml
     parameters:
       arch: 'x86_64'
       machine_pool: 'onnxruntime-Ubuntu2004-AMD-CPU'
-      device: 'CPU'
+      base_image: 'registry.access.redhat.com/ubi8/ubi'
+      devtoolset_rootpath: /opt/rh/gcc-toolset-12/root
+      ld_library_path_arg: /opt/rh/gcc-toolset-12/root/usr/lib64:/opt/rh/gcc-toolset-12/root/usr/lib:/opt/rh/gcc-toolset-12/root/usr/lib64/dyninst:/opt/rh/gcc-toolset-12/root/usr/lib/dyninst:/usr/local/lib64
+      prepend_path: '/opt/rh/gcc-toolset-12/root/usr/bin:'
 
 - stage: Linux_Test_CPU_aarch64_stage
   dependsOn: []
   jobs:
-  - template: templates/py-packaging-linux-test.yml
+  - template: templates/py-packaging-linux-test-cpu.yml
     parameters:
       arch: 'aarch64'
       machine_pool: 'aiinfra-linux-ARM64-CPU-2019'
-      device: 'CPU'
+      base_image: 'arm64v8/almalinux:8'
+      devtoolset_rootpath: /opt/rh/gcc-toolset-12/root
+      ld_library_path_arg: /opt/rh/gcc-toolset-12/root/usr/lib64:/opt/rh/gcc-toolset-12/root/usr/lib:/opt/rh/gcc-toolset-12/root/usr/lib64/dyninst:/opt/rh/gcc-toolset-12/root/usr/lib/dyninst:/usr/local/lib64
+      prepend_path: '/opt/rh/gcc-toolset-12/root/usr/bin:'
 
 - stage: Packages_Somking_Test
   dependsOn: []
@@ -31,19 +45,6 @@ stages:
         machine_pool:
           vmImage: 'macOS-13'
         itemPattern: '*/*mac*x86_64.whl'
-    - template: templates/py-package-smoking-test.yml
-      parameters:
-        job_name: Test_WIN_64_Wheels
-        itemPattern: '*/*win_amd64.whl'
-        machine_pool:
-          vmImage: 'windows-2022'
-    - template: templates/py-package-smoking-test.yml
-      parameters:
-        job_name: Test_WIN_32_Wheels
-        itemPattern: '*/*win32.whl'
-        python_arch: 'x86'
-        machine_pool:
-          vmImage: 'windows-2022'
     - template: templates/py-package-smoking-test.yml
       parameters:
         job_name: Test_LINUX_x86_64_Wheels
@@ -61,7 +62,7 @@ stages:
     - Linux_Test_CPU_aarch64_stage
     - Packages_Somking_Test
   jobs:
-  - template: templates/py-packaging-linux-test.yml
+  - template: templates/py-packaging-linux-test-cuda.yml
     parameters:
       arch: 'x86_64'
       machine_pool: 'Onnxruntime-Linux-GPU'
diff --git a/tools/ci_build/github/azure-pipelines/templates/c-api-linux-cpu.yml b/tools/ci_build/github/azure-pipelines/templates/c-api-linux-cpu.yml
index 796938dc22a67..15fcec0511741 100644
--- a/tools/ci_build/github/azure-pipelines/templates/c-api-linux-cpu.yml
+++ b/tools/ci_build/github/azure-pipelines/templates/c-api-linux-cpu.yml
@@ -68,7 +68,7 @@ jobs:
         script: |
           mkdir -p $HOME/.onnx
           docker run --rm -e CFLAGS="${{parameters.OnnxruntimeCFlags}}" -e CXXFLAGS="${{parameters.OnnxruntimeCXXFlags}}" --volume /data/onnx:/data/onnx:ro --volume $(Build.SourcesDirectory):/onnxruntime_src --volume $(Build.BinariesDirectory):/build \
-          --volume $HOME/.onnx:/home/onnxruntimedev/.onnx -e NIGHTLY_BUILD onnxruntimecpubuildcentos8${{parameters.OnnxruntimeArch}} /bin/bash -c "python3 \
+          --volume $HOME/.onnx:/home/onnxruntimedev/.onnx -e NIGHTLY_BUILD onnxruntimecpubuildcentos8${{parameters.OnnxruntimeArch}} /bin/bash -c "python3.9 \
           /onnxruntime_src/tools/ci_build/build.py --build_java --build_nodejs --build_dir /build --config Release \
           --skip_submodule_sync  --parallel --build_shared_lib ${{ parameters.AdditionalBuildFlags }} && cd /build/Release && make install DESTDIR=/build/linux-${{parameters.OnnxruntimeArch}}"
         workingDirectory: $(Build.SourcesDirectory)
diff --git a/tools/ci_build/github/azure-pipelines/templates/py-package-smoking-test.yml b/tools/ci_build/github/azure-pipelines/templates/py-package-smoking-test.yml
index cee3bd9c9e968..8d5ca19a73535 100644
--- a/tools/ci_build/github/azure-pipelines/templates/py-package-smoking-test.yml
+++ b/tools/ci_build/github/azure-pipelines/templates/py-package-smoking-test.yml
@@ -39,36 +39,22 @@ jobs:
       versionSpec: $(PythonVersion)
       architecture: ${{ parameters.python_arch }}
 
-  - task: DownloadPipelineArtifact@2
-    displayName: 'Download Pipeline Artifact'
-    inputs:
-      artifactName: 'onnxruntime'
-      targetPath: '$(Build.BinariesDirectory)/whl'
-      itemPattern: ${{parameters.itemPattern}}
-      # The public ADO project
-      ${{ if eq(variables['System.CollectionId'], 'f3ad12f2-e480-4533-baf2-635c95467d29') }}:
-        buildType: current
-      # The private ADO project
-      ${{ if eq(variables['System.CollectionId'], 'bc038106-a83b-4dab-9dd3-5a41bc58f34c') }}:
-        project: '530acbc4-21bc-487d-8cd8-348ff451d2ff'
-        definition: 841
-        preferTriggeringPipeline: true
-        runVersion: 'latest'
-        buildType: specific
+  - download: build   # pipeline resource identifier.
+    artifact: 'onnxruntime'
 
   - task: Bash@3
     inputs:
       targetType: 'inline'
       script: |
         set -ex
-        files=(whl/*.whl)
+        files=(*.whl)
         FILE_NAME="${files[0]}"
         FILE_NAME=$(basename $FILE_NAME)
         PYTHON_PACKAGE_NAME=$(echo "$FILE_NAME" | cut -f 1 -d '-')
-        python3 -m pip install --find-links "$(Build.BinariesDirectory)/whl" $PYTHON_PACKAGE_NAME
-        pip show $PYTHON_PACKAGE_NAME
-        python -c "import onnxruntime as ort; print(ort.__version__)"
-      workingDirectory: $(Build.BinariesDirectory)
+        python3 -m pip install --find-links "$(Pipeline.Workspace)/build/onnxruntime" $PYTHON_PACKAGE_NAME
+        python3 -m pip show $PYTHON_PACKAGE_NAME
+        python3 -c "import onnxruntime as ort; print(ort.__version__)"
+      workingDirectory: $(Pipeline.Workspace)/build/onnxruntime
     displayName: Test Package Installation
 
   - task: mspremier.PostBuildCleanup.PostBuildCleanup-task.PostBuildCleanup@3
diff --git a/tools/ci_build/github/azure-pipelines/templates/py-packaging-linux-test-cpu.yml b/tools/ci_build/github/azure-pipelines/templates/py-packaging-linux-test-cpu.yml
new file mode 100644
index 0000000000000..cc90085e184dc
--- /dev/null
+++ b/tools/ci_build/github/azure-pipelines/templates/py-packaging-linux-test-cpu.yml
@@ -0,0 +1,117 @@
+parameters:
+- name: arch
+  type: string
+
+- name: base_image
+  type: string
+
+- name: devtoolset_rootpath
+  type: string
+
+- name: ld_library_path_arg
+  type: string
+
+- name: prepend_path
+  type: string
+
+- name: machine_pool
+  type: string
+
+- name: extra_job_id
+  type: string
+  default: ''
+
+- name: python_wheel_suffix
+  type: string
+  default: ''
+
+
+# TODO: Ideally it should fetch information from the build that triggers it
+- name: cmake_build_type
+  type: string
+  default: 'Release'
+  values:
+   - Debug
+   - Release
+   - RelWithDebInfo
+   - MinSizeRel
+
+- name: timeout
+  type: number
+  default: 120
+
+jobs:
+- job: Linux_Test_CPU${{ parameters.extra_job_id }}_${{ parameters.arch }}
+  timeoutInMinutes: ${{ parameters.timeout }}
+  variables:
+    skipComponentGovernanceDetection: true
+  workspace:
+    clean: all
+  pool: ${{ parameters.machine_pool }}
+  steps:
+  - checkout: self
+    clean: true
+    submodules: none
+  # The public ADO project
+  - ${{ if eq(variables['System.CollectionId'], 'f3ad12f2-e480-4533-baf2-635c95467d29') }}:
+    - download: current   # pipeline resource identifier.
+      artifact: 'drop-linux-cpu-${{ parameters.arch }}'
+
+    - download: current   # pipeline resource identifier.
+      artifact: 'onnxruntime${{ parameters.python_wheel_suffix }}'
+
+    - bash: |
+        set -e -x
+        mv "$(Pipeline.Workspace)/drop-linux-cpu-${{ parameters.arch }}" $(Build.BinariesDirectory)/${{parameters.cmake_build_type}}
+        mv "$(Pipeline.Workspace)/onnxruntime${{ parameters.python_wheel_suffix }}" "$(Build.BinariesDirectory)/whl"
+        cp -r "$(Build.BinariesDirectory)/whl" $(Build.BinariesDirectory)/tmp
+        find "$(Build.BinariesDirectory)/tmp" -name '*.whl' -exec bash -c 'unzip -d "${1%.*}" "$1"' _ {} \;
+  # The private ADO project
+  - ${{ if eq(variables['System.CollectionId'], 'bc038106-a83b-4dab-9dd3-5a41bc58f34c') }}:
+    - download: build   # pipeline resource identifier.
+      artifact: 'drop-linux-cpu-${{ parameters.arch }}'
+
+    - download: build   # pipeline resource identifier.
+      artifact: 'onnxruntime${{ parameters.python_wheel_suffix }}'
+
+    - bash: |
+        set -e -x
+        ls $(Pipeline.Workspace)/build
+        mv "$(Pipeline.Workspace)/build/drop-linux-cpu-${{ parameters.arch }}" $(Build.BinariesDirectory)/${{parameters.cmake_build_type}}
+        mv "$(Pipeline.Workspace)/build/onnxruntime${{ parameters.python_wheel_suffix }}" "$(Build.BinariesDirectory)/whl"
+        cp -r "$(Build.BinariesDirectory)/whl" $(Build.BinariesDirectory)/tmp
+        find "$(Build.BinariesDirectory)/tmp" -name '*.whl' -exec bash -c 'unzip -d "${1%.*}" "$1"' _ {} \;
+
+  # The BinSkim task uses a dotnet program which doesn't support ARM CPUs yet
+  - ${{ if eq(parameters.arch, 'x86_64') }}:
+    - task: BinSkim@4
+      displayName: 'Run BinSkim'
+      inputs:
+        AnalyzeTargetGlob: '$(Build.BinariesDirectory)/tmp/**/*.so'
+        continueOnError: true
+
+    #- task: PostAnalysis@2
+    #  inputs:
+    #    GdnBreakAllTools: true
+    #    GdnBreakPolicy: M365
+    #    GdnBreakPolicyMinSev: Error
+
+  - template: get-docker-image-steps.yml
+    parameters:
+      Dockerfile: tools/ci_build/github/linux/docker/inference/x64/python/cpu/Dockerfile.manylinux2_28_cpu
+      Context: tools/ci_build/github/linux/docker/inference/x64/python/cpu
+      DockerBuildArgs: "--build-arg POLICY=manylinux_2_28 --build-arg BUILD_UID=$( id -u ) --build-arg BASEIMAGE=${{ parameters.base_image }} --build-arg PLATFORM=${{ parameters.arch }} --build-arg PREPEND_PATH=${{ parameters.prepend_path }} --build-arg LD_LIBRARY_PATH_ARG=${{ parameters.ld_library_path_arg }} --build-arg DEVTOOLSET_ROOTPATH=${{ parameters.devtoolset_rootpath }}"
+      Repository: onnxruntimecpubuildpython${{ parameters.arch }}
+      ${{ if eq(parameters.arch, 'aarch64') }}:
+        UpdateDepsTxt: false
+
+  - task: Bash@3
+    displayName: 'Bash Script'
+    inputs:
+      targetType: filePath
+      filePath: tools/ci_build/github/linux/run_python_dockertest.sh
+      arguments: -d CPU -c ${{parameters.cmake_build_type}} -i onnxruntimecpubuildpython${{ parameters.arch }}
+
+  - task: mspremier.PostBuildCleanup.PostBuildCleanup-task.PostBuildCleanup@3
+    displayName: 'Clean Agent Directories'
+    condition: always()
diff --git a/tools/ci_build/github/azure-pipelines/templates/py-packaging-linux-test-cuda.yml b/tools/ci_build/github/azure-pipelines/templates/py-packaging-linux-test-cuda.yml
new file mode 100644
index 0000000000000..43ed0172825bc
--- /dev/null
+++ b/tools/ci_build/github/azure-pipelines/templates/py-packaging-linux-test-cuda.yml
@@ -0,0 +1,98 @@
+parameters:
+- name: arch
+  type: string
+
+- name: device
+  type: string
+  values:
+   - CPU
+   - GPU
+
+- name: machine_pool
+  type: string
+
+- name: extra_job_id
+  type: string
+  default: ''
+
+- name: python_wheel_suffix
+  type: string
+  default: ''
+
+
+# TODO: Ideally it should fetch information from the build that triggers it
+- name: cmake_build_type
+  type: string
+  default: 'Release'
+  values:
+   - Debug
+   - Release
+   - RelWithDebInfo
+   - MinSizeRel
+
+- name: timeout
+  type: number
+  default: 120
+
+jobs:
+- job: Linux_Test_GPU${{ parameters.extra_job_id }}_${{ parameters.arch }}
+  timeoutInMinutes: ${{ parameters.timeout }}
+  variables:
+    skipComponentGovernanceDetection: true
+  workspace:
+    clean: all
+  pool: ${{ parameters.machine_pool }}
+  steps:
+  - checkout: self
+    clean: true
+    submodules: none
+  # The public ADO project
+  # - ${{ if eq(variables['System.CollectionId'], 'f3ad12f2-e480-4533-baf2-635c95467d29') }}:
+
+  # The private ADO project
+  - ${{ if eq(variables['System.CollectionId'], 'bc038106-a83b-4dab-9dd3-5a41bc58f34c') }}:
+    - download: build   # pipeline resource identifier.
+      artifact: 'drop-linux-gpu-${{ parameters.arch }}'
+
+    - download: build   # pipeline resource identifier.
+      artifact: 'onnxruntime${{ parameters.python_wheel_suffix }}'
+
+    - bash: |
+        set -e -x
+        ls $(Pipeline.Workspace)/build
+        mv "$(Pipeline.Workspace)/build/drop-linux-gpu-${{ parameters.arch }}" $(Build.BinariesDirectory)/${{parameters.cmake_build_type}}
+        mv "$(Pipeline.Workspace)/build/onnxruntime${{ parameters.python_wheel_suffix }}" "$(Build.BinariesDirectory)/whl"
+        cp -r "$(Build.BinariesDirectory)/whl" $(Build.BinariesDirectory)/tmp
+        find "$(Build.BinariesDirectory)/tmp" -name '*.whl' -exec bash -c 'unzip -d "${1%.*}" "$1"' _ {} \;
+
+  # The BinSkim task uses a dotnet program which doesn't support ARM CPUs yet
+  - ${{ if eq(parameters.arch, 'x86_64') }}:
+    - task: BinSkim@4
+      displayName: 'Run BinSkim'
+      inputs:
+        AnalyzeTargetGlob: '$(Build.BinariesDirectory)/tmp/**/*.so'
+        continueOnError: true
+
+    #- task: PostAnalysis@2
+    #  inputs:
+    #    GdnBreakAllTools: true
+    #    GdnBreakPolicy: M365
+    #    GdnBreakPolicyMinSev: Error
+
+  - template: get-docker-image-steps.yml
+    parameters:
+      Dockerfile: tools/ci_build/github/linux/docker/Dockerfile.manylinux2_28_cuda11_8_tensorrt8_6
+      Context: tools/ci_build/github/linux/docker
+      DockerBuildArgs: "--network=host --build-arg POLICY=manylinux_2_28 --build-arg PLATFORM=x86_64 --build-arg PREPEND_PATH=/usr/local/cuda/bin --build-arg LD_LIBRARY_PATH_ARG=/usr/local/lib64 --build-arg DEVTOOLSET_ROOTPATH=/usr --build-arg BUILD_UID=$( id -u ) --build-arg PLATFORM=${{ parameters.arch }}"
+      Repository: onnxruntimecuda118xtrt86build${{ parameters.arch }}
+
+  - task: Bash@3
+    displayName: 'Bash Script'
+    inputs:
+      targetType: filePath
+      filePath: tools/ci_build/github/linux/run_python_dockertest.sh
+      arguments: -d GPU -c ${{parameters.cmake_build_type}} -i onnxruntimecuda118xtrt86build${{ parameters.arch }}
+
+  - task: mspremier.PostBuildCleanup.PostBuildCleanup-task.PostBuildCleanup@3
+    displayName: 'Clean Agent Directories'
+    condition: always()
diff --git a/tools/ci_build/github/azure-pipelines/templates/py-packaging-linux-test.yml b/tools/ci_build/github/azure-pipelines/templates/py-packaging-linux-test.yml
deleted file mode 100644
index 8ddc917e8591e..0000000000000
--- a/tools/ci_build/github/azure-pipelines/templates/py-packaging-linux-test.yml
+++ /dev/null
@@ -1,85 +0,0 @@
-parameters:
-- name: arch
-  type: string
-
-- name: device
-  type: string
-
-- name: machine_pool
-  type: string
-
-- name: extra_job_id
-  type: string
-  default: ''
-
-- name: python_wheel_suffix
-  type: string
-  default: ''
-
-
-# TODO: Ideally it should fetch information from the build that triggers it
-- name: cmake_build_type
-  type: string
-  default: 'Release'
-  values:
-   - Debug
-   - Release
-   - RelWithDebInfo
-   - MinSizeRel
-
-- name: timeout
-  type: number
-  default: 120
-
-jobs:
-- job: Linux_Test_${{ parameters.device }}${{ parameters.extra_job_id }}_${{ parameters.arch }}
-  timeoutInMinutes: ${{ parameters.timeout }}
-  variables:
-    skipComponentGovernanceDetection: true
-  workspace:
-    clean: all
-  pool: ${{ parameters.machine_pool }}
-  steps:
-  - task: DownloadPipelineArtifact@2
-    displayName: 'Download Pipeline Artifact'
-    inputs:
-      artifactName: 'drop-linux-${{ lower(parameters.device) }}-${{ parameters.arch }}'
-      targetPath: '$(Build.BinariesDirectory)/${{parameters.cmake_build_type}}'
-      # The public ADO project
-      ${{ if eq(variables['System.CollectionId'], 'f3ad12f2-e480-4533-baf2-635c95467d29') }}:
-        buildType: current
-      # The private ADO project
-      ${{ if eq(variables['System.CollectionId'], 'bc038106-a83b-4dab-9dd3-5a41bc58f34c') }}:
-        project: '530acbc4-21bc-487d-8cd8-348ff451d2ff'
-        definition: 841
-        preferTriggeringPipeline: true
-        runVersion: 'latest'
-        buildType: specific
-
-  - task: DownloadPipelineArtifact@2
-    displayName: 'Download Pipeline Artifact'
-    inputs:      
-      artifactName: 'onnxruntime${{ parameters.python_wheel_suffix }}'
-      targetPath: '$(Build.BinariesDirectory)/whl'      
-      # The public ADO project
-      ${{ if eq(variables['System.CollectionId'], 'f3ad12f2-e480-4533-baf2-635c95467d29') }}:
-        buildType: current
-      # The private ADO project
-      ${{ if eq(variables['System.CollectionId'], 'bc038106-a83b-4dab-9dd3-5a41bc58f34c') }}:
-        project: '530acbc4-21bc-487d-8cd8-348ff451d2ff'
-        definition: 841
-        preferTriggeringPipeline: true
-        runVersion: 'latest'
-        buildType: specific
-
-
-  - task: Bash@3
-    displayName: 'Bash Script'
-    inputs:
-      targetType: filePath
-      filePath: tools/ci_build/github/linux/run_python_tests.sh
-      arguments: -d ${{ parameters.device }} -c ${{parameters.cmake_build_type}}
-
-  - task: mspremier.PostBuildCleanup.PostBuildCleanup-task.PostBuildCleanup@3
-    displayName: 'Clean Agent Directories'
-    condition: always()
diff --git a/tools/ci_build/github/linux/docker/Dockerfile.manylinux2_28_cpu b/tools/ci_build/github/linux/docker/Dockerfile.manylinux2_28_cpu
index a9a1e6b39a8cb..af87852561e0a 100644
--- a/tools/ci_build/github/linux/docker/Dockerfile.manylinux2_28_cpu
+++ b/tools/ci_build/github/linux/docker/Dockerfile.manylinux2_28_cpu
@@ -1,9 +1,9 @@
-ARG BASEIMAGE=amd64/almalinux:8
+ARG BASEIMAGE=registry.access.redhat.com/ubi8/ubi
 ARG POLICY=manylinux_2_28
 ARG PLATFORM=x86_64
 ARG DEVTOOLSET_ROOTPATH=/opt/rh/gcc-toolset-12/root
 ARG LD_LIBRARY_PATH_ARG=${DEVTOOLSET_ROOTPATH}/usr/lib64:${DEVTOOLSET_ROOTPATH}/usr/lib:${DEVTOOLSET_ROOTPATH}/usr/lib64/dyninst:${DEVTOOLSET_ROOTPATH}/usr/lib/dyninst:/usr/local/lib64
-ARG PREPEND_PATH=${DEVTOOLSET_ROOTPATH}/usr/bin:
+ARG PREPEND_PATH=/usr/lib/jvm/msopenjdk-11/bin:${DEVTOOLSET_ROOTPATH}/usr/bin:
 
 #Build manylinux2014 docker image begin
 FROM $BASEIMAGE AS runtime_base
@@ -26,7 +26,6 @@ COPY build_scripts/fixup-mirrors.sh /usr/local/sbin/fixup-mirrors
 
 # setup entrypoint, this will wrap commands with `linux32` with i686 images
 COPY build_scripts/install-entrypoint.sh \
-     build_scripts/update-system-packages.sh \
      build_scripts/build_utils.sh \
      /build_scripts/
 
@@ -35,7 +34,6 @@ COPY manylinux-entrypoint /usr/local/bin/manylinux-entrypoint
 ENTRYPOINT ["manylinux-entrypoint"]
 
 COPY build_scripts/install-runtime-packages.sh \
-     build_scripts/update-system-packages.sh \
      build_scripts/build_utils.sh \
      /build_scripts/
 RUN manylinux-entrypoint /build_scripts/install-runtime-packages.sh && rm -rf /build_scripts/
@@ -137,9 +135,7 @@ COPY --from=build_git /manylinux-rootfs /
 COPY --from=build_cpython /manylinux-rootfs /
 COPY --from=all_python /opt/_internal /opt/_internal/
 COPY build_scripts/finalize.sh \
-     build_scripts/update-system-packages.sh \
      build_scripts/python-tag-abi-tag.py \
-
      build_scripts/requirements3.8.txt \
      build_scripts/requirements3.9.txt \
      build_scripts/requirements3.10.txt \
@@ -156,6 +152,7 @@ CMD ["/bin/bash"]
 #Build manylinux2014 docker image end
 
 ENV PATH ${DEVTOOLSET_ROOTPATH}/usr/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin
+ENV JAVA_HOME=/usr/lib/jvm/msopenjdk-11
 
 ADD scripts /tmp/scripts
 RUN cd /tmp/scripts && /tmp/scripts/manylinux/install_centos.sh
diff --git a/tools/ci_build/github/linux/docker/Dockerfile.manylinux2_28_cuda11 b/tools/ci_build/github/linux/docker/Dockerfile.manylinux2_28_cuda11
index dab8df6703c4f..933b0211b0e6c 100644
--- a/tools/ci_build/github/linux/docker/Dockerfile.manylinux2_28_cuda11
+++ b/tools/ci_build/github/linux/docker/Dockerfile.manylinux2_28_cuda11
@@ -31,7 +31,6 @@ COPY build_scripts/fixup-mirrors.sh /usr/local/sbin/fixup-mirrors
 
 # setup entrypoint, this will wrap commands with `linux32` with i686 images
 COPY build_scripts/install-entrypoint.sh \
-     build_scripts/update-system-packages.sh \
      build_scripts/build_utils.sh \
      /build_scripts/
 
@@ -40,7 +39,6 @@ COPY manylinux-entrypoint /usr/local/bin/manylinux-entrypoint
 ENTRYPOINT ["manylinux-entrypoint"]
 
 COPY build_scripts/install-runtime-packages.sh \
-     build_scripts/update-system-packages.sh \
      build_scripts/build_utils.sh \
      /build_scripts/
 RUN manylinux-entrypoint /build_scripts/install-runtime-packages.sh && rm -rf /build_scripts/
@@ -140,7 +138,6 @@ COPY --from=build_git /manylinux-rootfs /
 COPY --from=build_cpython /manylinux-rootfs /
 COPY --from=all_python /opt/_internal /opt/_internal/
 COPY build_scripts/finalize.sh \
-     build_scripts/update-system-packages.sh \
      build_scripts/python-tag-abi-tag.py \
      build_scripts/requirements3.8.txt \
      build_scripts/requirements3.9.txt \
@@ -156,7 +153,7 @@ ENV SSL_CERT_FILE=/opt/_internal/certs.pem
 CMD ["/bin/bash"]
 
 #Build manylinux2014 docker image end
-
+ENV JAVA_HOME=/usr/lib/jvm/msopenjdk-11
 #Add our own dependencies
 ADD scripts /tmp/scripts
 RUN cd /tmp/scripts && /tmp/scripts/manylinux/install_centos.sh && /tmp/scripts/manylinux/install_deps.sh && rm -rf /tmp/scripts
diff --git a/tools/ci_build/github/linux/docker/Dockerfile.manylinux2_28_cuda11_6_tensorrt8_4 b/tools/ci_build/github/linux/docker/Dockerfile.manylinux2_28_cuda11_6_tensorrt8_4
index 303e83eb23bca..003bb2324c049 100644
--- a/tools/ci_build/github/linux/docker/Dockerfile.manylinux2_28_cuda11_6_tensorrt8_4
+++ b/tools/ci_build/github/linux/docker/Dockerfile.manylinux2_28_cuda11_6_tensorrt8_4
@@ -31,7 +31,6 @@ COPY build_scripts/fixup-mirrors.sh /usr/local/sbin/fixup-mirrors
 
 # setup entrypoint, this will wrap commands with `linux32` with i686 images
 COPY build_scripts/install-entrypoint.sh \
-     build_scripts/update-system-packages.sh \
      build_scripts/build_utils.sh \
      /build_scripts/
 
@@ -40,7 +39,6 @@ COPY manylinux-entrypoint /usr/local/bin/manylinux-entrypoint
 ENTRYPOINT ["manylinux-entrypoint"]
 
 COPY build_scripts/install-runtime-packages.sh \
-     build_scripts/update-system-packages.sh \
      build_scripts/build_utils.sh \
      /build_scripts/
 RUN manylinux-entrypoint /build_scripts/install-runtime-packages.sh && rm -rf /build_scripts/
@@ -140,7 +138,6 @@ COPY --from=build_git /manylinux-rootfs /
 COPY --from=build_cpython /manylinux-rootfs /
 COPY --from=all_python /opt/_internal /opt/_internal/
 COPY build_scripts/finalize.sh \
-     build_scripts/update-system-packages.sh \
      build_scripts/python-tag-abi-tag.py \
      build_scripts/requirements3.8.txt \
      build_scripts/requirements3.9.txt \
@@ -163,7 +160,7 @@ RUN v="8.4.1-1.cuda11.6" &&\
     yum-config-manager --add-repo https://developer.download.nvidia.com/compute/cuda/repos/rhel7/x86_64/cuda-rhel7.repo &&\
     yum -y install libnvinfer8-${v} libnvparsers8-${v} libnvonnxparsers8-${v} libnvinfer-plugin8-${v} \
         libnvinfer-devel-${v} libnvparsers-devel-${v} libnvonnxparsers-devel-${v} libnvinfer-plugin-devel-${v}
-
+ENV JAVA_HOME=/usr/lib/jvm/msopenjdk-11
 #Add our own dependencies
 ADD scripts /tmp/scripts
 RUN cd /tmp/scripts && /tmp/scripts/manylinux/install_centos.sh && /tmp/scripts/manylinux/install_deps.sh && rm -rf /tmp/scripts
diff --git a/tools/ci_build/github/linux/docker/Dockerfile.manylinux2_28_cuda11_6_tensorrt8_5 b/tools/ci_build/github/linux/docker/Dockerfile.manylinux2_28_cuda11_6_tensorrt8_5
index d17e4b24582fe..0337ffc5e00a0 100644
--- a/tools/ci_build/github/linux/docker/Dockerfile.manylinux2_28_cuda11_6_tensorrt8_5
+++ b/tools/ci_build/github/linux/docker/Dockerfile.manylinux2_28_cuda11_6_tensorrt8_5
@@ -31,7 +31,6 @@ COPY build_scripts/fixup-mirrors.sh /usr/local/sbin/fixup-mirrors
 
 # setup entrypoint, this will wrap commands with `linux32` with i686 images
 COPY build_scripts/install-entrypoint.sh \
-     build_scripts/update-system-packages.sh \
      build_scripts/build_utils.sh \
      /build_scripts/
 
@@ -40,7 +39,6 @@ COPY manylinux-entrypoint /usr/local/bin/manylinux-entrypoint
 ENTRYPOINT ["manylinux-entrypoint"]
 
 COPY build_scripts/install-runtime-packages.sh \
-     build_scripts/update-system-packages.sh \
      build_scripts/build_utils.sh \
      /build_scripts/
 RUN manylinux-entrypoint /build_scripts/install-runtime-packages.sh && rm -rf /build_scripts/
@@ -140,7 +138,6 @@ COPY --from=build_git /manylinux-rootfs /
 COPY --from=build_cpython /manylinux-rootfs /
 COPY --from=all_python /opt/_internal /opt/_internal/
 COPY build_scripts/finalize.sh \
-     build_scripts/update-system-packages.sh \
      build_scripts/python-tag-abi-tag.py \
      build_scripts/requirements3.8.txt \
      build_scripts/requirements3.9.txt \
@@ -163,7 +160,7 @@ RUN v="8.5.1-1.cuda11.8" &&\
     yum-config-manager --add-repo https://developer.download.nvidia.com/compute/cuda/repos/rhel7/x86_64/cuda-rhel7.repo &&\
     yum -y install libnvinfer8-${v} libnvparsers8-${v} libnvonnxparsers8-${v} libnvinfer-plugin8-${v} \
         libnvinfer-devel-${v} libnvparsers-devel-${v} libnvonnxparsers-devel-${v} libnvinfer-plugin-devel-${v}
-
+ENV JAVA_HOME=/usr/lib/jvm/msopenjdk-11
 #Add our own dependencies
 ADD scripts /tmp/scripts
 RUN cd /tmp/scripts && /tmp/scripts/manylinux/install_centos.sh && /tmp/scripts/manylinux/install_deps.sh && rm -rf /tmp/scripts
diff --git a/tools/ci_build/github/linux/docker/Dockerfile.manylinux2_28_cuda11_8_tensorrt8_6 b/tools/ci_build/github/linux/docker/Dockerfile.manylinux2_28_cuda11_8_tensorrt8_6
index 3c0ac22e38b5a..2c953a10cbf64 100644
--- a/tools/ci_build/github/linux/docker/Dockerfile.manylinux2_28_cuda11_8_tensorrt8_6
+++ b/tools/ci_build/github/linux/docker/Dockerfile.manylinux2_28_cuda11_8_tensorrt8_6
@@ -31,7 +31,6 @@ COPY build_scripts/fixup-mirrors.sh /usr/local/sbin/fixup-mirrors
 
 # setup entrypoint, this will wrap commands with `linux32` with i686 images
 COPY build_scripts/install-entrypoint.sh \
-     build_scripts/update-system-packages.sh \
      build_scripts/build_utils.sh \
      /build_scripts/
 
@@ -40,7 +39,6 @@ COPY manylinux-entrypoint /usr/local/bin/manylinux-entrypoint
 ENTRYPOINT ["manylinux-entrypoint"]
 
 COPY build_scripts/install-runtime-packages.sh \
-     build_scripts/update-system-packages.sh \
      build_scripts/build_utils.sh \
      /build_scripts/
 RUN manylinux-entrypoint /build_scripts/install-runtime-packages.sh && rm -rf /build_scripts/
@@ -147,7 +145,6 @@ COPY --from=build_git /manylinux-rootfs /
 COPY --from=build_cpython /manylinux-rootfs /
 COPY --from=all_python /opt/_internal /opt/_internal/
 COPY build_scripts/finalize.sh \
-     build_scripts/update-system-packages.sh \
      build_scripts/python-tag-abi-tag.py \
      build_scripts/requirements3.7.txt \
      build_scripts/requirements3.8.txt \
@@ -171,7 +168,7 @@ RUN v="8.6.1.6-1.cuda11.8" &&\
     yum-config-manager --add-repo https://developer.download.nvidia.com/compute/cuda/repos/rhel8/x86_64/cuda-rhel8.repo &&\
     yum -y install libnvinfer8-${v} libnvparsers8-${v} libnvonnxparsers8-${v} libnvinfer-plugin8-${v} libnvinfer-vc-plugin8-${v}\
         libnvinfer-devel-${v} libnvparsers-devel-${v} libnvonnxparsers-devel-${v} libnvinfer-plugin-devel-${v} libnvinfer-vc-plugin-devel-${v} libnvinfer-headers-devel-${v}  libnvinfer-headers-plugin-devel-${v}
-
+ENV JAVA_HOME=/usr/lib/jvm/msopenjdk-11
 #Add our own dependencies
 ADD scripts /tmp/scripts
 RUN cd /tmp/scripts && /tmp/scripts/manylinux/install_centos.sh && /tmp/scripts/manylinux/install_deps.sh && rm -rf /tmp/scripts
diff --git a/tools/ci_build/github/linux/docker/Dockerfile.manylinux2_28_training_cuda11_8 b/tools/ci_build/github/linux/docker/Dockerfile.manylinux2_28_training_cuda11_8
index 326e15d58456a..09ab7951552a0 100644
--- a/tools/ci_build/github/linux/docker/Dockerfile.manylinux2_28_training_cuda11_8
+++ b/tools/ci_build/github/linux/docker/Dockerfile.manylinux2_28_training_cuda11_8
@@ -31,7 +31,6 @@ COPY build_scripts/fixup-mirrors.sh /usr/local/sbin/fixup-mirrors
 
 # setup entrypoint, this will wrap commands with `linux32` with i686 images
 COPY build_scripts/install-entrypoint.sh \
-     build_scripts/update-system-packages.sh \
      build_scripts/build_utils.sh \
      /build_scripts/
 
@@ -40,7 +39,6 @@ COPY manylinux-entrypoint /usr/local/bin/manylinux-entrypoint
 ENTRYPOINT ["manylinux-entrypoint"]
 
 COPY build_scripts/install-runtime-packages.sh \
-     build_scripts/update-system-packages.sh \
      build_scripts/build_utils.sh \
      /build_scripts/
 RUN manylinux-entrypoint /build_scripts/install-runtime-packages.sh && rm -rf /build_scripts/
@@ -140,7 +138,6 @@ COPY --from=build_git /manylinux-rootfs /
 COPY --from=build_cpython /manylinux-rootfs /
 COPY --from=all_python /opt/_internal /opt/_internal/
 COPY build_scripts/finalize.sh \
-     build_scripts/update-system-packages.sh \
      build_scripts/python-tag-abi-tag.py \
      build_scripts/requirements3.8.txt \
      build_scripts/requirements3.9.txt \
diff --git a/tools/ci_build/github/linux/docker/Dockerfile.package_ubuntu_cuda11_8_tensorrt8_6 b/tools/ci_build/github/linux/docker/Dockerfile.package_ubuntu_cuda11_8_tensorrt8_6
index c211fa9b9e2b8..83a974469234f 100644
--- a/tools/ci_build/github/linux/docker/Dockerfile.package_ubuntu_cuda11_8_tensorrt8_6
+++ b/tools/ci_build/github/linux/docker/Dockerfile.package_ubuntu_cuda11_8_tensorrt8_6
@@ -7,40 +7,30 @@
 # Build base image with required system packages
 FROM nvidia/cuda:11.8.0-cudnn8-devel-ubuntu20.04 AS base
 
-# The local directory into which to build and install CMAKE
-ARG ONNXRUNTIME_LOCAL_CODE_DIR=/code
-
-ENV PATH /usr/local/nvidia/bin:/usr/local/cuda/bin:/usr/src/tensorrt/bin:${ONNXRUNTIME_LOCAL_CODE_DIR}/cmake-3.27.3-linux-x86_64/bin:/opt/miniconda/bin:${PATH}
+ENV PATH=/usr/local/nvidia/bin:/usr/local/cuda/bin:/usr/src/tensorrt/bin:${PATH}
 ENV DEBIAN_FRONTEND=noninteractive
 
 RUN apt-get update &&\
-    apt-get install -y sudo git bash unattended-upgrades wget
-RUN unattended-upgrade
+    apt-get install -y git bash wget
 
 # Install python3
 RUN apt-get install -y --no-install-recommends \
     python3 \
     python3-pip \
     python3-dev \
-    python3-wheel &&\
-    cd /usr/local/bin &&\
-    ln -s /usr/bin/python3 python &&\
-    ln -s /usr/bin/pip3 pip;
+    python3-wheel
+
 
 RUN pip install --upgrade pip
-RUN pip install setuptools>=41.0.0
 
 # Install TensorRT
 RUN v="8.6.1.6-1+cuda11.8" &&\
     apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/7fa2af80.pub &&\
     apt-get update &&\
-    sudo apt-get install -y libnvinfer8=${v} libnvonnxparsers8=${v} libnvparsers8=${v} libnvinfer-plugin8=${v} libnvinfer-lean8=${v} libnvinfer-vc-plugin8=${v} libnvinfer-dispatch8=${v}\
+    apt-get install -y libnvinfer8=${v} libnvonnxparsers8=${v} libnvparsers8=${v} libnvinfer-plugin8=${v} libnvinfer-lean8=${v} libnvinfer-vc-plugin8=${v} libnvinfer-dispatch8=${v}\
         libnvinfer-headers-dev=${v} libnvinfer-headers-plugin-dev=${v} libnvinfer-dev=${v} libnvonnxparsers-dev=${v} libnvparsers-dev=${v} libnvinfer-plugin-dev=${v} libnvinfer-lean-dev=${v} libnvinfer-vc-plugin-dev=${v}  libnvinfer-dispatch-dev=${v}\
         python3-libnvinfer=${v} libnvinfer-samples=${v} tensorrt-dev=${v} tensorrt-libs=${v}
 
-# Install Valgrind
-RUN apt-get install -y valgrind
-
 ADD scripts /tmp/scripts
 RUN cd /tmp/scripts && /tmp/scripts/install_dotnet.sh && rm -rf /tmp/scripts
 
diff --git a/tools/ci_build/github/linux/docker/inference/aarch64/default/cpu/scripts/install_centos.sh b/tools/ci_build/github/linux/docker/inference/aarch64/default/cpu/scripts/install_centos.sh
index a1ade39e57e16..adb0464d6496a 100755
--- a/tools/ci_build/github/linux/docker/inference/aarch64/default/cpu/scripts/install_centos.sh
+++ b/tools/ci_build/github/linux/docker/inference/aarch64/default/cpu/scripts/install_centos.sh
@@ -1,9 +1,8 @@
 #!/bin/bash
 set -e -x
 
-os_major_version=$(cat /etc/redhat-release | tr -dc '0-9.'|cut -d \. -f1)
+os_major_version=$(tr -dc '0-9.' < /etc/redhat-release |cut -d \. -f1)
 
 echo "installing for CentOS version : $os_major_version"
-
-dnf install -y glibc-langpack-\* glibc-locale-source which gdb redhat-lsb-core expat-devel tar unzip zlib-devel make bzip2 bzip2-devel java-11-openjdk-devel graphviz gcc-toolset-12-binutils gcc-toolset-12-gcc gcc-toolset-12-gcc-c++ gcc-toolset-12-gcc-gfortran
-locale
\ No newline at end of file
+dnf install -y python39-devel glibc-langpack-\* glibc-locale-source which redhat-lsb-core expat-devel tar unzip zlib-devel make bzip2 bzip2-devel java-11-openjdk-devel graphviz gcc-toolset-12-binutils gcc-toolset-12-gcc gcc-toolset-12-gcc-c++ gcc-toolset-12-gcc-gfortran
+locale
diff --git a/tools/ci_build/github/linux/docker/inference/aarch64/default/cpu/scripts/install_deps.sh b/tools/ci_build/github/linux/docker/inference/aarch64/default/cpu/scripts/install_deps.sh
index 7ecd0525c7e7e..7598ab0a7a536 100755
--- a/tools/ci_build/github/linux/docker/inference/aarch64/default/cpu/scripts/install_deps.sh
+++ b/tools/ci_build/github/linux/docker/inference/aarch64/default/cpu/scripts/install_deps.sh
@@ -14,20 +14,20 @@ function GetFile {
       echo "File '$path' already exists. Skipping download"
       return 0
     else
-      rm -rf $path
+      rm -rf "$path"
     fi
   fi
 
   if [[ -f $uri ]]; then
     echo "'$uri' is a file path, copying file to '$path'"
-    cp $uri $path
+    cp "$uri" "$path"
     return $?
   fi
 
   echo "Downloading $uri"
   # Use aria2c if available, otherwise use curl
   if command -v aria2c > /dev/null; then
-    aria2c -q -d $(dirname $path) -o $(basename $path) "$uri"
+    aria2c -q -d "$(dirname $path)" -o "$(basename $path)" "$uri"
   else
     curl "$uri" -sSL --retry $download_retries --retry-delay $retry_wait_time_seconds --create-dirs -o "$path" --fail
   fi
@@ -38,9 +38,10 @@ mkdir -p /tmp/src
 
 cd /tmp/src
 
+CPU_ARCH=$(uname -m)
 echo "Installing cmake"
-GetFile https://github.com/Kitware/CMake/releases/download/v3.27.3/cmake-3.27.3-linux-`uname -m`.tar.gz /tmp/src/cmake-3.27.3-linux-`uname -m`.tar.gz
-tar -zxf /tmp/src/cmake-3.27.3-linux-`uname -m`.tar.gz --strip=1 -C /usr
+GetFile "https://github.com/Kitware/CMake/releases/download/v3.27.3/cmake-3.27.3-linux-$CPU_ARCH.tar.gz" "/tmp/src/cmake.tar.gz"
+tar -zxf /tmp/src/cmake.tar.gz --strip=1 -C /usr
 
 echo "Installing Ninja"
 GetFile https://github.com/ninja-build/ninja/archive/v1.10.0.tar.gz /tmp/src/ninja-linux.tar.gz
@@ -52,7 +53,7 @@ mv ./build-cmake/ninja /usr/bin
 popd
 
 echo "Installing Node.js"
-CPU_ARCH=`uname -m`
+
 if [[ "$CPU_ARCH" = "x86_64" ]]; then
   NODEJS_ARCH=x64
 elif [[ "$CPU_ARCH" = "aarch64" ]]; then
@@ -64,16 +65,5 @@ fi
 GetFile https://nodejs.org/dist/v18.17.1/node-v18.17.1-linux-${NODEJS_ARCH}.tar.gz /tmp/src/node-v18.17.1-linux-${NODEJS_ARCH}.tar.gz
 tar --strip 1 -xf /tmp/src/node-v18.17.1-linux-${NODEJS_ARCH}.tar.gz -C /usr
 
-# The Python version in CentOS 7's python3 package is no longer supported (3.6) so we will build Python from source.
-echo "Installing Python"
-PYTHON_VERSION="3.8.17"
-GetFile https://www.python.org/ftp/python/${PYTHON_VERSION}/Python-${PYTHON_VERSION}.tgz /tmp/src/Python-${PYTHON_VERSION}.tgz
-tar -zxf Python-${PYTHON_VERSION}.tgz
-pushd Python-${PYTHON_VERSION}
-./configure
-make
-make install
-popd
-
 cd /
 rm -rf /tmp/src
diff --git a/tools/ci_build/github/linux/docker/inference/x64/default/cpu/Dockerfile b/tools/ci_build/github/linux/docker/inference/x64/default/cpu/Dockerfile
index 0324f377b8e9e..caf9583807b62 100644
--- a/tools/ci_build/github/linux/docker/inference/x64/default/cpu/Dockerfile
+++ b/tools/ci_build/github/linux/docker/inference/x64/default/cpu/Dockerfile
@@ -5,10 +5,10 @@
 ARG BASEIMAGE=amd64/almalinux:8
 FROM $BASEIMAGE
 
-ENV PATH /opt/rh/gcc-toolset-12/root/usr/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin
+ENV PATH=/usr/lib/jvm/msopenjdk-11/bin:/opt/rh/gcc-toolset-12/root/usr/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin
 ENV LANG=en_US.UTF-8
 ENV LC_ALL=en_US.UTF-8
-
+ENV JAVA_HOME=/usr/lib/jvm/msopenjdk-11
 ADD scripts /tmp/scripts
 RUN cd /tmp/scripts && /tmp/scripts/install_centos.sh && /tmp/scripts/install_deps.sh && rm -rf /tmp/scripts
 
diff --git a/tools/ci_build/github/linux/docker/inference/x64/default/cpu/scripts/install_centos.sh b/tools/ci_build/github/linux/docker/inference/x64/default/cpu/scripts/install_centos.sh
index 8e18a237a807e..b5f8bf1a49a19 100755
--- a/tools/ci_build/github/linux/docker/inference/x64/default/cpu/scripts/install_centos.sh
+++ b/tools/ci_build/github/linux/docker/inference/x64/default/cpu/scripts/install_centos.sh
@@ -1,9 +1,9 @@
 #!/bin/bash
 set -e -x
 
-os_major_version=$(cat /etc/redhat-release | tr -dc '0-9.'|cut -d \. -f1)
+os_major_version=$(tr -dc '0-9.' < /etc/redhat-release |cut -d \. -f1)
 
 echo "installing for CentOS version : $os_major_version"
-
-dnf install -y python39-devel glibc-langpack-\* glibc-locale-source which gdb redhat-lsb-core expat-devel tar unzip zlib-devel make bzip2 bzip2-devel java-11-openjdk-devel graphviz gcc-toolset-12-binutils gcc-toolset-12-gcc gcc-toolset-12-gcc-c++ gcc-toolset-12-gcc-gfortran
-locale
\ No newline at end of file
+rpm -Uvh "https://packages.microsoft.com/config/centos/$os_major_version/packages-microsoft-prod.rpm"
+dnf install -y python39-devel glibc-langpack-\* glibc-locale-source which redhat-lsb-core expat-devel tar unzip zlib-devel make bzip2 bzip2-devel msopenjdk-11 graphviz gcc-toolset-12-binutils gcc-toolset-12-gcc gcc-toolset-12-gcc-c++ gcc-toolset-12-gcc-gfortran
+locale
diff --git a/tools/ci_build/github/linux/docker/inference/x64/default/gpu/Dockerfile b/tools/ci_build/github/linux/docker/inference/x64/default/gpu/Dockerfile
index 386759890d085..318791072f46d 100644
--- a/tools/ci_build/github/linux/docker/inference/x64/default/gpu/Dockerfile
+++ b/tools/ci_build/github/linux/docker/inference/x64/default/gpu/Dockerfile
@@ -4,8 +4,10 @@
 # This file is used by Zip-Nuget Packaging NoContribOps Pipeline,Zip-Nuget-Java Packaging Pipeline
 FROM nvidia/cuda:11.8.0-cudnn8-devel-ubi8
 
+ENV PATH=/usr/lib/jvm/msopenjdk-11/bin:/usr/local/nvidia/bin:/usr/local/cuda/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin
 ENV LANG=en_US.UTF-8
 ENV LC_ALL=en_US.UTF-8
+ENV JAVA_HOME=/usr/lib/jvm/msopenjdk-11
 
 ADD scripts /tmp/scripts
 RUN cd /tmp/scripts && /tmp/scripts/install_centos.sh && /tmp/scripts/install_deps.sh && rm -rf /tmp/scripts
diff --git a/tools/ci_build/github/linux/docker/inference/x64/default/gpu/scripts/install_centos.sh b/tools/ci_build/github/linux/docker/inference/x64/default/gpu/scripts/install_centos.sh
index 3cf259dc7240e..31e3e40f1b7ee 100755
--- a/tools/ci_build/github/linux/docker/inference/x64/default/gpu/scripts/install_centos.sh
+++ b/tools/ci_build/github/linux/docker/inference/x64/default/gpu/scripts/install_centos.sh
@@ -1,9 +1,9 @@
 #!/bin/bash
 set -e -x
 
-os_major_version=$(cat /etc/redhat-release | tr -dc '0-9.'|cut -d \. -f1)
+os_major_version=$(tr -dc '0-9.' < /etc/redhat-release |cut -d \. -f1)
 
 echo "installing for CentOS version : $os_major_version"
-
-dnf install -y python39-devel python3-devel glibc-langpack-\* glibc-locale-source which gdb redhat-lsb-core expat-devel tar unzip zlib-devel make bzip2 bzip2-devel java-11-openjdk-devel
-locale
\ No newline at end of file
+rpm -Uvh "https://packages.microsoft.com/config/centos/$os_major_version/packages-microsoft-prod.rpm"
+dnf install -y python39-devel glibc-langpack-\* glibc-locale-source which redhat-lsb-core expat-devel tar unzip zlib-devel make bzip2 bzip2-devel msopenjdk-11
+locale
diff --git a/tools/ci_build/github/linux/docker/inference/x64/python/cpu/Dockerfile.manylinux2_28_cpu b/tools/ci_build/github/linux/docker/inference/x64/python/cpu/Dockerfile.manylinux2_28_cpu
index 33660cbb3f2e5..06e75ee1a39f6 100644
--- a/tools/ci_build/github/linux/docker/inference/x64/python/cpu/Dockerfile.manylinux2_28_cpu
+++ b/tools/ci_build/github/linux/docker/inference/x64/python/cpu/Dockerfile.manylinux2_28_cpu
@@ -26,7 +26,6 @@ COPY build_scripts/fixup-mirrors.sh /usr/local/sbin/fixup-mirrors
 
 # setup entrypoint, this will wrap commands with `linux32` with i686 images
 COPY build_scripts/install-entrypoint.sh \
-     build_scripts/update-system-packages.sh \
      build_scripts/build_utils.sh \
      /build_scripts/
 
@@ -35,7 +34,6 @@ COPY manylinux-entrypoint /usr/local/bin/manylinux-entrypoint
 ENTRYPOINT ["manylinux-entrypoint"]
 
 COPY build_scripts/install-runtime-packages.sh \
-     build_scripts/update-system-packages.sh \
      build_scripts/build_utils.sh \
      /build_scripts/
 RUN manylinux-entrypoint /build_scripts/install-runtime-packages.sh && rm -rf /build_scripts/
@@ -132,7 +130,6 @@ COPY --from=build_git /manylinux-rootfs /
 COPY --from=build_cpython /manylinux-rootfs /
 COPY --from=all_python /opt/_internal /opt/_internal/
 COPY build_scripts/finalize.sh \
-     build_scripts/update-system-packages.sh \
      build_scripts/python-tag-abi-tag.py \
      build_scripts/requirements3.8.txt \
      build_scripts/requirements3.9.txt \
diff --git a/tools/ci_build/github/linux/docker/inference/x64/python/cpu/scripts/install_centos.sh b/tools/ci_build/github/linux/docker/inference/x64/python/cpu/scripts/install_centos.sh
index 98bb730a43776..c81e57c60c9da 100755
--- a/tools/ci_build/github/linux/docker/inference/x64/python/cpu/scripts/install_centos.sh
+++ b/tools/ci_build/github/linux/docker/inference/x64/python/cpu/scripts/install_centos.sh
@@ -1,11 +1,11 @@
 #!/bin/bash
 set -e
 
-os_major_version=$(cat /etc/redhat-release | tr -dc '0-9.'|cut -d \. -f1)
+os_major_version=$(tr -dc '0-9.' < /etc/redhat-release |cut -d \. -f1)
 
 echo "installing for os major version : $os_major_version"
 dnf install -y glibc-langpack-\*
-yum install -y which gdb redhat-lsb-core expat-devel tar unzip zlib-devel make bzip2 bzip2-devel perl-IPC-Cmd openssl-devel wget
+yum install -y which redhat-lsb-core expat-devel tar unzip zlib-devel make bzip2 bzip2-devel perl-IPC-Cmd openssl-devel wget
 
 # export PATH=/opt/python/cp38-cp38/bin:$PATH
 
@@ -17,4 +17,4 @@ mkdir build
 cd build
 cmake ..
 cmake --install .
-cd ../..
\ No newline at end of file
+cd ../..
diff --git a/tools/ci_build/github/linux/docker/manylinux.patch b/tools/ci_build/github/linux/docker/manylinux.patch
index f1821f9197525..75923e746f93c 100644
--- a/tools/ci_build/github/linux/docker/manylinux.patch
+++ b/tools/ci_build/github/linux/docker/manylinux.patch
@@ -94,7 +94,7 @@ index 9ef1e99..ec52833 100755
 +fi
 \ No newline at end of file
 diff --git a/install-runtime-packages.sh b/install-runtime-packages.sh
-index 137d2e2..4269afb 100755
+index 137d2e2..203b4bc 100755
 --- a/install-runtime-packages.sh
 +++ b/install-runtime-packages.sh
 @@ -33,7 +33,7 @@ source $MY_DIR/build_utils.sh
@@ -130,7 +130,7 @@ index 137d2e2..4269afb 100755
  	elif [ "${AUDITWHEEL_ARCH}" == "aarch64" ] || [ "${AUDITWHEEL_ARCH}" == "ppc64le" ] || [ "${AUDITWHEEL_ARCH}" == "s390x" ]; then
  		# Software collection (for devtoolset-10)
  		yum -y install centos-release-scl-rh
-@@ -86,19 +88,18 @@ if [ "${AUDITWHEEL_POLICY}" == "manylinux2014" ]; then
+@@ -86,19 +88,21 @@ if [ "${AUDITWHEEL_POLICY}" == "manylinux2014" ]; then
  	fi
  elif [ "${AUDITWHEEL_POLICY}" == "manylinux_2_28" ]; then
  	PACKAGE_MANAGER=dnf
@@ -148,6 +148,9 @@ index 137d2e2..4269afb 100755
 -	TOOLCHAIN_DEPS="gcc-toolset-12-binutils gcc-toolset-12-gcc gcc-toolset-12-gcc-c++ gcc-toolset-12-gcc-gfortran"
 -	if [ "${AUDITWHEEL_ARCH}" == "x86_64" ]; then
 -		TOOLCHAIN_DEPS="${TOOLCHAIN_DEPS} yasm"
++	if test -f "/etc/yum.repos.d/ubi.repo"; then
++            sed -i 's/enabled\s*=\s*1/enabled = 1\nexclude=dotnet* aspnet* netstandard*/g' /etc/yum.repos.d/ubi.repo
++	fi
 +        if [[ -d /usr/local/cuda ]]; then
 +	    TOOLCHAIN_DEPS="gcc gcc-c++"
 +	else
@@ -155,7 +158,7 @@ index 137d2e2..4269afb 100755
  	fi
  elif [ "${AUDITWHEEL_POLICY}" == "musllinux_1_1" ]; then
  	TOOLCHAIN_DEPS="binutils gcc g++ gfortran"
-@@ -121,12 +122,6 @@ else
+@@ -121,12 +125,6 @@ else
  	exit 1
  fi
  
diff --git a/tools/ci_build/github/linux/docker/scripts/install_dotnet.sh b/tools/ci_build/github/linux/docker/scripts/install_dotnet.sh
index b9accb134b26d..c4689ed19c148 100755
--- a/tools/ci_build/github/linux/docker/scripts/install_dotnet.sh
+++ b/tools/ci_build/github/linux/docker/scripts/install_dotnet.sh
@@ -2,13 +2,15 @@
 set -e -x
 
 if [ -f /etc/redhat-release ]; then
-    dnf update --refresh -y \
-    && dnf install -y dotnet-sdk-6.0
+  # If the following command succeeds but the dotnet command still reports that no SDK was found, the most likely
+  # cause is that the dotnet packages were installed from more than one dnf repo.
+  dnf install -y dotnet-sdk-6.0 dotnet-runtime-6.0
 elif [ -f /etc/os-release ]; then
   # Get Ubuntu version
-  declare repo_version=$(if command -v lsb_release &> /dev/null; then lsb_release -r -s; else grep -oP '(?<=^VERSION_ID=).+' /etc/os-release | tr -d '"'; fi)
+  declare repo_version
+  repo_version=$(if command -v lsb_release &> /dev/null; then lsb_release -r -s; else grep -oP '(?<=^VERSION_ID=).+' /etc/os-release | tr -d '"'; fi)
   # Download Microsoft signing key and repository
-  wget https://packages.microsoft.com/config/ubuntu/$repo_version/packages-microsoft-prod.deb -O packages-microsoft-prod.deb
+  wget "https://packages.microsoft.com/config/ubuntu/$repo_version/packages-microsoft-prod.deb" -O packages-microsoft-prod.deb
   # Install Microsoft signing key and repository
   dpkg -i packages-microsoft-prod.deb
   # Clean up
diff --git a/tools/ci_build/github/linux/docker/scripts/manylinux/install_centos.sh b/tools/ci_build/github/linux/docker/scripts/manylinux/install_centos.sh
index 4f544a50cb94d..63b953a95add6 100755
--- a/tools/ci_build/github/linux/docker/scripts/manylinux/install_centos.sh
+++ b/tools/ci_build/github/linux/docker/scripts/manylinux/install_centos.sh
@@ -1,17 +1,18 @@
 #!/bin/bash
 set -e
 
-os_major_version=$(cat /etc/redhat-release | tr -dc '0-9.'|cut -d \. -f1)
+os_major_version=$(tr -dc '0-9.' < /etc/redhat-release |cut -d \. -f1)
 
 echo "installing for os major version : $os_major_version"
 if [ "$os_major_version" -gt 7 ]; then
     PACKAGE_MANAGER="dnf"
-    $PACKAGE_MANAGER install -y which gdb redhat-lsb-core expat-devel tar unzip zlib-devel make bzip2 bzip2-devel perl-IPC-Cmd openssl-devel wget
+    $PACKAGE_MANAGER install -y which redhat-lsb-core expat-devel tar unzip zlib-devel make bzip2 bzip2-devel perl-IPC-Cmd openssl-devel wget
 else
     PACKAGE_MANAGER="yum"
-    $PACKAGE_MANAGER install -y which gdb redhat-lsb-core expat-devel tar unzip zlib-devel make libunwind bzip2 bzip2-devel perl-IPC-Cmd openssl-devel wget
+    $PACKAGE_MANAGER install -y which redhat-lsb-core expat-devel tar unzip zlib-devel make libunwind bzip2 bzip2-devel perl-IPC-Cmd openssl-devel wget
 fi
+rpm -Uvh "https://packages.microsoft.com/config/centos/$os_major_version/packages-microsoft-prod.rpm"
 
 # Install Java
 # Install automatic documentation generation dependencies
-$PACKAGE_MANAGER install -y java-11-openjdk-devel graphviz
+$PACKAGE_MANAGER install -y msopenjdk-11 graphviz
diff --git a/tools/ci_build/github/linux/docker/scripts/manylinux/install_deps.sh b/tools/ci_build/github/linux/docker/scripts/manylinux/install_deps.sh
index a1cb4be5b72c9..8c79918120d8d 100755
--- a/tools/ci_build/github/linux/docker/scripts/manylinux/install_deps.sh
+++ b/tools/ci_build/github/linux/docker/scripts/manylinux/install_deps.sh
@@ -3,18 +3,20 @@ set -e -x
 
 # Development tools and libraries
 if [ -f /etc/redhat-release ]; then
-  yum update && yum -y install graphviz
-  os_major_version=$(cat /etc/redhat-release | tr -dc '0-9.'|cut -d \. -f1)
+  dnf -y install graphviz
 elif [ -f /etc/os-release ]; then
   apt-get update && apt-get install -y graphviz
-  os_major_version=$(cat /etc/os-release | tr -dc '0-9.'|cut -d \. -f1)
 else
   echo "Unsupported OS"
   exit 1
 fi
 
 # Install dotnet
-source $(cd "$(dirname "${BASH_SOURCE[0]}")/.." &> /dev/null && pwd)/install_dotnet.sh
+LOCAL_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" &> /dev/null && pwd)"
+PARENT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." &> /dev/null && pwd)"
+# ShellCheck is unable to follow dynamic paths, such as source "$somedir/file".
+# shellcheck disable=SC1091
+source "$PARENT_DIR/install_dotnet.sh"
 
 if [ ! -d "/opt/conda/bin" ]; then
     PYTHON_EXES=("/opt/python/cp38-cp38/bin/python3.8" "/opt/python/cp39-cp39/bin/python3.9" "/opt/python/cp310-cp310/bin/python3.10" "/opt/python/cp311-cp311/bin/python3.11")
@@ -22,23 +24,17 @@ else
     PYTHON_EXES=("/opt/conda/bin/python")
 fi
 
-SYS_LONG_BIT=$(getconf LONG_BIT)
 mkdir -p /tmp/src
-GLIBC_VERSION=$(getconf GNU_LIBC_VERSION | cut -f 2 -d \.)
-
-if [[ $SYS_LONG_BIT = "64" ]]; then
-  LIBDIR="lib64"
-else
-  LIBDIR="lib"
-fi
 
 cd /tmp/src
-source $(cd "$(dirname "${BASH_SOURCE[0]}")" &> /dev/null && pwd)/install_shared_deps.sh
+# shellcheck disable=SC1091
+source "$LOCAL_DIR/install_shared_deps.sh"
 
 cd /tmp/src
 
 if ! [ -x "$(command -v protoc)" ]; then
-  source ${0/%install_deps.sh/..\/install_protobuf.sh}
+# shellcheck disable=SC1091
+  source "$PARENT_DIR/install_protobuf.sh"
 fi
 
 export ONNX_ML=1
@@ -46,7 +42,7 @@ export CMAKE_ARGS="-DONNX_GEN_PB_TYPE_STUBS=OFF -DONNX_WERROR=OFF"
 
 for PYTHON_EXE in "${PYTHON_EXES[@]}"
 do
-  ${PYTHON_EXE} -m pip install -r ${0/%install_deps\.sh/requirements\.txt}
+  ${PYTHON_EXE} -m pip install -r "${0/%install_deps\.sh/requirements\.txt}"
 done
 
 cd /
diff --git a/tools/ci_build/github/linux/docker/scripts/manylinux/install_deps_aten.sh b/tools/ci_build/github/linux/docker/scripts/manylinux/install_deps_aten.sh
index ed220b487d06c..1f85f72aef423 100755
--- a/tools/ci_build/github/linux/docker/scripts/manylinux/install_deps_aten.sh
+++ b/tools/ci_build/github/linux/docker/scripts/manylinux/install_deps_aten.sh
@@ -11,7 +11,7 @@ else
     PYTHON_EXES=("/opt/conda/bin/python")
 fi
 
-os_major_version=$(cat /etc/redhat-release | tr -dc '0-9.'|cut -d \. -f1)
+os_major_version=$(tr -dc '0-9.' < /etc/redhat-release |cut -d \. -f1)
 
 SYS_LONG_BIT=$(getconf LONG_BIT)
 mkdir -p /tmp/src
diff --git a/tools/ci_build/github/linux/docker/scripts/manylinux/install_deps_eager.sh b/tools/ci_build/github/linux/docker/scripts/manylinux/install_deps_eager.sh
index e141e0793a2bd..ad3366b0bb3b6 100755
--- a/tools/ci_build/github/linux/docker/scripts/manylinux/install_deps_eager.sh
+++ b/tools/ci_build/github/linux/docker/scripts/manylinux/install_deps_eager.sh
@@ -11,7 +11,7 @@ else
     PYTHON_EXES=("/opt/conda/bin/python")
 fi
 
-os_major_version=$(cat /etc/redhat-release | tr -dc '0-9.'|cut -d \. -f1)
+os_major_version=$(tr -dc '0-9.' < /etc/redhat-release |cut -d \. -f1)
 
 SYS_LONG_BIT=$(getconf LONG_BIT)
 mkdir -p /tmp/src
diff --git a/tools/ci_build/github/linux/run_python_dockertest.sh b/tools/ci_build/github/linux/run_python_dockertest.sh
new file mode 100755
index 0000000000000..332dd9c7284c0
--- /dev/null
+++ b/tools/ci_build/github/linux/run_python_dockertest.sh
@@ -0,0 +1,49 @@
+#!/bin/bash
+# Runs tools/ci_build/github/linux/run_python_tests.sh inside a docker container.
+#
+# Options:
+#   -i <image>   docker image to run (required)
+#   -d <device>  device name forwarded to run_python_tests.sh; "GPU" adds "--gpus all" (required)
+#   -c <config>  build configuration forwarded to run_python_tests.sh (default: Release)
+#   -x <value>   accepted by the option parser but not used
+#
+# BUILD_SOURCESDIRECTORY and BUILD_BINARIESDIRECTORY are read from the environment and
+# mounted at /onnxruntime_src and /build respectively.
+set -e -x
+BUILD_CONFIG="Release"
+
+while getopts "i:d:x:c:" parameter_Option
+do case "${parameter_Option}"
+in
+i) DOCKER_IMAGE=${OPTARG};;
+d) DEVICE=${OPTARG};;
+x) ;;
+c) BUILD_CONFIG=${OPTARG};;
+esac
+done
+
+# Fail early with a clear message instead of handing docker a malformed command line.
+if [ -z "${DOCKER_IMAGE:-}" ] || [ -z "${DEVICE:-}" ]; then
+  echo "Usage: $0 -i <docker image> -d <device> [-c <build config>]" >&2
+  exit 1
+fi
+
+if [ "$DEVICE" = "GPU" ]; then
+  ADDITIONAL_DOCKER_PARAMETER="--gpus all"
+fi
+
+mkdir -p "$HOME/.onnx"
+# ADDITIONAL_DOCKER_PARAMETER is intentionally left unquoted so that "--gpus all" is split into
+# two arguments and an empty value adds no argument at all.
+# shellcheck disable=SC2086
+docker run --rm \
+    --volume /data/onnx:/data/onnx:ro \
+    --volume "$BUILD_SOURCESDIRECTORY:/onnxruntime_src" \
+    --volume "$BUILD_BINARIESDIRECTORY:/build" \
+    --volume /data/models:/build/models:ro \
+    --volume "$HOME/.onnx:/home/onnxruntimedev/.onnx" \
+    -w /onnxruntime_src \
+    -e NIGHTLY_BUILD \
+    -e BUILD_BUILDNUMBER \
+    $ADDITIONAL_DOCKER_PARAMETER \
+    "$DOCKER_IMAGE" tools/ci_build/github/linux/run_python_tests.sh -d "$DEVICE" -c "$BUILD_CONFIG"
diff --git a/tools/ci_build/github/linux/run_python_tests.sh b/tools/ci_build/github/linux/run_python_tests.sh
index c11ea42cd0541..f080c7e8c39d8 100755
--- a/tools/ci_build/github/linux/run_python_tests.sh
+++ b/tools/ci_build/github/linux/run_python_tests.sh
@@ -15,7 +15,8 @@ c) BUILD_CONFIG=${OPTARG};;
 esac
 done
 
-cd $BUILD_BINARIESDIRECTORY
+export PATH=/opt/python/cp38-cp38/bin:$PATH
+cd /build
 files=(whl/*.whl)
 FILE_NAME="${files[0]}"
 FILE_NAME=$(basename $FILE_NAME)
@@ -23,7 +24,7 @@ PYTHON_PACKAGE_NAME=$(echo "$FILE_NAME" | cut -f 1 -d '-')
 
 echo "Package name:$PYTHON_PACKAGE_NAME"
 
-BUILD_ARGS="--build_dir $BUILD_BINARIESDIRECTORY --config $BUILD_CONFIG --test --skip_submodule_sync --parallel --enable_lto --build_wheel "
+BUILD_ARGS="--build_dir /build --config $BUILD_CONFIG --test --skip_submodule_sync --parallel --enable_lto --build_wheel "
 
 ARCH=$(uname -m)
 
@@ -35,20 +36,15 @@ if [ $BUILD_DEVICE == "GPU" ]; then
     BUILD_ARGS="$BUILD_ARGS --use_cuda --use_tensorrt --cuda_version=11.8 --tensorrt_home=/usr --cuda_home=/usr/local/cuda-11.8 --cudnn_home=/usr/local/cuda-11.8"
 fi
 # We assume the machine doesn't have gcc and python development header files, so we don't build onnxruntime from source
-sudo rm -rf /build /onnxruntime_src
-sudo ln -s $BUILD_SOURCESDIRECTORY /onnxruntime_src
 python3 -m pip install --upgrade pip
-python3 -m pip uninstall -y $PYTHON_PACKAGE_NAME ort-nightly-gpu ort-nightly onnxruntime onnxruntime-gpu onnxruntime-training onnxruntime-directml ort-nightly-directml onnx -qq
 # Install the packages that are needed for installing the onnxruntime python package
-python3 -m pip install -r $BUILD_BINARIESDIRECTORY/$BUILD_CONFIG/requirements.txt
+python3 -m pip install -r /build/$BUILD_CONFIG/requirements.txt
 # Install the packages that are needed for running test scripts
-# Install the latest ONNX release which may contain not fixed bugs. However, it is what most people use.
-python3 -m pip install onnx pytest
+python3 -m pip install pytest
 # The "--no-index" flag is crucial. The local whl folder is just an additional source. Pypi's doc says "there is no 
 # ordering in the locations that are searched" if we don't disable the default one with "--no-index"
-python3 -m pip install --no-index --find-links $BUILD_BINARIESDIRECTORY/whl $PYTHON_PACKAGE_NAME
-ln -s /data/models $BUILD_BINARIESDIRECTORY
-cd $BUILD_BINARIESDIRECTORY/$BUILD_CONFIG
+python3 -m pip install --no-index --find-links /build/whl $PYTHON_PACKAGE_NAME
+cd /build/$BUILD_CONFIG
 # Restore file permissions
 xargs -a perms.txt chmod a+x
-python3 $BUILD_SOURCESDIRECTORY/tools/ci_build/build.py $BUILD_ARGS --ctest_path ''
+python3 /onnxruntime_src/tools/ci_build/build.py $BUILD_ARGS --ctest_path ''
diff --git a/tools/scripts/python_test.sh b/tools/scripts/python_test.sh
old mode 100644
new mode 100755
diff --git a/tools/scripts/symbolic_shape_infer_test.sh b/tools/scripts/symbolic_shape_infer_test.sh
old mode 100644
new mode 100755

From 5b9cd91a9cddbe7c461c1ad7ca44edd5111ea920 Mon Sep 17 00:00:00 2001
From: PeixuanZuo <94887879+PeixuanZuo@users.noreply.github.com>
Date: Thu, 21 Sep 2023 22:37:50 +0800
Subject: [PATCH 03/14] [ROCm] fix CI (#17648)

fix CI, follow #17621
---
 .../github/azure-pipelines/orttraining-pai-ci-pipeline.yml      | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tools/ci_build/github/azure-pipelines/orttraining-pai-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/orttraining-pai-ci-pipeline.yml
index 3333a7d22a41b..8dd1f0c5c6461 100644
--- a/tools/ci_build/github/azure-pipelines/orttraining-pai-ci-pipeline.yml
+++ b/tools/ci_build/github/azure-pipelines/orttraining-pai-ci-pipeline.yml
@@ -222,7 +222,7 @@ jobs:
     clean: all
   pool: AMD-GPU
   dependsOn:
-  - Linux_Build
+  - Linux_Build_ubuntu
   timeoutInMinutes: 120
 
   steps:

From f299016cbe87a5341e0a8aa69b621555c9d49a35 Mon Sep 17 00:00:00 2001
From: George Nash <george.nash@intel.com>
Date: Thu, 21 Sep 2023 09:25:41 -0700
Subject: [PATCH 04/14] Fix crash on Windows server 2016 on Intel Gen4 Xeon
 processors (#17611)

This adds an additional check before enabling MlasGemmU8S8DispatchAmx
for GEMM operations. After checking the CPUID for AMX-TILE and AMX-INT8,
an additional check is added that checks the value of the XCR0 register.

The value in the XCR0 register is set by the OS and indicates support
for various CPU features. In this case the bits indicating XTILECFG and
XTILEDATA support are checked.

### Description
This adds an additional check before enabling MlasGemmU8S8DispatchAmx
for GEMM operations. After checking the CPUID for AMX-TILE and AMX-INT8,
an additional check is added that checks the value of the XCR0 register.

The value in the XCR0 register is set by the OS and indicates support
for various CPU features. In this case the bits indicating XTILECFG and
XTILEDATA support are checked.


### Motivation and Context
Fix for a crash reported directly by a customer when running an older
Windows Server OS on newer Gen4 Xeon processors.

Signed-off-by: Nash <george.nash@intel.com>
---
 onnxruntime/core/mlas/lib/platform.cpp | 17 +++++++++++------
 1 file changed, 11 insertions(+), 6 deletions(-)

diff --git a/onnxruntime/core/mlas/lib/platform.cpp b/onnxruntime/core/mlas/lib/platform.cpp
index 7e2b117d6f249..96bc1d8010bed 100644
--- a/onnxruntime/core/mlas/lib/platform.cpp
+++ b/onnxruntime/core/mlas/lib/platform.cpp
@@ -112,6 +112,14 @@ MLAS_INTERNAL_DATA MLAS_DECLSPEC_ALIGN(const int16_t MlasOpmask16BitTableAvx512[
 #define _XCR_XFEATURE_ENABLED_MASK 0
 #endif
 
+#if !defined(XFEATURE_MASK_XTILE)
+#define XFEATURE_XTILECFG 17
+#define XFEATURE_XTILEDATA 18
+#define XFEATURE_MASK_XTILECFG (1 << XFEATURE_XTILECFG)
+#define XFEATURE_MASK_XTILEDATA (1 << XFEATURE_XTILEDATA)
+#define XFEATURE_MASK_XTILE (XFEATURE_MASK_XTILECFG | XFEATURE_MASK_XTILEDATA)
+#endif
+
 inline
 uint64_t
 MlasReadExtendedControlRegister(
@@ -142,11 +150,6 @@ bool
 MlasInitAMX()
 {
 #if defined(__linux__)
-#define XFEATURE_XTILECFG 17
-#define XFEATURE_XTILEDATA 18
-#define XFEATURE_MASK_XTILECFG (1 << XFEATURE_XTILECFG)
-#define XFEATURE_MASK_XTILEDATA (1 << XFEATURE_XTILEDATA)
-#define XFEATURE_MASK_XTILE (XFEATURE_MASK_XTILECFG | XFEATURE_MASK_XTILEDATA)
 
 #define ARCH_GET_XCOMP_PERM 0x1022
 #define ARCH_REQ_XCOMP_PERM 0x1023
@@ -417,7 +420,9 @@ Return Value:
                 // Check if the processor supports AMX-TILE and AMX-INT8
                 // features.
                 //
-                if ((Cpuid7[3] & 0b1 << 24) != 0 && (Cpuid7[3] & 0b1 << 25) != 0) {
+                if ((Cpuid7[3] & 0b1 << 24) != 0 &&
+                    (Cpuid7[3] & 0b1 << 25) != 0 &&
+                    (xcr0 & XFEATURE_MASK_XTILE) == XFEATURE_MASK_XTILE) {
                     if (MlasInitAMX()) {
                         this->GemmU8U8Dispatch = &MlasGemmU8S8DispatchAmx;
                         this->GemmU8S8Dispatch = &MlasGemmU8S8DispatchAmx;

From d56fc7ebf5377abc96db728eafaffd8bf79a3b81 Mon Sep 17 00:00:00 2001
From: Abhishek Jindal <abjindal@microsoft.com>
Date: Thu, 21 Sep 2023 14:16:41 -0700
Subject: [PATCH 05/14] Layer norm fusion deepspeed stage3 changes (#17614)

### Description
<!-- Describe your changes. -->
Layer norm fusion changes required for deepspeed stage 3, also includes
test case.


### Motivation and Context
<!-- - Why is this change required? What problem does it solve?
- If it fixes an open issue, please link to the issue here. -->
It helps fuse layer norm for DeepSpeed Stage 3. Added a test case
which ensures that the fusion works properly for this
scenario.
---
 .../core/optimizer/layer_norm_fusion.cc       |  42 ++++-----
 .../graph_transform_test_layernorm.cc         |  34 ++++++++
 .../fusion/layer_norm_fusion_scale_bias.onnx  | Bin 0 -> 854 bytes
 .../fusion/layer_norm_fusion_scale_bias.py    |  81 ++++++++++++++++++
 4 files changed, 136 insertions(+), 21 deletions(-)
 create mode 100644 onnxruntime/test/testdata/transform/fusion/layer_norm_fusion_scale_bias.onnx
 create mode 100644 onnxruntime/test/testdata/transform/fusion/layer_norm_fusion_scale_bias.py

diff --git a/onnxruntime/core/optimizer/layer_norm_fusion.cc b/onnxruntime/core/optimizer/layer_norm_fusion.cc
index bf36f11521be2..159e3b23d1ab0 100644
--- a/onnxruntime/core/optimizer/layer_norm_fusion.cc
+++ b/onnxruntime/core/optimizer/layer_norm_fusion.cc
@@ -414,20 +414,20 @@ Status LayerNormFusion::ApplyImpl(Graph& graph, bool& modified, int graph_level,
     NodeArg* scale = nullptr;
     NodeArg* bias = nullptr;
     for (size_t i = 0; i < mul_node.MutableInputDefs().size(); i++) {
-      if (graph_utils::NodeArgIsConstant(graph, *(mul_node.MutableInputDefs()[i])) ||
-          graph_utils::IsGraphInput(graph, mul_node.MutableInputDefs()[i])) {
-        if (mul_node.MutableInputDefs()[i]->Shape()->dim_size() == static_cast<int>(axes_values.size())) {
-          scale = mul_node.MutableInputDefs()[i];
-        }
+      if (mul_node.MutableInputDefs()[i]->Shape() == nullptr) {
+        continue;
+      }
+      if (mul_node.MutableInputDefs()[i]->Shape()->dim_size() == static_cast<int>(axes_values.size())) {
+        scale = mul_node.MutableInputDefs()[i];
       }
     }
 
     for (size_t i = 0; i < last_add_node.MutableInputDefs().size(); i++) {
-      if (graph_utils::NodeArgIsConstant(graph, *(last_add_node.MutableInputDefs()[i])) ||
-          graph_utils::IsGraphInput(graph, last_add_node.MutableInputDefs()[i])) {
-        if (last_add_node.MutableInputDefs()[i]->Shape()->dim_size() == static_cast<int>(axes_values.size())) {
-          bias = last_add_node.MutableInputDefs()[i];
-        }
+      if (last_add_node.MutableInputDefs()[i]->Shape() == nullptr) {
+        continue;
+      }
+      if (last_add_node.MutableInputDefs()[i]->Shape()->dim_size() == static_cast<int>(axes_values.size())) {
+        bias = last_add_node.MutableInputDefs()[i];
       }
     }
     if (scale == nullptr || bias == nullptr) {
@@ -667,20 +667,20 @@ Status SimplifiedLayerNormFusion::ApplyImpl(Graph& graph, bool& modified, int gr
     // because SkipLayerNorm kernel, for example, has dependency on single dim size
     NodeArg* scale = nullptr;
     for (size_t i = 0; i < mul_node.MutableInputDefs().size(); i++) {
-      if (graph_utils::NodeArgIsConstant(graph, *(mul_node.MutableInputDefs()[i])) ||
-          graph_utils::IsGraphInput(graph, mul_node.MutableInputDefs()[i])) {
+      if (mul_node.MutableInputDefs()[i]->Shape() == nullptr) {
+        continue;
+      }
 #ifdef ENABLE_TRAINING_CORE
-        if (axes_values.empty() ||
-            mul_node.MutableInputDefs()[i]->Shape()->dim_size() == static_cast<int>(axes_values.size())) {
-          scale = mul_node.MutableInputDefs()[i];
-        }
+      if (axes_values.empty() ||
+          mul_node.MutableInputDefs()[i]->Shape()->dim_size() == static_cast<int>(axes_values.size())) {
+        scale = mul_node.MutableInputDefs()[i];
+      }
 #else
-        // Scale must be 1d.
-        if (mul_node.MutableInputDefs()[i]->Shape()->dim_size() == 1) {
-          scale = mul_node.MutableInputDefs()[i];
-        }
-#endif
+      // Scale must be 1d.
+      if (mul_node.MutableInputDefs()[i]->Shape()->dim_size() == 1) {
+        scale = mul_node.MutableInputDefs()[i];
       }
+#endif
     }
 
     if (scale == nullptr) {
diff --git a/onnxruntime/test/optimizer/graph_transform_test_layernorm.cc b/onnxruntime/test/optimizer/graph_transform_test_layernorm.cc
index 1f671e90090ba..a55238396cea3 100755
--- a/onnxruntime/test/optimizer/graph_transform_test_layernorm.cc
+++ b/onnxruntime/test/optimizer/graph_transform_test_layernorm.cc
@@ -429,6 +429,40 @@ TEST_F(GraphTransformationTests, SimplifiedLayerNormFusionTest) {
   }
 }
 
+// Tests the scenario where scale and bias are neither graph inputs nor graph initializers.
+// To exercise this, an Identity node is added after the scale and bias terms to ensure LayerNormFusion still works.
+TEST_F(GraphTransformationTests, LayerNormScaleBiasTest) {
+  constexpr const ORTCHAR_T* model_uri = MODEL_FOLDER "fusion/layer_norm_fusion_scale_bias.onnx";
+  std::shared_ptr<Model> p_model;
+  ASSERT_STATUS_OK(Model::Load(model_uri, p_model, nullptr, *logger_));
+  Graph& graph = p_model->MainGraph();
+  // Apply only LayerNormFusion, registered at Level2.
+  onnxruntime::GraphTransformerManager graph_transformation_mgr{5};
+  ASSERT_STATUS_OK(graph_transformation_mgr.Register(std::make_unique<LayerNormFusion>(), TransformerLevel::Level2));
+  ASSERT_STATUS_OK(graph_transformation_mgr.ApplyTransformers(graph, TransformerLevel::Level2, *logger_));
+  // Every op of the original subgraph must be gone, replaced by a single LayerNormalization node.
+  std::map<std::string, int> op_to_count = CountOpsInGraph(graph);
+  ASSERT_EQ(op_to_count["ReduceMean"], 0);
+  ASSERT_EQ(op_to_count["Sub"], 0);
+  ASSERT_EQ(op_to_count["Cast"], 0);
+  ASSERT_EQ(op_to_count["Pow"], 0);
+  ASSERT_EQ(op_to_count["Add"], 0);
+  ASSERT_EQ(op_to_count["Sqrt"], 0);
+  ASSERT_EQ(op_to_count["Div"], 0);
+  ASSERT_EQ(op_to_count["Mul"], 0);
+  ASSERT_EQ(op_to_count["LayerNormalization"], 1);
+
+  for (const Node& node : graph.Nodes()) {
+    if (node.OpType() == "LayerNormalization") {
+      // LayerNormalization should have three inputs; assert (not expect) so that indexing [1] below is safe.
+      ASSERT_EQ(node.InputDefs().size(), 3u) << "LayerNormalization number of inputs does not equal to 3. Got:" << node.InputDefs().size();
+      const TensorShapeProto* scale_shape = node.InputDefs()[1]->Shape();
+      ASSERT_NE(scale_shape, nullptr) << "LayerNormalization scale has no shape information.";
+      EXPECT_EQ(scale_shape->dim_size(), 1) << "LayerNormalization scale should be 1D. Got: " << scale_shape->dim_size();
+    }
+  }
+}
+
 // If EP is non-GPU EP or unknown, the sub-graph will be not fused because CPU impl for SimplifiedLayerNormalization
 // doesn't support input and scale having different data types.
 TEST_F(GraphTransformationTests, SimplifiedLayerNormWithCastsFusionTest) {
diff --git a/onnxruntime/test/testdata/transform/fusion/layer_norm_fusion_scale_bias.onnx b/onnxruntime/test/testdata/transform/fusion/layer_norm_fusion_scale_bias.onnx
new file mode 100644
index 0000000000000000000000000000000000000000..ec0f9a97815b888701198d94c92e3f61c581d6dc
GIT binary patch
literal 854
zcmbVLO;6h}7_J-BxG#kTYZE93gnT22m1Yu$ooG5~nzW*c-nc|*?V^aLfqd|B%VCEd
z_!0b!9ry|SC$N(kk+Bn&96fr!@;r}i(*63k1K$A+X(!=+oM*O~2%gWxfWb)##v)ic
z9{~q9B0YN23*95r`2gfxhzlM@>6Q$%VOtJ@dJr|!d|FO4Bw)rQpTZvWW<i?ybq2^q
zeC>xz-=(HP>i32O%=i^w!!hU}H52Z>Cg;9~+&<_rur`aAl7<+#{``weNx=D_oR1Y^
z#*lN^ftN5P>1C2t1qv}dk>9s!bQLvucvY#9fEnMyE9gV-EQq4O4@;YCBkDS8M){&@
zkboKEd;z<lgJ9Kk5B>SzP?b?MvK3XgqUwV7nl}8kiFTXek@Vf^LOYAAqdEXhvhLB8
zJ7tgC=m2%NeOM_K(1s9uUCR>7EX-~h`N1m$Ln*TIxg<{C$gn@X&P!+h9YMQ4gIkdt
z$4TT+3o+bkwT`@(TjOl1*yF?9p4U84XNzD99K0cy*C0`6R*RdWK*euV{6StN>vU7S
p0tyxZ+JiQ+<ld1RPi12Czl4XOW%axbb)BNmQ8-KDG@fS`dIpt9{cQjM

literal 0
HcmV?d00001

diff --git a/onnxruntime/test/testdata/transform/fusion/layer_norm_fusion_scale_bias.py b/onnxruntime/test/testdata/transform/fusion/layer_norm_fusion_scale_bias.py
new file mode 100644
index 0000000000000..a59e263763a3d
--- /dev/null
+++ b/onnxruntime/test/testdata/transform/fusion/layer_norm_fusion_scale_bias.py
@@ -0,0 +1,81 @@
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# Licensed under the MIT License.
+import onnx
+from onnx import OperatorSetIdProto, TensorProto, helper
+
+
+def GenerateModel(model_name, has_casts=False, has_identity=False):  # noqa: N802
+    """Build the LayerNorm-pattern test graph and save it to `model_name`.
+
+    has_casts: make the model fp16 and insert Cast nodes so that Pow through Div run in fp32.
+    has_identity: route scale (gamma) and bias through Identity nodes instead of using the initializers directly.
+    """
+    # Graph I/O, scale and bias all share this type so that Mul/Add never see mismatched operand types.
+    elem_type = TensorProto.FLOAT16 if has_casts else TensorProto.FLOAT
+    nodes = [  # LayerNorm subgraph
+        helper.make_node("ReduceMean", ["A"], ["rd_out"], "reduce1", axes=[-1], keepdims=1),
+        helper.make_node("Sub", ["A", "rd_out"], ["sub_out"], "sub"),
+        helper.make_node("Pow", ["cast_sub_out" if has_casts else "sub_out", "pow_in_2"], ["pow_out"], "pow"),
+        helper.make_node("ReduceMean", ["pow_out"], ["rd2_out"], "reduce2", axes=[-1], keepdims=1),
+        helper.make_node("Add", ["rd2_out", "const_e12_f32"], ["add1_out"], "add1"),
+        helper.make_node("Sqrt", ["add1_out"], ["sqrt_out"], "sqrt"),
+        helper.make_node("Div", ["cast_sub_out" if has_casts else "sub_out", "sqrt_out"], ["div_out"], "div"),
+        helper.make_node(
+            "Mul",
+            ["gamma_id_out" if has_identity else "gamma", "cast_div_out" if has_casts else "div_out"],
+            ["mul_out"],
+            "mul",
+        ),
+        helper.make_node("Add", ["mul_out", "const_e6_f16_out" if has_identity else "const_e6_f16"], ["C"], "add2"),
+    ]
+
+    if has_casts:
+        nodes.extend(
+            [
+                helper.make_node("Cast", ["sub_out"], ["cast_sub_out"], "cast_sub", to=1),
+                helper.make_node("Cast", ["div_out"], ["cast_div_out"], "cast_2", to=10),
+            ]
+        )
+
+    if has_identity:
+        nodes.extend(
+            [
+                helper.make_node("Identity", ["gamma"], ["gamma_id_out"], "gamma_identity"),
+                helper.make_node("Identity", ["const_e6_f16"], ["const_e6_f16_out"], "const_e6_f16_identity"),
+            ]
+        )
+
+    initializers = [  # initializers
+        helper.make_tensor("pow_in_2", TensorProto.FLOAT, [], [2]),
+        helper.make_tensor("const_e12_f32", TensorProto.FLOAT, [], [1e-12]),
+        helper.make_tensor("const_e6_f16", elem_type, [4], [1e-6, 1e-6, 1e-6, 1e-6]),
+        helper.make_tensor("gamma", elem_type, [4], [1, 2, 3, 4]),
+    ]
+
+    graph = helper.make_graph(
+        nodes,
+        "LayerNorm",  # name
+        [  # inputs
+            helper.make_tensor_value_info("A", elem_type, [16, 32, 4]),
+        ],
+        [  # outputs
+            helper.make_tensor_value_info("C", elem_type, [16, 32, 4]),
+        ],
+        initializers,
+    )
+
+    onnxdomain = OperatorSetIdProto()
+    onnxdomain.version = 12
+    # The empty string ("") or absence of this field implies the operator set that is defined as part of the ONNX specification.
+    onnxdomain.domain = ""
+    msdomain = OperatorSetIdProto()
+    msdomain.version = 1
+    msdomain.domain = "com.microsoft"
+    opsets = [onnxdomain, msdomain]
+
+    model = helper.make_model(graph, opset_imports=opsets)
+    onnx.save(model, model_name)
+
+
+# Regenerates the checked-in model: fp16 graph with Identity nodes on scale and bias.
+GenerateModel("layer_norm_fusion_scale_bias.onnx", True, True)

From 498b60d8a47b398fefeb1847cadde07bef6e99fa Mon Sep 17 00:00:00 2001
From: Arthur Islamov <arthur@islamov.ai>
Date: Fri, 22 Sep 2023 01:52:13 +0400
Subject: [PATCH 06/14] [js/web] fp16 Pool & Reduce (#17512)

### Description
Two more ops to support fp16
---
 js/web/lib/wasm/jsep/webgpu/ops/pool.ts       |   6 +-
 js/web/lib/wasm/jsep/webgpu/ops/reduce.ts     |  14 +-
 .../providers/js/js_execution_provider.cc     | 256 +++++++++---------
 .../core/providers/js/operators/pool.cc       | 112 ++++----
 .../core/providers/js/operators/pool.h        |   8 +-
 .../core/providers/js/operators/reduce.cc     |  28 +-
 .../core/providers/js/operators/reduce.h      |   2 +-
 7 files changed, 206 insertions(+), 220 deletions(-)

diff --git a/js/web/lib/wasm/jsep/webgpu/ops/pool.ts b/js/web/lib/wasm/jsep/webgpu/ops/pool.ts
index 8c8c12fc54ddb..120a0e9de5490 100644
--- a/js/web/lib/wasm/jsep/webgpu/ops/pool.ts
+++ b/js/web/lib/wasm/jsep/webgpu/ops/pool.ts
@@ -1,7 +1,6 @@
 // Copyright (c) Microsoft Corporation. All rights reserved.
 // Licensed under the MIT License.
 
-import {DataType} from '../../../wasm-common';
 import {TensorView} from '../../tensor-view';
 import {PoolConvUtil, ShapeUtil} from '../../util';
 import {AttributeWithCacheKey, createAttributeWithCacheKey} from '../attribute-with-cache-key';
@@ -22,9 +21,6 @@ const validateInputs = (inputs: readonly TensorView[]): void => {
   if (inputs[0].dims.length !== 4) {
     throw new Error('Pool ops supports 2-D inputs only for now.');
   }
-  if (inputs[0].dataType !== DataType.float) {
-    throw new Error('Invalid input type.');
-  }
 };
 
 const getAdjustedPoolAttributesAndOutputShape = <AttributeType extends AveragePoolAttributes|MaxPoolAttributes>(
@@ -248,7 +244,7 @@ const createAveragePoolProgramInfo =
           const kernelSize = ShapeUtil.size(adjustedAttributes.kernelShape);
 
           const x = inputVariable('x', input.dataType, input.dims);
-          const dataType = 'f32';
+          const dataType = x.type.value;
 
           const op1 = 'value += x_val;';
           let op2 = '';
diff --git a/js/web/lib/wasm/jsep/webgpu/ops/reduce.ts b/js/web/lib/wasm/jsep/webgpu/ops/reduce.ts
index 0b8d03ea73b6b..598b1db033c61 100644
--- a/js/web/lib/wasm/jsep/webgpu/ops/reduce.ts
+++ b/js/web/lib/wasm/jsep/webgpu/ops/reduce.ts
@@ -17,10 +17,6 @@ const validateInputs = (inputs: readonly TensorView[]): void => {
   if (inputs.length === 2 && inputs[1].dims.length !== 1) {
     throw new Error('Invalid axes input dims.');
   }
-
-  if (inputs[0].dataType !== DataType.float) {
-    throw new Error('Invalid input type.');
-  }
 };
 
 export interface ReduceAttributes extends AttributeWithCacheKey {
@@ -161,7 +157,7 @@ export const reduceL1 = (context: ComputeContext, attributes: ReduceAttributes):
 export const reduceL2 = (context: ComputeContext, attributes: ReduceAttributes): void => {
   validateInputs(context.inputs);
   const reduceOp: ReduceOp = (input, output) =>
-      [`var t = f32(0); var value = ${output.type.storage}(0);`,
+      [`var t = ${output.type.value}(0); var value = ${output.type.value}(0);`,
        '',
        `t = ${input.getByOffset('inputOffset')}; value += (t * t);`,
        'value = sqrt(value);',
@@ -212,10 +208,10 @@ export const reduceMean = (context: ComputeContext, attributes: ReduceAttributes
     }
 
     return [
-      `var value = ${output.type.storage}(0);`,
+      'var sum = f32(0);',
       '',
-      `value += ${input.getByOffset('inputOffset')};`,
-      `value = value / ${size}.;`,
+      `sum += f32(${input.getByOffset('inputOffset')});`,
+      `let value = ${output.type.value}(sum / ${size});`,
     ];
   };
   context.compute(createReduceProgramInfoLoader(context.inputs, 'ReduceMean', attributes, reduceOp), {inputs: [0]});
@@ -266,7 +262,7 @@ export const reduceSum = (context: ComputeContext, attributes: ReduceAttributes)
 export const reduceSumSquare = (context: ComputeContext, attributes: ReduceAttributes): void => {
   validateInputs(context.inputs);
   const reduceOp: ReduceOp = (input, output) =>
-      [`var t = f32(0); var value = ${output.type.storage}(0);`,
+      [`var t = ${output.type.value}(0); var value = ${output.type.value}(0);`,
        '',
        `t = ${input.getByOffset('inputOffset')}; value += t * t;`,
        '',
diff --git a/onnxruntime/core/providers/js/js_execution_provider.cc b/onnxruntime/core/providers/js/js_execution_provider.cc
index 0674fe02d093d..72e36a161e9aa 100644
--- a/onnxruntime/core/providers/js/js_execution_provider.cc
+++ b/onnxruntime/core/providers/js/js_execution_provider.cc
@@ -129,56 +129,56 @@ class ONNX_OPERATOR_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 14, Rel
 class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 6, 15, LeakyRelu);
 class ONNX_OPERATOR_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 16, LeakyRelu);
 
-class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 1, 10, float, ReduceMax);
-class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 11, 11, float, ReduceMax);
-class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 12, 12, float, ReduceMax);
-class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 13, 17, float, ReduceMax);
-class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 18, float, ReduceMax);
-
-class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 1, 10, float, ReduceMean);
-class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 11, 12, float, ReduceMean);
-class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 13, 17, float, ReduceMean);
-class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 18, float, ReduceMean);
-
-class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 1, 10, float, ReduceMin);
-class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 11, 11, float, ReduceMin);
-class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 12, 12, float, ReduceMin);
-class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 13, 17, float, ReduceMin);
-class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 18, float, ReduceMin);
-
-class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 1, 10, float, ReduceProd);
-class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 11, 12, float, ReduceProd);
-class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 13, 17, float, ReduceProd);
-class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 18, float, ReduceProd);
-
-class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 1, 10, float, ReduceSum);
-class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 11, 12, float, ReduceSum);
-class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 13, float, ReduceSum);
-
-class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 1, 10, float, ReduceL1);
-class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 11, 12, float, ReduceL1);
-class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 13, 17, float, ReduceL1);
-class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 18, float, ReduceL1);
-
-class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 1, 10, float, ReduceL2);
-class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 11, 12, float, ReduceL2);
-class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 13, 17, float, ReduceL2);
-class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 18, float, ReduceL2);
-
-class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 1, 10, float, ReduceLogSum);
-class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 11, 12, float, ReduceLogSum);
-class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 13, 17, float, ReduceLogSum);
-class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 18, float, ReduceLogSum);
-
-class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 1, 10, float, ReduceSumSquare);
-class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 11, 12, float, ReduceSumSquare);
-class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 13, 17, float, ReduceSumSquare);
-class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 18, float, ReduceSumSquare);
-
-class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 1, 10, float, ReduceLogSumExp);
-class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 11, 12, float, ReduceLogSumExp);
-class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 13, 17, float, ReduceLogSumExp);
-class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 18, float, ReduceLogSumExp);
+class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 1, 10, ReduceMax);
+class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 11, 11, ReduceMax);
+class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 12, 12, ReduceMax);
+class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 13, 17, ReduceMax);
+class ONNX_OPERATOR_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 18, ReduceMax);
+
+class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 1, 10, ReduceMean);
+class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 11, 12, ReduceMean);
+class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 13, 17, ReduceMean);
+class ONNX_OPERATOR_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 18, ReduceMean);
+
+class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 1, 10, ReduceMin);
+class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 11, 11, ReduceMin);
+class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 12, 12, ReduceMin);
+class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 13, 17, ReduceMin);
+class ONNX_OPERATOR_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 18, ReduceMin);
+
+class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 1, 10, ReduceProd);
+class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 11, 12, ReduceProd);
+class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 13, 17, ReduceProd);
+class ONNX_OPERATOR_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 18, ReduceProd);
+
+class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 1, 10, ReduceSum);
+class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 11, 12, ReduceSum);
+class ONNX_OPERATOR_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 13, ReduceSum);
+
+class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 1, 10, ReduceL1);
+class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 11, 12, ReduceL1);
+class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 13, 17, ReduceL1);
+class ONNX_OPERATOR_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 18, ReduceL1);
+
+class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 1, 10, ReduceL2);
+class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 11, 12, ReduceL2);
+class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 13, 17, ReduceL2);
+class ONNX_OPERATOR_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 18, ReduceL2);
+
+class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 1, 10, ReduceLogSum);
+class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 11, 12, ReduceLogSum);
+class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 13, 17, ReduceLogSum);
+class ONNX_OPERATOR_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 18, ReduceLogSum);
+
+class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 1, 10, ReduceSumSquare);
+class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 11, 12, ReduceSumSquare);
+class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 13, 17, ReduceSumSquare);
+class ONNX_OPERATOR_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 18, ReduceSumSquare);
+
+class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 1, 10, ReduceLogSumExp);
+class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 11, 12, ReduceLogSumExp);
+class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 13, 17, ReduceLogSumExp);
+class ONNX_OPERATOR_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 18, ReduceLogSumExp);
 
 class ONNX_OPERATOR_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 10, ThresholdedRelu);
 
@@ -234,11 +234,11 @@ class ONNX_OPERATOR_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 13, Tra
 
 class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kJsExecutionProvider, kMSInternalNHWCDomain, 11, float, Conv);
 class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kJsExecutionProvider, kMSInternalNHWCDomain, 11, float, ConvTranspose);
-class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kJsExecutionProvider, kMSInternalNHWCDomain, 11, 11, float, MaxPool);
-class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kJsExecutionProvider, kMSInternalNHWCDomain, 12, float, MaxPool);
-class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kJsExecutionProvider, kMSInternalNHWCDomain, 11, float, AveragePool);
-class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kJsExecutionProvider, kMSInternalNHWCDomain, 1, float, GlobalAveragePool);
-class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kJsExecutionProvider, kMSInternalNHWCDomain, 1, float, GlobalMaxPool);
+class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kJsExecutionProvider, kMSInternalNHWCDomain, 11, 11, MaxPool);
+class ONNX_OPERATOR_KERNEL_CLASS_NAME(kJsExecutionProvider, kMSInternalNHWCDomain, 12, MaxPool);
+class ONNX_OPERATOR_KERNEL_CLASS_NAME(kJsExecutionProvider, kMSInternalNHWCDomain, 11, AveragePool);
+class ONNX_OPERATOR_KERNEL_CLASS_NAME(kJsExecutionProvider, kMSInternalNHWCDomain, 1, GlobalAveragePool);
+class ONNX_OPERATOR_KERNEL_CLASS_NAME(kJsExecutionProvider, kMSInternalNHWCDomain, 1, GlobalMaxPool);
 
 class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 1, 10, float, Conv);
 class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 11, float, Conv);
@@ -251,16 +251,16 @@ class ONNX_OPERATOR_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 13, Gem
 class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 1, 12, MatMul);
 class ONNX_OPERATOR_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 13, MatMul);
 
-class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 7, 9, float, AveragePool);
-class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 10, 10, float, AveragePool);
-class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 11, float, AveragePool);
-class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 1, float, GlobalAveragePool);
-class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 1, 7, float, MaxPool);
-class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 8, 9, float, MaxPool);
-class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 10, 10, float, MaxPool);
-class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 11, 11, float, MaxPool);
-class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 12, float, MaxPool);
-class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 1, float, GlobalMaxPool);
+class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 7, 9, AveragePool);
+class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 10, 10, AveragePool);
+class ONNX_OPERATOR_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 11, AveragePool);
+class ONNX_OPERATOR_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 1, GlobalAveragePool);
+class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 1, 7, MaxPool);
+class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 8, 9, MaxPool);
+class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 10, 10, MaxPool);
+class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 11, 11, MaxPool);
+class ONNX_OPERATOR_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 12, MaxPool);
+class ONNX_OPERATOR_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 1, GlobalMaxPool);
 
 class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 1, 10, float, ArgMax);
 class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 11, 12, float, ArgMax);
@@ -438,71 +438,71 @@ std::unique_ptr<KernelRegistry> RegisterKernels() {
       BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 1, 10, Squeeze)>,
       BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 11, 12, Squeeze)>,
       BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 13, Squeeze)>,
-      BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 1, 10, float, ReduceMax)>,
-      BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 11, 11, float, ReduceMax)>,
-      BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 12, 12, float, ReduceMax)>,
-      BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 13, 17, float, ReduceMax)>,
-      BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 18, float, ReduceMax)>,
+      BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 1, 10, ReduceMax)>,
+      BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 11, 11, ReduceMax)>,
+      BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 12, 12, ReduceMax)>,
+      BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 13, 17, ReduceMax)>,
+      BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 18, ReduceMax)>,
 
-      BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 1, 10, float, ReduceMean)>,
-      BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 11, 12, float, ReduceMean)>,
-      BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 13, 17, float, ReduceMean)>,
-      BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 18, float, ReduceMean)>,
+      BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 1, 10, ReduceMean)>,
+      BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 11, 12, ReduceMean)>,
+      BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 13, 17, ReduceMean)>,
+      BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 18, ReduceMean)>,
 
       BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 1, 10, Unsqueeze)>,
       BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 11, 12, Unsqueeze)>,
       BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 13, Unsqueeze)>,
 
-      BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 1, 10, float, ReduceMin)>,
-      BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 11, 11, float, ReduceMin)>,
-      BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 12, 12, float, ReduceMin)>,
-      BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 13, 17, float, ReduceMin)>,
-      BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 18, float, ReduceMin)>,
-
-      BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 1, 10, float, ReduceProd)>,
-      BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 11, 12, float, ReduceProd)>,
-      BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 13, 17, float, ReduceProd)>,
-      BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 18, float, ReduceProd)>,
-
-      BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 1, 10, float, ReduceSum)>,
-      BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 11, 12, float, ReduceSum)>,
-      BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 13, float, ReduceSum)>,
-
-      BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 1, 10, float, ReduceL1)>,
-      BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 11, 12, float, ReduceL1)>,
-      BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 13, 17, float, ReduceL1)>,
-      BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 18, float, ReduceL1)>,
-
-      BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 1, 10, float, ReduceL2)>,
-      BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 11, 12, float, ReduceL2)>,
-      BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 13, 17, float, ReduceL2)>,
-      BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 18, float, ReduceL2)>,
-
-      BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 1, 10, float, ReduceLogSum)>,
-      BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 11, 12, float, ReduceLogSum)>,
-      BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 13, 17, float, ReduceLogSum)>,
-      BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 18, float, ReduceLogSum)>,
-
-      BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 1, 10, float, ReduceSumSquare)>,
-      BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 11, 12, float, ReduceSumSquare)>,
-      BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 13, 17, float, ReduceSumSquare)>,
-      BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 18, float, ReduceSumSquare)>,
-
-      BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 1, 10, float, ReduceLogSumExp)>,
-      BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 11, 12, float, ReduceLogSumExp)>,
-      BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 13, 17, float, ReduceLogSumExp)>,
-      BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 18, float, ReduceLogSumExp)>,
+      BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 1, 10, ReduceMin)>,
+      BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 11, 11, ReduceMin)>,
+      BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 12, 12, ReduceMin)>,
+      BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 13, 17, ReduceMin)>,
+      BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 18, ReduceMin)>,
+
+      BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 1, 10, ReduceProd)>,
+      BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 11, 12, ReduceProd)>,
+      BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 13, 17, ReduceProd)>,
+      BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 18, ReduceProd)>,
+
+      BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 1, 10, ReduceSum)>,
+      BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 11, 12, ReduceSum)>,
+      BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 13, ReduceSum)>,
+
+      BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 1, 10, ReduceL1)>,
+      BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 11, 12, ReduceL1)>,
+      BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 13, 17, ReduceL1)>,
+      BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 18, ReduceL1)>,
+
+      BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 1, 10, ReduceL2)>,
+      BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 11, 12, ReduceL2)>,
+      BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 13, 17, ReduceL2)>,
+      BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 18, ReduceL2)>,
+
+      BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 1, 10, ReduceLogSum)>,
+      BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 11, 12, ReduceLogSum)>,
+      BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 13, 17, ReduceLogSum)>,
+      BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 18, ReduceLogSum)>,
+
+      BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 1, 10, ReduceSumSquare)>,
+      BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 11, 12, ReduceSumSquare)>,
+      BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 13, 17, ReduceSumSquare)>,
+      BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 18, ReduceSumSquare)>,
+
+      BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 1, 10, ReduceLogSumExp)>,
+      BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 11, 12, ReduceLogSumExp)>,
+      BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 13, 17, ReduceLogSumExp)>,
+      BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 18, ReduceLogSumExp)>,
 
       BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 1, 12, Transpose)>,
       BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 13, Transpose)>,
 
       BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kJsExecutionProvider, kMSInternalNHWCDomain, 11, float, Conv)>,
       BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kJsExecutionProvider, kMSInternalNHWCDomain, 11, float, ConvTranspose)>,
-      BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kJsExecutionProvider, kMSInternalNHWCDomain, 11, 11, float, MaxPool)>,
-      BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kJsExecutionProvider, kMSInternalNHWCDomain, 12, float, MaxPool)>,
-      BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kJsExecutionProvider, kMSInternalNHWCDomain, 11, float, AveragePool)>,
-      BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kJsExecutionProvider, kMSInternalNHWCDomain, 1, float, GlobalAveragePool)>,
-      BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kJsExecutionProvider, kMSInternalNHWCDomain, 1, float, GlobalMaxPool)>,
+      BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kJsExecutionProvider, kMSInternalNHWCDomain, 11, 11, MaxPool)>,
+      BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kJsExecutionProvider, kMSInternalNHWCDomain, 12, MaxPool)>,
+      BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kJsExecutionProvider, kMSInternalNHWCDomain, 11, AveragePool)>,
+      BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kJsExecutionProvider, kMSInternalNHWCDomain, 1, GlobalAveragePool)>,
+      BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kJsExecutionProvider, kMSInternalNHWCDomain, 1, GlobalMaxPool)>,
 
       BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 1, 10, float, Conv)>,
       BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 11, float, Conv)>,
@@ -515,16 +515,16 @@ std::unique_ptr<KernelRegistry> RegisterKernels() {
       BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 1, 12, MatMul)>,
       BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 13, MatMul)>,
 
-      BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 7, 9, float, AveragePool)>,
-      BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 10, 10, float, AveragePool)>,
-      BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 11, float, AveragePool)>,
-      BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 1, float, GlobalAveragePool)>,
-      BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 1, 7, float, MaxPool)>,
-      BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 8, 9, float, MaxPool)>,
-      BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 10, 10, float, MaxPool)>,
-      BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 11, 11, float, MaxPool)>,
-      BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 12, float, MaxPool)>,
-      BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 1, float, GlobalMaxPool)>,
+      BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 7, 9, AveragePool)>,
+      BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 10, 10, AveragePool)>,
+      BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 11, AveragePool)>,
+      BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 1, GlobalAveragePool)>,
+      BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 1, 7, MaxPool)>,
+      BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 8, 9, MaxPool)>,
+      BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 10, 10, MaxPool)>,
+      BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 11, 11, MaxPool)>,
+      BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 12, MaxPool)>,
+      BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 1, GlobalMaxPool)>,
 
       BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 1, 10, float, ArgMax)>,
       BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 11, 12, float, ArgMax)>,
diff --git a/onnxruntime/core/providers/js/operators/pool.cc b/onnxruntime/core/providers/js/operators/pool.cc
index 03e6caef7e5b8..7fdb4e5d114ea 100644
--- a/onnxruntime/core/providers/js/operators/pool.cc
+++ b/onnxruntime/core/providers/js/operators/pool.cc
@@ -8,69 +8,65 @@
 namespace onnxruntime {
 namespace js {
 
-#define POOLING_KERNEL(op_name, domain, is_channels_last, data_type, pool_type, since_version)     \
-  ONNX_OPERATOR_TYPED_KERNEL_EX(                                                                   \
-      op_name,                                                                                     \
-      domain,                                                                                      \
-      since_version,                                                                               \
-      data_type,                                                                                   \
-      kJsExecutionProvider,                                                                        \
-      (*KernelDefBuilder::Create()).TypeConstraint("T", DataTypeImpl::GetTensorType<data_type>()), \
-      Pool<data_type, pool_type, is_channels_last>);
+#define POOLING_KERNEL(op_name, domain, is_channels_last, pool_type, since_version) \
+  ONNX_OPERATOR_KERNEL_EX(                                                          \
+      op_name,                                                                      \
+      domain,                                                                       \
+      since_version,                                                                \
+      kJsExecutionProvider,                                                         \
+      (*KernelDefBuilder::Create()).TypeConstraint("T", JsepSupportedFloatTypes()), \
+      Pool<pool_type, is_channels_last>);
 
-#define POOLING_KERNEL_VERSIONED(op_name, domain, is_channels_last, data_type, pool_type, since_version, end_version) \
-  ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_EX(                                                                            \
-      op_name,                                                                                                        \
-      domain,                                                                                                         \
-      since_version,                                                                                                  \
-      end_version,                                                                                                    \
-      data_type,                                                                                                      \
-      kJsExecutionProvider,                                                                                           \
-      (*KernelDefBuilder::Create())                                                                                   \
-          .TypeConstraint("T", DataTypeImpl::GetTensorType<data_type>()),                                             \
-      Pool<data_type, pool_type, is_channels_last>);
+#define POOLING_KERNEL_VERSIONED(op_name, domain, is_channels_last, pool_type, since_version, end_version) \
+  ONNX_OPERATOR_VERSIONED_KERNEL_EX(                                                                       \
+      op_name,                                                                                             \
+      domain,                                                                                              \
+      since_version,                                                                                       \
+      end_version,                                                                                         \
+      kJsExecutionProvider,                                                                                \
+      (*KernelDefBuilder::Create())                                                                        \
+          .TypeConstraint("T", JsepSupportedFloatTypes()),                                                 \
+      Pool<pool_type, is_channels_last>);
 
-#define POOLING_KERNEL_WITH_INDICES(op_name, domain, is_channels_last, data_type, pool_type, since_version) \
-  ONNX_OPERATOR_TYPED_KERNEL_EX(                                                                            \
-      op_name,                                                                                              \
-      domain,                                                                                               \
-      since_version,                                                                                        \
-      data_type,                                                                                            \
-      kJsExecutionProvider,                                                                                 \
-      (*KernelDefBuilder::Create())                                                                         \
-          .TypeConstraint("T", DataTypeImpl::GetTensorType<data_type>())                                    \
-          .TypeConstraint("I", DataTypeImpl::GetTensorType<int64_t>()),                                     \
-      Pool<data_type, pool_type, is_channels_last>);
+#define POOLING_KERNEL_WITH_INDICES(op_name, domain, is_channels_last, pool_type, since_version) \
+  ONNX_OPERATOR_KERNEL_EX(                                                                       \
+      op_name,                                                                                   \
+      domain,                                                                                    \
+      since_version,                                                                             \
+      kJsExecutionProvider,                                                                      \
+      (*KernelDefBuilder::Create())                                                              \
+          .TypeConstraint("T", JsepSupportedFloatTypes())                                        \
+          .TypeConstraint("I", DataTypeImpl::GetTensorType<int64_t>()),                          \
+      Pool<pool_type, is_channels_last>);
 
-#define POOLING_KERNEL_VERSIONED_WITH_INDICES(op_name, domain, is_channels_last, data_type, pool_type, since_version, end_version) \
-  ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_EX(                                                                                         \
-      op_name,                                                                                                                     \
-      domain,                                                                                                                      \
-      since_version,                                                                                                               \
-      end_version,                                                                                                                 \
-      data_type,                                                                                                                   \
-      kJsExecutionProvider,                                                                                                        \
-      (*KernelDefBuilder::Create())                                                                                                \
-          .TypeConstraint("T", DataTypeImpl::GetTensorType<data_type>())                                                           \
-          .TypeConstraint("I", DataTypeImpl::GetTensorType<int64_t>()),                                                            \
-      Pool<data_type, pool_type, is_channels_last>);
+#define POOLING_KERNEL_VERSIONED_WITH_INDICES(op_name, domain, is_channels_last, pool_type, since_version, end_version) \
+  ONNX_OPERATOR_VERSIONED_KERNEL_EX(                                                                                    \
+      op_name,                                                                                                          \
+      domain,                                                                                                           \
+      since_version,                                                                                                    \
+      end_version,                                                                                                      \
+      kJsExecutionProvider,                                                                                             \
+      (*KernelDefBuilder::Create())                                                                                     \
+          .TypeConstraint("T", JsepSupportedFloatTypes())                                                               \
+          .TypeConstraint("I", DataTypeImpl::GetTensorType<int64_t>()),                                                 \
+      Pool<pool_type, is_channels_last>);
 
-POOLING_KERNEL_VERSIONED(AveragePool, kOnnxDomain, false, float, AveragePool, 7, 9)
-POOLING_KERNEL_VERSIONED(AveragePool, kOnnxDomain, false, float, AveragePool, 10, 10)
-POOLING_KERNEL(AveragePool, kOnnxDomain, false, float, AveragePool, 11)
-POOLING_KERNEL(AveragePool, kMSInternalNHWCDomain, true, float, AveragePool, 11)
-POOLING_KERNEL(GlobalAveragePool, kOnnxDomain, false, float, AveragePool, 1)
-POOLING_KERNEL(GlobalAveragePool, kMSInternalNHWCDomain, true, float, AveragePool, 1)
+POOLING_KERNEL_VERSIONED(AveragePool, kOnnxDomain, false, AveragePool, 7, 9)
+POOLING_KERNEL_VERSIONED(AveragePool, kOnnxDomain, false, AveragePool, 10, 10)
+POOLING_KERNEL(AveragePool, kOnnxDomain, false, AveragePool, 11)
+POOLING_KERNEL(AveragePool, kMSInternalNHWCDomain, true, AveragePool, 11)
+POOLING_KERNEL(GlobalAveragePool, kOnnxDomain, false, AveragePool, 1)
+POOLING_KERNEL(GlobalAveragePool, kMSInternalNHWCDomain, true, AveragePool, 1)
 
-POOLING_KERNEL_VERSIONED(MaxPool, kOnnxDomain, false, float, MaxPool<1>, 1, 7)
-POOLING_KERNEL_VERSIONED_WITH_INDICES(MaxPool, kOnnxDomain, false, float, MaxPool<8>, 8, 9)
-POOLING_KERNEL_VERSIONED_WITH_INDICES(MaxPool, kOnnxDomain, false, float, MaxPool<8>, 10, 10)
-POOLING_KERNEL_VERSIONED_WITH_INDICES(MaxPool, kOnnxDomain, false, float, MaxPool<8>, 11, 11)
-POOLING_KERNEL_VERSIONED_WITH_INDICES(MaxPool, kMSInternalNHWCDomain, true, float, MaxPool<8>, 11, 11)
-POOLING_KERNEL_WITH_INDICES(MaxPool, kOnnxDomain, false, float, MaxPool<8>, 12)
-POOLING_KERNEL_WITH_INDICES(MaxPool, kMSInternalNHWCDomain, true, float, MaxPool<8>, 12)
-POOLING_KERNEL(GlobalMaxPool, kOnnxDomain, false, float, MaxPool<1>, 1)
-POOLING_KERNEL(GlobalMaxPool, kMSInternalNHWCDomain, true, float, MaxPool<1>, 1)
+POOLING_KERNEL_VERSIONED(MaxPool, kOnnxDomain, false, MaxPool<1>, 1, 7)
+POOLING_KERNEL_VERSIONED_WITH_INDICES(MaxPool, kOnnxDomain, false, MaxPool<8>, 8, 9)
+POOLING_KERNEL_VERSIONED_WITH_INDICES(MaxPool, kOnnxDomain, false, MaxPool<8>, 10, 10)
+POOLING_KERNEL_VERSIONED_WITH_INDICES(MaxPool, kOnnxDomain, false, MaxPool<8>, 11, 11)
+POOLING_KERNEL_VERSIONED_WITH_INDICES(MaxPool, kMSInternalNHWCDomain, true, MaxPool<8>, 11, 11)
+POOLING_KERNEL_WITH_INDICES(MaxPool, kOnnxDomain, false, MaxPool<8>, 12)
+POOLING_KERNEL_WITH_INDICES(MaxPool, kMSInternalNHWCDomain, true, MaxPool<8>, 12)
+POOLING_KERNEL(GlobalMaxPool, kOnnxDomain, false, MaxPool<1>, 1)
+POOLING_KERNEL(GlobalMaxPool, kMSInternalNHWCDomain, true, MaxPool<1>, 1)
 
 }  // namespace js
 }  // namespace onnxruntime
diff --git a/onnxruntime/core/providers/js/operators/pool.h b/onnxruntime/core/providers/js/operators/pool.h
index 5dbe5d0b8881d..5723123c0c3b8 100644
--- a/onnxruntime/core/providers/js/operators/pool.h
+++ b/onnxruntime/core/providers/js/operators/pool.h
@@ -41,7 +41,7 @@ namespace js {
 #define GLOBAL_POOL_ATTRIBUTES_JS_OBJ_MAPPING ({"format" : $1 ? "NHWC" : "NCHW"})
 #define GLOBAL_POOL_ATTRIBUTES_PARAM_LIST static_cast<int32_t>(is_channels_last)
 
-template <typename T, typename PoolType, bool is_channels_last>
+template <typename PoolType, bool is_channels_last>
 class Pool : public JsKernel, public PoolBase {
  public:
   Pool(const OpKernelInfo& info) : JsKernel(info), PoolBase(info) {
@@ -65,10 +65,10 @@ class Pool : public JsKernel, public PoolBase {
   }
 };
 
-template <typename T, bool is_channels_last>
-class Pool<T, MaxPool<8>, is_channels_last> final : public Pool<T, MaxPool<1>, is_channels_last> {
+template <bool is_channels_last>
+class Pool<MaxPool<8>, is_channels_last> final : public Pool<MaxPool<1>, is_channels_last> {
  public:
-  Pool(const OpKernelInfo& info) : Pool<T, MaxPool<1>, is_channels_last>(info) {}
+  Pool(const OpKernelInfo& info) : Pool<MaxPool<1>, is_channels_last>(info) {}
 };
 
 }  // namespace js
diff --git a/onnxruntime/core/providers/js/operators/reduce.cc b/onnxruntime/core/providers/js/operators/reduce.cc
index 21854fccc37ca..2679cfed86124 100644
--- a/onnxruntime/core/providers/js/operators/reduce.cc
+++ b/onnxruntime/core/providers/js/operators/reduce.cc
@@ -7,32 +7,30 @@ namespace onnxruntime {
 namespace js {
 
 #define REGISTER_REDUCE_ELEMENTWISE_VERSIONED_KERNEL(ReduceOp, sinceVersion, endVersion) \
-  ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_EX(                                               \
+  ONNX_OPERATOR_VERSIONED_KERNEL_EX(                                                     \
       ReduceOp,                                                                          \
       kOnnxDomain,                                                                       \
       sinceVersion, endVersion,                                                          \
-      float,                                                                             \
       kJsExecutionProvider,                                                              \
       (*KernelDefBuilder::Create())                                                      \
-          .TypeConstraint("T", DataTypeImpl::GetTensorType<float>()),                    \
-      ReduceOp<float>);
+          .TypeConstraint("T", JsepSupportedFloatTypes()),                               \
+      ReduceOp<true>);
 
 // macro REGISTER_REDUCE_ELEMENTWISE_VERSIONED_KERNEL does not set .InputMemoryType(OrtMemTypeCPU, 1), so in future if
 // a new opset version update applies to Reduce* operators, we may need to add another macro like
 // REGISTER_REDUCE_ELEMENTWISE_VERSIONED_KERNEL_WITH_AXIS_IN_INPUT to set input memory type.
 // i.e. we cannot use REGISTER_REDUCE_ELEMENTWISE_VERSIONED_KERNEL to version 18 when the opset version is increased.
 
-#define REGISTER_REDUCE_ELEMENTWISE_KERNEL(ReduceOp, sinceVersion)   \
-  ONNX_OPERATOR_TYPED_KERNEL_EX(                                     \
-      ReduceOp,                                                      \
-      kOnnxDomain,                                                   \
-      sinceVersion,                                                  \
-      float,                                                         \
-      kJsExecutionProvider,                                          \
-      (*KernelDefBuilder::Create())                                  \
-          .TypeConstraint("T", DataTypeImpl::GetTensorType<float>()) \
-          .InputMemoryType(OrtMemTypeCPU, 1),                        \
-      ReduceOp<float>);
+#define REGISTER_REDUCE_ELEMENTWISE_KERNEL(ReduceOp, sinceVersion) \
+  ONNX_OPERATOR_KERNEL_EX(                                         \
+      ReduceOp,                                                    \
+      kOnnxDomain,                                                 \
+      sinceVersion,                                                \
+      kJsExecutionProvider,                                        \
+      (*KernelDefBuilder::Create())                                \
+          .TypeConstraint("T", JsepSupportedFloatTypes())          \
+          .InputMemoryType(OrtMemTypeCPU, 1),                      \
+      ReduceOp<true>);
 
 REGISTER_REDUCE_ELEMENTWISE_VERSIONED_KERNEL(ReduceMean, 1, 10);
 REGISTER_REDUCE_ELEMENTWISE_VERSIONED_KERNEL(ReduceMean, 11, 12);
diff --git a/onnxruntime/core/providers/js/operators/reduce.h b/onnxruntime/core/providers/js/operators/reduce.h
index 19a6d298c7696..a5a4aa834c2ca 100644
--- a/onnxruntime/core/providers/js/operators/reduce.h
+++ b/onnxruntime/core/providers/js/operators/reduce.h
@@ -9,7 +9,7 @@
 namespace onnxruntime {
 namespace js {
 #define JSEP_DEFINE_REDUCE_KERNEL(ReduceKernel)                                                              \
-  template <typename T, bool allow_multi_axes = true>                                                        \
+  template <bool allow_multi_axes = true>                                                                    \
   class ReduceKernel : public JsKernel, public ReduceKernelBase<allow_multi_axes> {                          \
    public:                                                                                                   \
     using ReduceKernelBase<allow_multi_axes>::axes_;                                                         \

From 6b7bce5ec992f2b3333ee22066201f53e7978faf Mon Sep 17 00:00:00 2001
From: pengwa <pengwa@microsoft.com>
Date: Fri, 22 Sep 2023 08:54:25 +0800
Subject: [PATCH 07/14] Model post process for zero stage3 training (#17187)

### Model post process for zero stage3 training

This is the last change needed to make single-GPU and multi-GPU runs pass.

Design details:
https://microsoft.sharepoint.com/:p:/t/ONNX2/EfNfJ43necpIoPI6x5M2zvYBVbfjoPQmG4Boc_F7-tHm1w?e=ekQwA6&nav=eyJzSWQiOjMxNiwiY0lkIjoxMDE1Nzg3NDZ9

`PyTorch` runs with ZeROOffloadSubscriber:

```
  model = prepare_model(...)
  from onnxruntime.training.utils.hooks import configure_ort_compatible_zero_stage3
  configure_ort_compatible_zero_stage3()
```

`ORTModule` runs with ZeROOffloadSubscriber:

```
  os.environ['ORTMODULE_ENABLE_ZERO_STAGE3'] = '1'
  from onnxruntime.training.ortmodule import ORTModule
  model = ORTModule(self.model)
```

It will be fairly easy to debug convergence issues if both ORT and
PyTorch can run the same offload path.

### Motivation and Context
<!-- - Why is this change required? What problem does it solve?
- If it fixes an open issue, please link to the issue here. -->
---
 .../_custom_autograd_function_exporter.py     |  28 +-
 .../_custom_autograd_function_runner.py       |  10 +
 .../ortmodule/_graph_execution_manager.py     |  62 +++-
 .../training/ortmodule/_inference_manager.py  |   4 +
 .../python/training/ortmodule/_io.py          |   8 +-
 .../training/ortmodule/_training_manager.py   |   4 +
 .../ortmodule/_zero_stage3_compatibility.py   | 312 ++++++++++++++++++
 .../python/training/utils/__init__.py         |   3 +-
 .../utils/hooks/_statistics_subscriber.py     | 171 +++++-----
 .../utils/hooks/_subscriber_manager.py        |  17 +-
 .../utils/hooks/_zero_offload_subscriber.py   | 155 ++++++---
 .../python/training/utils/torch_type_map.py   |   9 +
 .../torch_custom_function_kernel_base.cc      |   7 +-
 13 files changed, 619 insertions(+), 171 deletions(-)
 create mode 100644 orttraining/orttraining/python/training/ortmodule/_zero_stage3_compatibility.py

diff --git a/orttraining/orttraining/python/training/ortmodule/_custom_autograd_function_exporter.py b/orttraining/orttraining/python/training/ortmodule/_custom_autograd_function_exporter.py
index 4c72b6d98a088..f75d553a5f460 100644
--- a/orttraining/orttraining/python/training/ortmodule/_custom_autograd_function_exporter.py
+++ b/orttraining/orttraining/python/training/ortmodule/_custom_autograd_function_exporter.py
@@ -28,7 +28,8 @@ class PythonOpShapeInferStore:
 
     @classmethod
     def register(cls, kclass: torch.autograd.Function) -> None:
-        """Register a shape inference function for a torch.autograd.Function if there is staticmethod "infer_shape" defined.
+        """Register a shape inference function for a torch.autograd.Function if there is staticmethod
+        "infer_shape" defined.
 
         The signature of the shape inference function should be:
             @staticmethod
@@ -51,6 +52,11 @@ def infer_shape(
         if hasattr(kclass, "infer_shape") and kclass_name not in cls._CLASS_MAP:
             cls._CLASS_MAP[kclass_name] = kclass.infer_shape
 
+    @classmethod
+    def register_func(cls, name: str, func: Callable) -> None:
+        """Register `func` as the shape inference function for `name`, replacing any existing entry."""
+        cls._CLASS_MAP[name] = func
+
     @classmethod
     def get_shape_infer(cls, name: str) -> Optional[Callable]:
         return cls._CLASS_MAP.get(name, None)
@@ -228,9 +234,9 @@ def _export_pt_1_10(g, n, *args, **kwargs):
                 input_float_tuples.extend(list(arg))
                 continue
 
-            is_inspect_activation = (
-                func_full_qual_name == "onnxruntime.training.utils.hooks._subscriber_manager._InspectActivation"
-            )
+            from onnxruntime.training.utils.hooks._statistics_subscriber import _InspectActivation
+
+            is_inspect_activation = func_full_qual_name == get_fully_qualified_class_name(_InspectActivation)
             if is_inspect_activation and isinstance(arg, str):
                 # _InspectActivation is a special case where the first argument is a string
                 # that is used to determine the activation name to be inspected.
@@ -307,14 +313,7 @@ def _export_pt_1_10(g, n, *args, **kwargs):
 _export = wrap_custom_export_function(_export_pt_1_10)
 
 
-def _post_process_after_export(exported_model: ModelProto, enable_custom_autograd_function: bool) -> ModelProto:
-    """Post process the exported model."""
-    if enable_custom_autograd_function:
-        exported_model = _post_process_enabling_autograd_function(exported_model)
-    return exported_model
-
-
-def _post_process_enabling_autograd_function(exported_model: ModelProto) -> ModelProto:
+def post_process_enabling_autograd_function(exported_model: ModelProto) -> ModelProto:
     # Loop all PythonOp, append "_ctx" as the first output.
     index = 0
     for node in exported_model.graph.node:
@@ -330,8 +329,7 @@ def _post_process_enabling_autograd_function(exported_model: ModelProto) -> Mode
                     op_name_prefix = kclass_name
                     break
 
-        if not node.name:
-            node.name = f"{op_name_prefix}_id_{index}"
-            index += 1
+        node.name = f"{op_name_prefix}_id_{index}"
+        index += 1
 
     return exported_model
diff --git a/orttraining/orttraining/python/training/ortmodule/_custom_autograd_function_runner.py b/orttraining/orttraining/python/training/ortmodule/_custom_autograd_function_runner.py
index 845c7d83c2e7b..a5b96c4e37140 100644
--- a/orttraining/orttraining/python/training/ortmodule/_custom_autograd_function_runner.py
+++ b/orttraining/orttraining/python/training/ortmodule/_custom_autograd_function_runner.py
@@ -376,6 +376,16 @@ def wrap_all_outputs(result):
             result = backward_function(*wrapped_args)
 
             # Extract results as DLPack tensor list.
+            if isinstance(result, torch.Tensor):
+                result = [result]
+            elif isinstance(result, (tuple, list)):
+                result = list(result)
+            else:
+                raise wrap_exception(
+                    ORTModuleIOError,
+                    TypeError(f"ORTModule does not support the following model output type {type(result)}."),
+                )
+
             wrapped_returned_args = wrap_all_outputs(result)
 
             torch_interop_utils.unregister_grad_fn(id(ctx))
diff --git a/orttraining/orttraining/python/training/ortmodule/_graph_execution_manager.py b/orttraining/orttraining/python/training/ortmodule/_graph_execution_manager.py
index 2227b630aee23..dfaac5f0fa836 100755
--- a/orttraining/orttraining/python/training/ortmodule/_graph_execution_manager.py
+++ b/orttraining/orttraining/python/training/ortmodule/_graph_execution_manager.py
@@ -19,11 +19,10 @@
 import onnxruntime
 from onnxruntime.capi import _pybind_state as C
 from onnxruntime.tools.symbolic_shape_infer import SymbolicShapeInference
-from onnxruntime.training.utils import ORTModelInputOutputSchemaType
+from onnxruntime.training.utils import ORTModelInputOutputSchemaType, onnx_dtype_to_pytorch
 from onnxruntime.training.utils.hooks import configure_ort_compatible_zero_stage3
 
 from . import _are_deterministic_algorithms_enabled, _io, _logger, _onnx_models, _utils
-from ._custom_autograd_function_exporter import _post_process_after_export
 from ._fallback import (
     ORTModuleDeviceException,
     ORTModuleONNXModelException,
@@ -141,9 +140,14 @@ def __init__(
 
             register_triton_op_executor()
 
+        self._zero_stage3_param_map = {}
         if self._runtime_options.enable_zero_stage3_support:
             # Cannot toggle feature enabling/disabling after the first time enabled.
-            configure_ort_compatible_zero_stage3()
+            from onnxruntime.training.utils.hooks._zero_offload_subscriber import _get_all_zero_stage3_params
+
+            self._zero_stage3_param_map = _get_all_zero_stage3_params(self._flattened_module)
+
+            configure_ort_compatible_zero_stage3(debug=False, stats_output_dir="ort_output", stats_overwrite=True)
 
     def _get_torch_gpu_allocator_function_addresses(self):
         if self._runtime_options.use_external_gpu_allocator and torch.cuda.is_available():
@@ -345,7 +349,8 @@ def _get_exported_model(self, input_schema: ORTModelInputOutputSchemaType, *inpu
             )
             if os.path.exists(cache_dir) and os.path.isfile(filename):
                 self._logger.info(
-                    f"Cached model detected! Cached model will be used to save export and initialization time. If you want the model to be re-exported then DELETE {filename}."
+                    f"Cached model detected! Cached model will be used to save export and initialization time. "
+                    f"If you want the model to be re-exported then DELETE {filename}."
                 )
                 exported_model = onnx.load(filename)
                 return exported_model
@@ -409,9 +414,24 @@ def _get_exported_model(self, input_schema: ORTModelInputOutputSchemaType, *inpu
             )
         exported_model = onnx.load_model_from_string(f.getvalue())
 
-        exported_model = _post_process_after_export(
-            exported_model, self._runtime_options.enable_custom_autograd_function
-        )
+        if self._runtime_options.enable_custom_autograd_function:
+            from ._custom_autograd_function_exporter import post_process_enabling_autograd_function
+
+            exported_model = post_process_enabling_autograd_function(exported_model)
+
+        if self._runtime_options.enable_zero_stage3_support:
+            from ._zero_stage3_compatibility import post_processing_enable_zero_stage3_compat
+
+            exported_model = post_processing_enable_zero_stage3_compat(
+                exported_model,
+                self._zero_stage3_param_map,
+                [name for name, _ in self._flattened_module.named_parameters()],
+            )
+
+            # Cannot append pull weight trigger name to input names as following, otherwise, the later check (
+            # https://github.com/microsoft/onnxruntime/blob/068300d97eb25e5b52324e7af54a45ed1fa6a4c3/orttraining/orttraining/python/training/ortmodule/_training_manager.py#L466C18-L466C18)
+            # find input info mismatch, will re-initialize the graph builder.
+            # self._input_info.require_grad_names.append(STAGE3_PULL_WEIGHT_TRIGGER_NAME)
 
         # Cache model for future runs
         if cache_dir:
@@ -477,7 +497,14 @@ def _initialize_graph_builder(self):
         grad_builder_config = C.OrtModuleGraphBuilderConfiguration()
         grad_builder_config.initializer_names = initializer_names
         grad_builder_config.initializer_names_to_train = initializer_names_to_train
-        grad_builder_config.input_names_require_grad = self._input_info.require_grad_names
+
+        input_names_require_grad = self._input_info.require_grad_names
+        if self._runtime_options.enable_zero_stage3_support:
+            from ._zero_stage3_compatibility import STAGE3_PULL_WEIGHT_TRIGGER_NAME
+
+            # Add stage3 pull weight trigger name to require_grad_names, so that it will be included in the gradient graph.
+            input_names_require_grad.append(STAGE3_PULL_WEIGHT_TRIGGER_NAME)
+        grad_builder_config.input_names_require_grad = input_names_require_grad
         grad_builder_config.build_gradient_graph = self._export_mode == torch.onnx.TrainingMode.TRAINING
         grad_builder_config.enable_caching = self._runtime_options.enable_grad_acc_optimization
         grad_builder_config.loglevel = _logger.ortmodule_loglevel_to_onnxruntime_c_loglevel(
@@ -553,6 +580,9 @@ def _enable_conditional_optimizations(
                     inputs, kwargs
                 )
 
+                if self._runtime_options.enable_zero_stage3_support:
+                    self._append_pull_weight_trigger_as_input(kwargs, detected_device)
+
                 _, embed_sparsity_results, label_sparsity_results = _io._combine_input_buffers_initializers(
                     self._graph_initializers,
                     self._graph_builder.get_graph_info().user_input_names,
@@ -562,6 +592,7 @@ def _enable_conditional_optimizations(
                     kwargs,
                     detected_device,
                     self._runtime_inspector,
+                    self._zero_stage3_param_map,
                 )
 
                 # Enable sparsity-based optimization when applicable.
@@ -587,6 +618,21 @@ def _enable_conditional_optimizations(
         if self._runtime_options.print_memory_stat:
             self._runtime_inspector.enable_memory_inspector(self._original_module)
 
+    def _append_pull_weight_trigger_as_input(self, kwargs: Dict, device: torch.device):
+        """Insert the ZeRO stage3 pull-weight trigger tensor into `kwargs` (in place) and return `kwargs`."""
+        from . import _zero_stage3_compatibility as stage3
+
+        trigger_dtype = onnx_dtype_to_pytorch(stage3.STAGE3_PULL_WEIGHT_TRIGGER_OUTPUT_DTYPE)
+        # Zero-filled placeholder created on `device`; `requires_grad_()` flags it in place as requiring grad.
+        pull_weight_trigger = torch.zeros(
+            stage3.STAGE3_PULL_WEIGHT_TRIGGER_OUTPUT_SHAPE,
+            dtype=trigger_dtype,
+            device=device,
+        )
+        kwargs[stage3.STAGE3_PULL_WEIGHT_TRIGGER_NAME] = pull_weight_trigger.requires_grad_()
+
+        return kwargs
+
     def _log_feature_stats(self):
         if get_rank() != 0:
             return
diff --git a/orttraining/orttraining/python/training/ortmodule/_inference_manager.py b/orttraining/orttraining/python/training/ortmodule/_inference_manager.py
index b7c01a1f5baf9..8d8be81c549d1 100644
--- a/orttraining/orttraining/python/training/ortmodule/_inference_manager.py
+++ b/orttraining/orttraining/python/training/ortmodule/_inference_manager.py
@@ -159,6 +159,9 @@ def forward(self, *inputs, **kwargs):
                 # Assert that the input and model device match
                 _utils._check_same_device(self._device, "Input argument to forward", *inputs)
 
+            if self._runtime_options.enable_zero_stage3_support:
+                self._append_pull_weight_trigger_as_input(kwargs, self._device)
+
             prepared_input_list, _, _ = _io._combine_input_buffers_initializers(
                 self._graph_initializers,
                 self._graph_info.user_input_names,
@@ -168,6 +171,7 @@ def forward(self, *inputs, **kwargs):
                 kwargs,
                 self._device,
                 self._runtime_inspector,
+                self._zero_stage3_param_map,
             )
 
             user_outputs, _ = InferenceManager.execution_session_run_forward(
diff --git a/orttraining/orttraining/python/training/ortmodule/_io.py b/orttraining/orttraining/python/training/ortmodule/_io.py
index 18b965c549645..e7c1b30daae0d 100644
--- a/orttraining/orttraining/python/training/ortmodule/_io.py
+++ b/orttraining/orttraining/python/training/ortmodule/_io.py
@@ -168,6 +168,7 @@ def _combine_input_buffers_initializers(
     kwargs: Mapping[str, ORTModelInputOutputType],
     device: torch.device,
     rt_inspector: RuntimeInspector,
+    zero_stage3_offload_param_map: Optional[Dict[str, torch.nn.parameter.Parameter]],
 ):
     """Creates forward `*inputs` list from user input and PyTorch initializers
 
@@ -254,7 +255,12 @@ def _expand_inputs(current_input, non_none_inputs, name=""):
             )
 
     # params is a list of all initializers known to the onnx graph
-    result.extend(params)
+    if zero_stage3_offload_param_map:
+        # Match by identity: `in` on tensors falls back to element-wise `==`, which is costly and can raise.
+        offloaded_param_ids = {id(p) for p in zero_stage3_offload_param_map.values()}
+        result.extend(p for p in params if id(p) not in offloaded_param_ids)
+    else:
+        result.extend(params)
 
     return result, embed_sparsity_results, label_sparsity_results
 
diff --git a/orttraining/orttraining/python/training/ortmodule/_training_manager.py b/orttraining/orttraining/python/training/ortmodule/_training_manager.py
index 3be4c05797978..19effe2086e0a 100644
--- a/orttraining/orttraining/python/training/ortmodule/_training_manager.py
+++ b/orttraining/orttraining/python/training/ortmodule/_training_manager.py
@@ -311,6 +311,9 @@ def forward(self, *inputs, **kwargs):
 
             self._gradient_accumulation_manager.maybe_update_cache_before_run()
 
+            if self._runtime_options.enable_zero_stage3_support:
+                self._append_pull_weight_trigger_as_input(kwargs, self._device)
+
             prepared_input_list, _, _ = _io._combine_input_buffers_initializers(
                 self._graph_initializers,
                 self._graph_info.user_input_names,
@@ -320,6 +323,7 @@ def forward(self, *inputs, **kwargs):
                 kwargs,
                 self._device,
                 self._runtime_inspector,
+                self._zero_stage3_param_map,
             )
 
             outputs = unflatten_user_output(
diff --git a/orttraining/orttraining/python/training/ortmodule/_zero_stage3_compatibility.py b/orttraining/orttraining/python/training/ortmodule/_zero_stage3_compatibility.py
new file mode 100644
index 0000000000000..17756600d601e
--- /dev/null
+++ b/orttraining/orttraining/python/training/ortmodule/_zero_stage3_compatibility.py
@@ -0,0 +1,312 @@
+# -------------------------------------------------------------------------
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# Licensed under the MIT License.
+# --------------------------------------------------------------------------
+
+from typing import Dict, List, Optional, Tuple, Union
+
+import torch
+from onnx import ModelProto, NodeProto, TensorProto, ValueInfoProto, helper
+
+from onnxruntime.capi._pybind_state import register_torch_autograd_function
+from onnxruntime.training.utils import pytorch_dtype_to_onnx
+
+from ._custom_autograd_function_exporter import PythonOpShapeInferStore
+from ._utils import get_fully_qualified_class_name
+
+STAGE3_PULL_WEIGHT_TRIGGER_NAME = "pull_weight_trigger"
+STAGE3_PULL_WEIGHT_TRIGGER_OUTPUT_DTYPE = TensorProto.FLOAT
+STAGE3_PULL_WEIGHT_TRIGGER_OUTPUT_SHAPE = [1]
+
+
+def post_processing_enable_zero_stage3_compat(
+    exported_model: ModelProto,
+    zero_stage3_named_params: Dict[str, torch.nn.parameter.Parameter],
+    all_param_names: List[str],
+) -> ModelProto:
+    """Rewire the exported graph so ZeRO stage3 parameters are PythonOp outputs instead of graph inputs.
+
+    Args:
+        exported_model (ModelProto): The exported model; it is modified in place and returned.
+        zero_stage3_named_params (Dict[str, torch.nn.parameter.Parameter]): The offload named parameters.
+        all_param_names (List[str]): All parameter names, used to place the pull trigger before parameter inputs.
+    """
+
+    # Register symbolic shape inference functions for PythonOp used in DeepSpeed ZeRO stage3.
+    _register_symbolic_shape_infer_functions()
+
+    # Create weight retrieving function using zero_stage3_named_params.
+    func_full_qual_name = _create_weight_retrieval_function(zero_stage3_named_params)
+
+    consumer_map = {}
+    for node in exported_model.graph.node:
+        for inp in node.input:
+            if inp not in consumer_map:
+                consumer_map[inp] = []
+
+            if node not in consumer_map[inp]:
+                consumer_map[inp].append(node)
+
+    def _get_param_pull_trigger_name(param_name: str) -> str:
+        return f"pull_{param_name}"
+
+    def _get_func_name(node: NodeProto) -> Optional[str]:
+        for attr in node.attribute:
+            if attr.name == "func_name":
+                return attr.s.decode("utf-8") if isinstance(attr.s, bytes) else attr.s
+        return None
+
+    # Create weight retrieving PythonOp.
+    new_input, weight_pull_node = _create_weight_retrieval_pythonop(
+        zero_stage3_named_params,
+        func_full_qual_name,
+        STAGE3_PULL_WEIGHT_TRIGGER_NAME,
+        [_get_param_pull_trigger_name(pname) for pname in zero_stage3_named_params],
+        STAGE3_PULL_WEIGHT_TRIGGER_OUTPUT_DTYPE,
+        STAGE3_PULL_WEIGHT_TRIGGER_OUTPUT_SHAPE,
+    )
+
+    from onnxruntime.training.utils.hooks._zero_offload_subscriber import ORTZeROOffloadPreForwardFunction
+
+    prefowrad_function_name = get_fully_qualified_class_name(ORTZeROOffloadPreForwardFunction)
+
+    # Connect weight consumers to use the full-sized parameter output of ORTZeROOffloadPreForwardFunction.
+    for graph_input in exported_model.graph.input:
+        if graph_input.name not in zero_stage3_named_params:
+            continue
+
+        if graph_input.name not in consumer_map:
+            continue
+
+        consumers = consumer_map[graph_input.name]
+        pre_forward_pythonop_node = None
+
+        for c in consumers:
+            if c.op_type != "PythonOp":
+                continue
+
+            func_name = _get_func_name(c)
+            if func_name == prefowrad_function_name:
+                assert (
+                    pre_forward_pythonop_node is None
+                ), "Multiple ORTZeROOffloadPreForwardFunction nodes found, it should not happen"
+                pre_forward_pythonop_node = c
+
+        if pre_forward_pythonop_node is None:
+            raise RuntimeError(
+                "Fail to find ORTZeROOffloadPreForwardFunction for partitioned param: " + graph_input.name
+            )
+
+        index_offset_on_python_op_input = []
+        for i, input_name in enumerate(pre_forward_pythonop_node.input):
+            if input_name == graph_input.name:
+                index_offset_on_python_op_input.append(i)
+
+        assert (
+            len(index_offset_on_python_op_input) == 1
+        ), f"index_offset_on_python_op_input length is not 1: {index_offset_on_python_op_input}"
+
+        reverse_index_among_inputs = index_offset_on_python_op_input[0] - len(pre_forward_pythonop_node.input)
+        new_input_name = _get_param_pull_trigger_name(graph_input.name)
+        pre_forward_pythonop_node.input[index_offset_on_python_op_input[0]] = new_input_name
+
+        _update_python_op_input_related_attributes(
+            pre_forward_pythonop_node,
+            new_input_name,
+            len(STAGE3_PULL_WEIGHT_TRIGGER_OUTPUT_SHAPE),  # new rank
+            STAGE3_PULL_WEIGHT_TRIGGER_OUTPUT_DTYPE,  # new data type
+        )
+
+        output_index = reverse_index_among_inputs + len(pre_forward_pythonop_node.output)
+        pre_forward_pythonop_node.output[output_index] = graph_input.name
+
+        # If the consumer of original `graph_input.name` is PythonOp, we need also update its attributes because now
+        # `graph_input.name` as output of pre_forward_pythonop_node, is full-sized parameter, the rank might differ
+        # from the original one.
+        for c in consumers:
+            if c == pre_forward_pythonop_node or c.op_type != "PythonOp":
+                continue
+            _update_python_op_input_related_attributes(
+                c,
+                graph_input.name,
+                len(zero_stage3_named_params[graph_input.name].ds_shape),  # new rank
+                pytorch_dtype_to_onnx(zero_stage3_named_params[graph_input.name].dtype),  # new data type
+            )
+
+    # Remove the ZeRO stage3 parameters from exported_model.graph.input.
+    graph_inputs_to_remove = [
+        graph_input for graph_input in exported_model.graph.input if graph_input.name in zero_stage3_named_params
+    ]
+    for input_to_remove in graph_inputs_to_remove:
+        exported_model.graph.input.remove(input_to_remove)
+
+    # Re-order graph input to make sure the weight pull trigger is before all parameter inputs.
+    offset = 0
+    for graph_input in exported_model.graph.input:
+        if graph_input.name in all_param_names:
+            break
+        offset += 1
+
+    exported_model.graph.input.insert(offset, new_input)
+    exported_model.graph.node.insert(0, weight_pull_node)
+
+    return exported_model
+
+
+def _create_weight_retrieval_function(
+    zero_stage3_named_params: Optional[Dict[str, torch.nn.parameter.Parameter]]
+) -> str:
+    """Create and register the autograd function behind the weight retrieving PythonOp; return its qualified name."""
+
+    class WeightRetrievalFunction(torch.autograd.Function):
+        @staticmethod
+        def forward(ctx, weight_in_trigger):
+            params = list(zero_stage3_named_params.values())
+            ctx.params = params
+            ctx.dtype = weight_in_trigger.dtype
+            ctx.device = weight_in_trigger.device
+            ctx.shape = weight_in_trigger.shape
+            # One placeholder per parameter; every tuple entry references the same zero-filled tensor.
+            return (torch.zeros(ctx.shape, device=ctx.device, dtype=ctx.dtype),) * len(params)
+
+        @staticmethod
+        def backward(ctx, *grad_outputs):
+            # `grad_outputs` are ignored; the trigger input always receives a zero gradient.
+            return torch.zeros(ctx.shape, device=ctx.device, dtype=ctx.dtype)
+
+        @staticmethod
+        def infer_shape(
+            node: NodeProto,
+            tensor_input_shapes: List[Optional[List[Union[int, str]]]],
+            tensor_input_dtypes: List[torch.onnx.TensorProtoDataType],
+        ) -> Tuple[List[Optional[List[Union[int, str]]]], List[torch.onnx.TensorProtoDataType]]:
+            # Every output mirrors the trigger input's shape and dtype, one output per ZeRO stage3 parameter.
+            param_count = len(zero_stage3_named_params)
+            return [tensor_input_shapes[0]] * param_count, [tensor_input_dtypes[0]] * param_count
+
+    func_full_qual_name = get_fully_qualified_class_name(WeightRetrievalFunction)
+    register_torch_autograd_function(func_full_qual_name, WeightRetrievalFunction)
+    # Each call defines a new class (closing over this call's params) under the same qualified name.
+    # `PythonOpShapeInferStore.register` keeps an already-registered entry, which would leave a stale
+    # `infer_shape` bound to an earlier parameter map, so overwrite the entry explicitly.
+    PythonOpShapeInferStore.register_func(func_full_qual_name, WeightRetrievalFunction.infer_shape)
+
+    return func_full_qual_name
+
+
+def _register_symbolic_shape_infer_functions():
+    """This function is used to register symbolic shape inference functions for PythonOp used in
+    DeepSpeed ZeRO stage3."""
+
+    def _simple_pass_through_infer_shape(
+        node: NodeProto,
+        tensor_input_shapes: List[Optional[List[Union[int, str]]]],
+        tensor_input_dtypes: List[torch.onnx.TensorProtoDataType],
+    ) -> Tuple[List[Optional[List[Union[int, str]]]], List[torch.onnx.TensorProtoDataType]]:
+        return tensor_input_shapes, tensor_input_dtypes
+
+    PythonOpShapeInferStore.register_func(
+        "deepspeed.runtime.zero.parameter_offload.PreBackwardFunction", _simple_pass_through_infer_shape
+    )
+    PythonOpShapeInferStore.register_func(
+        "deepspeed.runtime.zero.parameter_offload.PostBackwardFunction", _simple_pass_through_infer_shape
+    )
+
+    def _linear_infer_shape(
+        node: NodeProto,
+        tensor_input_shapes: List[Optional[List[Union[int, str]]]],
+        tensor_input_dtypes: List[torch.onnx.TensorProtoDataType],
+    ) -> Tuple[List[Optional[List[Union[int, str]]]], List[torch.onnx.TensorProtoDataType]]:
+        # output = input.matmul(weight.t()): the last output dim is the weight's second-to-last dim.
+        input_shape = tensor_input_shapes[0]  # input
+        weight_shape = tensor_input_shapes[1]  # weight
+        # Build a new list; assigning into `tensor_input_shapes[0]` would alter the caller's shape in place.
+        output_shape = [*input_shape[:-1], weight_shape[-2]]
+        return [output_shape], [tensor_input_dtypes[0]]
+
+    PythonOpShapeInferStore.register_func(
+        "deepspeed.runtime.zero.linear.LinearFunctionForZeroStage3", _linear_infer_shape
+    )
+
+
+def _create_weight_retrieval_pythonop(
+    zero_stage3_named_params: Optional[Dict[str, torch.nn.parameter.Parameter]],
+    func_full_qual_name: str,
+    input_name: str,
+    output_names: List[str],
+    STAGE3_PULL_WEIGHT_TRIGGER_OUTPUT_DTYPE,
+    STAGE3_PULL_WEIGHT_TRIGGER_OUTPUT_SHAPE: List[int],
+) -> Tuple[ValueInfoProto, NodeProto]:
+    """Build the new graph input and the PythonOp node used to retrieve ZeRO stage3 weights.
+
+    Args:
+        zero_stage3_named_params: Named ZeRO stage3 parameters; None is treated as no parameters.
+        func_full_qual_name: Value stored in the PythonOp's `func_name` attribute.
+        input_name: Name of the new graph input, which is the PythonOp's only input.
+        output_names: Names of the PythonOp outputs that follow the leading context output.
+        STAGE3_PULL_WEIGHT_TRIGGER_OUTPUT_DTYPE: Element type of the new input and of every declared output.
+        STAGE3_PULL_WEIGHT_TRIGGER_OUTPUT_SHAPE: Shape of the new input; its rank is declared for every output.
+
+    Returns:
+        The value info of the new graph input and the created PythonOp node.
+    """
+    trigger_rank = len(STAGE3_PULL_WEIGHT_TRIGGER_OUTPUT_SHAPE)
+    trigger_dtype = STAGE3_PULL_WEIGHT_TRIGGER_OUTPUT_DTYPE
+    param_total = len(zero_stage3_named_params) if zero_stage3_named_params is not None else 0
+    trigger_value_info = helper.make_tensor_value_info(input_name, trigger_dtype, STAGE3_PULL_WEIGHT_TRIGGER_OUTPUT_SHAPE)
+    # output_tensor_ranks/output_tensor_types carry one entry per ZeRO stage3 parameter.
+    weight_pull_node = helper.make_node(
+        "PythonOp",
+        [input_name],
+        ["pull_weight_trigger_ctx", *output_names],
+        "pull_weight_trigger",  # node name
+        "PythonOp for weight retrieving.",
+        "com.microsoft",
+        comment="",
+        inplace=0,
+        input_convention="d",
+        input_tensor_ranks=[trigger_rank],
+        input_tensor_types=[trigger_dtype],
+        output_tensor_ranks=[trigger_rank] * param_total,
+        output_tensor_types=[trigger_dtype] * param_total,
+        training_mode=1,
+        func_name=func_full_qual_name,
+    )
+
+    return trigger_value_info, weight_pull_node
+
+
+def _update_python_op_input_related_attributes(node: NodeProto, input_name: str, new_rank: int, new_dtype: int):
+    """This function is used to update PythonOp's input related attributes, e.g.
+        input_tensor_ranks and input_tensor_types.
+
+    Args:
+        node (NodeProto): The PythonOp node.
+        input_name (str): The input name to be updated.
+        new_rank (int): The new rank of the input, to be used in input_tensor_ranks.
+        new_dtype (int): The new data type of the input, to be used in input_tensor_types.
+    """
+    input_tensor_ranks = None
+    input_tensor_dtypes = None
+    rank_attr = None
+    dtype_attr = None
+    for attr in node.attribute:
+        if attr.name == "input_tensor_ranks":
+            input_tensor_ranks = attr.ints
+            rank_attr = attr
+        if attr.name == "input_tensor_types":
+            input_tensor_dtypes = attr.ints
+            dtype_attr = attr
+    # Both attributes are required; stop here if the node lacks either one.
+    assert input_tensor_ranks is not None, "input_tensor_ranks is None"
+    assert input_tensor_dtypes is not None, "input_tensor_dtypes is None"
+    # Overwrite the rank/dtype at every input position whose name equals `input_name`.
+    for index, node_input_name in enumerate(node.input):
+        if node_input_name == input_name:
+            input_tensor_ranks[index] = new_rank
+            input_tensor_dtypes[index] = new_dtype
+    # Replace the two old attributes with new ones built from the updated values (appended to node.attribute).
+    node.attribute.remove(rank_attr)
+    node.attribute.remove(dtype_attr)
+    node.attribute.append(helper.make_attribute("input_tensor_ranks", input_tensor_ranks))
+    node.attribute.append(helper.make_attribute("input_tensor_types", input_tensor_dtypes))
diff --git a/orttraining/orttraining/python/training/utils/__init__.py b/orttraining/orttraining/python/training/utils/__init__.py
index acf2698d55eaf..fa7c9f2750cdd 100644
--- a/orttraining/orttraining/python/training/utils/__init__.py
+++ b/orttraining/orttraining/python/training/utils/__init__.py
@@ -9,7 +9,7 @@
     extract_data_and_schema,
     unflatten_data_using_schema,
 )
-from onnxruntime.training.utils.torch_type_map import pytorch_dtype_to_onnx
+from onnxruntime.training.utils.torch_type_map import onnx_dtype_to_pytorch, pytorch_dtype_to_onnx
 
 __all__ = [
     "PrimitiveType",
@@ -18,4 +18,5 @@
     "extract_data_and_schema",
     "unflatten_data_using_schema",
     "pytorch_dtype_to_onnx",
+    "onnx_dtype_to_pytorch",
 ]
diff --git a/orttraining/orttraining/python/training/utils/hooks/_statistics_subscriber.py b/orttraining/orttraining/python/training/utils/hooks/_statistics_subscriber.py
index 6c8027b2fefaa..db1c69cf95ba4 100644
--- a/orttraining/orttraining/python/training/utils/hooks/_statistics_subscriber.py
+++ b/orttraining/orttraining/python/training/utils/hooks/_statistics_subscriber.py
@@ -6,6 +6,7 @@
 import os
 import shutil
 import warnings
+from io import TextIOWrapper
 from pathlib import Path
 from typing import List, Optional, Tuple, Union
 
@@ -178,87 +179,97 @@ def _summarize_activations(self, tensor: torch.Tensor, depth: int, name: str, st
         order_file_path = step_path / "order.txt"
         tensor_file_path = step_path / output_file_name
 
-        # This is to try the best effort to align the count of numbers per line for easier comparison in diff views,
-        # though it does not always guarantee to do this way.
-        torch.set_printoptions(precision=6, linewidth=128)
-
-        tensor_shape = tensor.shape
-        tensor_dtype = tensor.dtype
-        flatten_array = tensor.flatten().view(-1)
-
-        if self._run_on_cpu:
-            flatten_array = flatten_array.to("cpu")
-
-        if self._run_on_cpu:
-            num_nan = torch.isnan(flatten_array).sum()
-            num_inf = torch.isinf(flatten_array).sum()
-            num_neg = (flatten_array < 0).sum()
-            num_pos = (flatten_array > 0).sum()
-            num_zero = (flatten_array == 0).sum()
-            min_value = flatten_array.min()
-            max_value = flatten_array.max()
-            mean_value = flatten_array.mean()
-            std_value = flatten_array.std()
-        else:
-            # Split the calculation for each bucket, then do another round of calculation on the bucket results.
-            # This can at the best effort reduce the peak memory impact.
-            bucket_size = self._bucket_size
-            element_count = flatten_array.numel()
-            ceil_bucket_count = (element_count + bucket_size - 1) // (bucket_size)
-            nan_buckets = torch.zeros(ceil_bucket_count, dtype=torch.int64, device=flatten_array.device)
-            inf_buckets = torch.zeros(ceil_bucket_count, dtype=torch.int64, device=flatten_array.device)
-            neg_buckets = torch.zeros(ceil_bucket_count, dtype=torch.int64, device=flatten_array.device)
-            pos_buckets = torch.zeros(ceil_bucket_count, dtype=torch.int64, device=flatten_array.device)
-            zero_buckets = torch.zeros(ceil_bucket_count, dtype=torch.int64, device=flatten_array.device)
-            min_buckets = torch.zeros(ceil_bucket_count, dtype=flatten_array.dtype, device=flatten_array.device)
-            max_buckets = torch.zeros(ceil_bucket_count, dtype=flatten_array.dtype, device=flatten_array.device)
-            mean_buckets = torch.zeros(ceil_bucket_count, dtype=flatten_array.dtype, device=flatten_array.device)
-            std_buckets = torch.zeros(ceil_bucket_count, dtype=flatten_array.dtype, device=flatten_array.device)
-
-            # Summary for each bucket
-            element_count_per_bucket = torch.zeros(ceil_bucket_count, dtype=torch.int64, device=flatten_array.device)
-            for i in range(ceil_bucket_count):
-                end = min((i + 1) * bucket_size, element_count)
-                bucket = flatten_array[i * bucket_size : end]
-                element_count_per_bucket[i] = bucket.numel()
-
-                nan_buckets[i] = torch.isnan(bucket).sum()
-                inf_buckets[i] = torch.isinf(bucket).sum()
-                neg_buckets[i] = (bucket < 0).sum()
-                pos_buckets[i] = (bucket > 0).sum()
-                zero_buckets[i] = (bucket == 0).sum()
-                min_buckets[i] = bucket.min()
-                max_buckets[i] = bucket.max()
-                mean_buckets[i] = bucket.sum()
-                std_buckets[i] = bucket.std()
-
-            # Reduction across all buckets
-            num_nan = nan_buckets.sum()
-            num_inf = inf_buckets.sum()
-            num_neg = neg_buckets.sum()
-            num_pos = pos_buckets.sum()
-            num_zero = zero_buckets.sum()
-            min_value = min_buckets.min()
-            max_value = max_buckets.max()
-            mean_value = float(mean_buckets.sum()) / float(element_count)
-            # Here we refer to
-            # https://math.stackexchange.com/questions/2971315/how-do-i-combine-standard-deviations-of-two-groups
-            # to calculate the combined standard deviation of all buckets.
-            s = (element_count_per_bucket - 1) * (std_buckets**2) + element_count_per_bucket * (
-                (mean_buckets - mean_value) ** 2
-            )
-            std_value = torch.sqrt(s.sum() / (element_count - 1))
-
         with order_file_path.open(mode="a", encoding="utf-8") as f:
             f.write(f"{output_file_name}\n")
 
         with tensor_file_path.open(mode="w", encoding="utf-8") as f:
-            f.write(
-                f"{'>'*max(0, depth) + display_name} shape: {tensor_shape} dtype: {tensor_dtype} size: {flatten_array.size()} \n"
-                f"min: {min_value} max: {max_value}, mean: {mean_value}, "
-                f"std: {std_value} \n"
-                f"nan: {num_nan}, inf: {num_inf}\n"
-            )
-            f.write(f"samples(top 128): {flatten_array[:128]}\n")
-            f.write(f"neg: {num_neg}, pos: {num_pos}, zero: {num_zero},\n")
-            f.write(f"{'='*16}\n")
+            _summarize_tensor(display_name, tensor, f, depth, self._run_on_cpu, self._bucket_size)
+
+
+def _summarize_tensor(
+    display_name: str,
+    tensor: torch.Tensor,
+    f: TextIOWrapper,
+    depth: int = 0,
+    run_on_cpu: bool = False,
+    bucket_size: int = 1024 * 1024 * 1024 // 2,
+):
+    """Write a statistics summary (min/max/mean/std, nan/inf/sign counts, first 128 values) of `tensor` to `f`."""
+    # This is to try the best effort to align the count of numbers per line for easier comparison in diff views,
+    # though it does not always guarantee to do this way.
+    torch.set_printoptions(precision=6, linewidth=128)
+
+    tensor_shape = tensor.shape
+    tensor_dtype = tensor.dtype
+    flatten_array = tensor.flatten().view(-1)
+
+    if run_on_cpu:
+        flatten_array = flatten_array.to("cpu")
+        num_nan = torch.isnan(flatten_array).sum()
+        num_inf = torch.isinf(flatten_array).sum()
+        num_neg = (flatten_array < 0).sum()
+        num_pos = (flatten_array > 0).sum()
+        num_zero = (flatten_array == 0).sum()
+        min_value = flatten_array.min()
+        max_value = flatten_array.max()
+        mean_value = flatten_array.mean()
+        std_value = flatten_array.std()
+    else:
+        # Split the calculation for each bucket, then do another round of calculation on the bucket results.
+        # This can at the best effort reduce the peak memory impact.
+        element_count = flatten_array.numel()
+        ceil_bucket_count = (element_count + bucket_size - 1) // (bucket_size)
+        nan_buckets = torch.zeros(ceil_bucket_count, dtype=torch.int64, device=flatten_array.device)
+        inf_buckets = torch.zeros(ceil_bucket_count, dtype=torch.int64, device=flatten_array.device)
+        neg_buckets = torch.zeros(ceil_bucket_count, dtype=torch.int64, device=flatten_array.device)
+        pos_buckets = torch.zeros(ceil_bucket_count, dtype=torch.int64, device=flatten_array.device)
+        zero_buckets = torch.zeros(ceil_bucket_count, dtype=torch.int64, device=flatten_array.device)
+        min_buckets = torch.zeros(ceil_bucket_count, dtype=flatten_array.dtype, device=flatten_array.device)
+        max_buckets = torch.zeros(ceil_bucket_count, dtype=flatten_array.dtype, device=flatten_array.device)
+        mean_buckets = torch.zeros(ceil_bucket_count, dtype=flatten_array.dtype, device=flatten_array.device)
+        std_buckets = torch.zeros(ceil_bucket_count, dtype=flatten_array.dtype, device=flatten_array.device)
+
+        # Summary for each bucket
+        element_count_per_bucket = torch.zeros(ceil_bucket_count, dtype=torch.int64, device=flatten_array.device)
+        for i in range(ceil_bucket_count):
+            end = min((i + 1) * bucket_size, element_count)
+            bucket = flatten_array[i * bucket_size : end]
+            element_count_per_bucket[i] = bucket.numel()
+
+            nan_buckets[i] = torch.isnan(bucket).sum()
+            inf_buckets[i] = torch.isinf(bucket).sum()
+            neg_buckets[i] = (bucket < 0).sum()
+            pos_buckets[i] = (bucket > 0).sum()
+            zero_buckets[i] = (bucket == 0).sum()
+            min_buckets[i] = bucket.min()
+            max_buckets[i] = bucket.max()
+            mean_buckets[i] = bucket.sum()
+            std_buckets[i] = bucket.std()
+
+        # Reduction across all buckets
+        num_nan = nan_buckets.sum()
+        num_inf = inf_buckets.sum()
+        num_neg = neg_buckets.sum()
+        num_pos = pos_buckets.sum()
+        num_zero = zero_buckets.sum()
+        min_value = min_buckets.min()
+        max_value = max_buckets.max()
+        mean_value = float(mean_buckets.sum()) / float(element_count)
+        # Here we refer to
+        # https://math.stackexchange.com/questions/2971315/how-do-i-combine-standard-deviations-of-two-groups
+        # to calculate the combined standard deviation of all buckets.
+        # NOTE: mean_buckets stores per-bucket sums, so divide by the bucket sizes to obtain the per-bucket means.
+        s = (element_count_per_bucket - 1) * (std_buckets**2) + element_count_per_bucket * (
+            (mean_buckets / element_count_per_bucket - mean_value) ** 2
+        )
+        std_value = torch.sqrt(s.sum() / (element_count - 1))
+
+    f.write(
+        f"{'>'*max(0, depth) + display_name} shape: {tensor_shape} dtype: {tensor_dtype} size: {flatten_array.size()} \n"
+        f"min: {min_value} max: {max_value}, mean: {mean_value}, "
+        f"std: {std_value} \n"
+        f"nan: {num_nan}, inf: {num_inf}\n"
+    )
+    f.write(f"samples(top 128): {flatten_array[:128]}\n")
+    f.write(f"neg: {num_neg}, pos: {num_pos}, zero: {num_zero},\n")
+    f.write(f"{'='*16}\n")
diff --git a/orttraining/orttraining/python/training/utils/hooks/_subscriber_manager.py b/orttraining/orttraining/python/training/utils/hooks/_subscriber_manager.py
index db38f58d8f324..b2bc64be42fc1 100644
--- a/orttraining/orttraining/python/training/utils/hooks/_subscriber_manager.py
+++ b/orttraining/orttraining/python/training/utils/hooks/_subscriber_manager.py
@@ -29,14 +29,6 @@ def no_increase_global_step():
     finally:
         ORT_NO_INCREASE_GLOBAL_STEP[0] = False
 
-    @staticmethod
-    def infer_shape(
-        node: onnx.NodeProto,
-        tensor_input_shapes: List[Optional[List[Union[int, str]]]],
-        tensor_input_dtypes: List[torch.onnx.TensorProtoDataType],
-    ) -> Tuple[List[Optional[List[Union[int, str]]]], List[torch.onnx.TensorProtoDataType]]:
-        return tensor_input_shapes, tensor_input_dtypes
-
 
 class _IncrementStep(torch.autograd.Function):
     """This class is used to manage the global execution step, e.g.
@@ -55,8 +47,9 @@ def forward(ctx, run_ctx: RuntimeStates, *input_tensor_list: Tuple[torch.Tensor,
         ctx.current_step = run_ctx.global_states.execution_step
         ctx.run_ctx = run_ctx
 
-        if ctx.current_step >= 0:
-            print(f"{'='*6} Completed forward pass for STEP {ctx.current_step} {'='*6}")
+        # Uncomment the following lines for debugging purposes.
+        # if ctx.current_step >= 0:
+        #     print(f"{'='*6} Completed forward pass for STEP {ctx.current_step} {'='*6}")
 
         if ORT_NO_INCREASE_GLOBAL_STEP[0] is False:
             ctx.run_ctx.global_states.execution_step += 1
@@ -191,7 +184,7 @@ def _reset_recursively(module: torch.nn.Module, depth: int, next_module_index: L
                 next_module_index: list of int, carrying a global unique module index that can be used next.
             """
             module_index = next_module_index[0]
-            module.id = module_index  # STAGE3WARN: needed by DeepSpeed
+            module.id = module_index  # STAGE3WARN#1: needed by DeepSpeed
             self._run_ctx.global_states.module_index_to_depth[module_index] = depth
             self._run_ctx.global_states.module_to_module_index[module] = module_index
 
@@ -217,7 +210,7 @@ def _register_hooks_recursively(self, module: torch.nn.Module, depth: int, next_
             next_module_index: list of int, carrying a global unique module index that can be used next.
         """
         module_index = next_module_index[0]
-        module.id = module_index  # STAGE3WARN: needed by DeepSpeed
+        module.id = module_index  # STAGE3WARN#2: needed by DeepSpeed
         self._run_ctx.global_states.module_index_to_depth[module_index] = depth
         self._run_ctx.global_states.module_to_module_index[module] = module_index
 
diff --git a/orttraining/orttraining/python/training/utils/hooks/_zero_offload_subscriber.py b/orttraining/orttraining/python/training/utils/hooks/_zero_offload_subscriber.py
index 3d42e172eea82..ad1297962db71 100644
--- a/orttraining/orttraining/python/training/utils/hooks/_zero_offload_subscriber.py
+++ b/orttraining/orttraining/python/training/utils/hooks/_zero_offload_subscriber.py
@@ -23,25 +23,37 @@
 from ._subscriber_base import RuntimeStates, SubscriberBase
 
 
-# Used to monkey patch the original function
-# Adapted from https://github.com/microsoft/DeepSpeed/blob/e8318634b4313eaad89842cf4322e1762d34ced3/deepspeed/runtime/zero/parameter_offload.py#L333
-def _setup_zero_stage3_ort_compatible_hooks(self):
-    self.hierarchy = 0
+def _get_ort_compatible_zero_stage3_hook_function(debug, stats_output_dir, stats_overwrite):
+    """Create ort compatible hook function for DeepSpeed ZeRO stage3.
 
-    from onnxruntime.training.utils.hooks import SubscriberManager, ZeROOffloadSubscriber
-    from onnxruntime.training.utils.hooks._zero_offload_subscriber import _zero_offload_one_time_initializer
+    Args:
+        debug: whether to enable convergence debugging.
+        stats_output_dir: the directory to store convergence stats.
+        stats_overwrite: whether to overwrite the stats file if it already exists.
+    """
+
+    # Used to monkey patch the original function
+    # Adapted from https://github.com/microsoft/DeepSpeed/blob/e8318634b4313eaad89842cf4322e1762d34ced3/deepspeed/runtime/zero/parameter_offload.py#L333
+    def _setup_zero_stage3_ort_compatible_hooks(self):
+        self.hierarchy = 0
+
+        from onnxruntime.training.utils.hooks import StatisticsSubscriber, SubscriberManager, ZeROOffloadSubscriber
+        from onnxruntime.training.utils.hooks._zero_offload_subscriber import _zero_offload_one_time_initializer
 
-    # Each DeepSpeed engine has a separate subscriber manager.
-    self._offload_subscriber_manager = SubscriberManager()
-    self._offload_subscriber_manager.subscribe(
-        self.module, [ZeROOffloadSubscriber(self, _zero_offload_one_time_initializer)]
-    )
-    self.forward_hooks.extend(self._offload_subscriber_manager._pre_forward_hooks)
-    self.forward_hooks.extend(self._offload_subscriber_manager._post_forward_hooks)
+        subscribers = [ZeROOffloadSubscriber(self, _zero_offload_one_time_initializer)]
+        if debug is True:
+            subscribers.append(StatisticsSubscriber(output_dir=stats_output_dir, override_output_dir=stats_overwrite))
+        # Each DeepSpeed engine has a separate subscriber manager.
+        self._offload_subscriber_manager = SubscriberManager()
+        self._offload_subscriber_manager.subscribe(self.module, subscribers)
+        self.forward_hooks.extend(self._offload_subscriber_manager._pre_forward_hooks)
+        self.forward_hooks.extend(self._offload_subscriber_manager._post_forward_hooks)
 
-    # Add top module to stack trace
-    global FWD_MODULE_STACK  # noqa: PLW0602
-    FWD_MODULE_STACK.append(self.module)
+        # Add top module to stack trace
+        global FWD_MODULE_STACK  # noqa: PLW0602
+        FWD_MODULE_STACK.append(self.module)
+
+    return _setup_zero_stage3_ort_compatible_hooks
 
 
 # Adapted from https://github.com/microsoft/DeepSpeed/blob/e8318634b4313eaad89842cf4322e1762d34ced3/deepspeed/runtime/zero/linear.py#L104
@@ -86,14 +98,16 @@ def collect_code(self, function: Callable):
         _zero_offload_one_time_initializer.collect_code(DeepSpeedZeRoOffload.setup_zero_stage3_hooks)
 
     # This is the function to enable ORT ZeRO offload.
-    def configure_ort_compatible_zero_stage3():
+    def configure_ort_compatible_zero_stage3(debug=False, stats_output_dir="./", stats_overwrite=False):
         """Configure ZeRO stage3 to be ORT compatible.
 
         This function will overwrite the original DeepSpeed ZeRO stage3 hooks to make it ORT compatible.
         """
 
         # Only done once no matter how many times this function is called for different modules.
-        DeepSpeedZeRoOffload.setup_zero_stage3_hooks = _setup_zero_stage3_ort_compatible_hooks
+        DeepSpeedZeRoOffload.setup_zero_stage3_hooks = _get_ort_compatible_zero_stage3_hook_function(
+            debug, stats_output_dir, stats_overwrite
+        )
 
         from deepspeed.runtime.zero.linear import zero3_linear_wrap
 
@@ -103,7 +117,7 @@ def configure_ort_compatible_zero_stage3():
 except ImportError as e:
     warnings.warn(f"DeepSpeed import error {e}")
 
-    def configure_ort_compatible_zero_stage3():
+    def configure_ort_compatible_zero_stage3(debug=False, stats_output_dir=None, stats_overwrite=False):
         raise RuntimeError("DeepSpeed is not installed, cannot configure ORT compatible ZeRO stage3.")
 
 
@@ -115,13 +129,13 @@ def _get_params_for_current_module(module: torch.nn.Module) -> List[torch.nn.par
     """
     from deepspeed.runtime.zero.partitioned_param_coordinator import iter_params
 
-    # Retrive the parameters that are not available for this module.
+    # Retrieve all parameters for this module.
     partitioned_params = [param for param in iter_params(module)]
 
     return partitioned_params
 
 
-def _get_all_offloaded_params(module: torch.nn.Module) -> Dict[str, torch.nn.parameter.Parameter]:
+def _get_all_zero_stage3_params(module: torch.nn.Module) -> Dict[str, torch.nn.parameter.Parameter]:
     """Retrieve all the parameters that are offloaded."""
     from deepspeed.runtime.zero.partition_parameters import ZeroParamStatus
 
@@ -134,16 +148,13 @@ def _get_all_offloaded_params(module: torch.nn.Module) -> Dict[str, torch.nn.par
 
 
 class ORTZeROOffloadPreForwardFunction(torch.autograd.Function):
-    """This function is a common bridge to call original PyTorch's
-    pre_forward_function and post_backward_function.
-    """
+    """This function is a common bridge to call original PyTorch's pre_forward_function"""
 
     @staticmethod
     def forward(
         ctx,
         module,
         pre_forward_with_kwargs_function,
-        post_backward_function,
         args_schema,
         kwargs_schema,
         args_tensor_count,
@@ -155,7 +166,6 @@ def forward(
             ctx: context object
             module: the module to be called
             pre_forward_with_kwargs_function: the function to be called before forward (PyTorch's pre_forward_function)
-            post_backward_function: the function to be called after backward (PyTorch's post_backward_function)
             args_schema: the schema of the args, used to reconstruct the args in original form in
                 PyTorch's pre_forward_function's inputs.
             kwargs_schema: the schema of the kwargs, used to reconstruct the kwargs in original form in
@@ -168,6 +178,17 @@ def forward(
         args_tensors = tensor_list[:args_tensor_count]
         kwargs_tensors = tensor_list[args_tensor_count : args_tensor_count + kwargs_tensor_count]
 
+        # For PyTorch runs, the sizes are all 0, it does not need a gradient because
+        # param._detach().requires_grad_(False) is called.
+        # But for ORT runs, the sizes are all [1], as output of weight retrieval function.
+        # So we keep track of the shapes and dtypes of the passed-in tensors, then generate the grads in backward.
+        # While for both PyTorch and ORT runs, the grad is not important because they are not param grads
+        # anymore, they are only used for completing the full backward propagation.
+        passed_in_param_tensors = tensor_list[args_tensor_count + kwargs_tensor_count :]
+        ctx.shapes = [p.shape for p in passed_in_param_tensors]
+        ctx.dtypes = [p.dtype for p in passed_in_param_tensors]
+        ctx.devices = [p.device for p in passed_in_param_tensors]
+
         args = unflatten_data_using_schema(args_tensors, args_schema)
         kwargs = unflatten_data_using_schema(kwargs_tensors, kwargs_schema)
 
@@ -179,6 +200,8 @@ def forward(
         partitioned_params = _get_params_for_current_module(module)
         ctx.partitioned_params = partitioned_params
 
+        assert len(partitioned_params) == len(passed_in_param_tensors)
+
         f_ret = pre_forward_with_kwargs_function(module, args, kwargs)
 
         if f_ret is None:
@@ -188,7 +211,6 @@ def forward(
             updated_args, updated_kwargs = f_ret
 
         ctx.module = module
-        ctx.post_backward_function = post_backward_function
 
         updated_args_tensors, _ = extract_data_and_schema(updated_args)
         updated_kwargs_tensors, _ = extract_data_and_schema(updated_kwargs)
@@ -203,17 +225,32 @@ def forward(
     @staticmethod
     def backward(ctx, *grads):
         updated_grads = grads
-        if ctx.post_backward_function is not None:
-            ret = ctx.post_backward_function(ctx.module, grads)
-            if ret is not None:
-                updated_grads = ret
 
-        # TODO(pengwa) Update grad for partitioned parameters.
         input_count = len(updated_grads) - len(ctx.partitioned_params)
-        zeros = [torch.zeros(0, dtype=p.dtype, device=p.device) for p in ctx.partitioned_params]
-        zero_grads = updated_grads[:input_count] + tuple(zeros)
-
-        return (None, None, None, None, None, None, None, *zero_grads)
+        param_start_offset = input_count
+
+        # Only need to accumulate grad explicitly for ORT run (e.g. ctx.shapes[0] == (1,));
+        # In the PyTorch run, the accumulation happens automatically.
+        need_manual_grad_acc = len(ctx.shapes) > 0 and ctx.shapes[0] == (1,)
+        if need_manual_grad_acc:
+            for param_index, p in enumerate(ctx.partitioned_params):
+                g = updated_grads[param_index + param_start_offset]
+                if g is None:
+                    raise RuntimeError(f"param {p} has no grad, this should not happen.")
+                # Param gradient accumulation is triggered here, along with the attached hooks, done by PyTorch.
+                assert p.shape == g.shape, f"param_index: {param_index} - param shape {p.shape} != grad shape {g.shape}"
+                p.backward(g)
+
+        # At this point, the **real** param grads are already updated, the following grads are only used for
+        # completing the full backward propagation, will not affect parameter updates.
+        passed_in_param_grad = [
+            torch.zeros(shape, dtype=dtype, device=device)
+            for shape, dtype, device in zip(ctx.shapes, ctx.dtypes, ctx.devices)
+        ]
+
+        zero_grads = updated_grads[:input_count] + tuple(passed_in_param_grad)
+
+        return (None, None, None, None, None, None, *zero_grads)
 
     @staticmethod
     def infer_shape(
@@ -258,14 +295,14 @@ def forward(
             module: the module to be called
             post_forward_function: the function to be called after forward (PyTorch's post_forward_function)
             pre_backward_function: the function to be called before backward (PyTorch's pre_backward_function)
-            output_schema: the schema of the output, used to reconstruct the output in original form in
+            output_schema: the schema of the output, used to reconstruct the output in its original form in
                 PyTorch's post_forward_function's inputs.
             output_tensors: the list of tensors.
 
         """
         outputs = unflatten_data_using_schema(output_tensors, output_schema)
 
-        # STAGE3WARN: _post_forward_module_hook's second argument `input is not used, so we just pass a None here.
+        # STAGE3WARN#3: _post_forward_module_hook's second argument `input` is not used, so we just pass a None here.
         updated_outputs = post_forward_function(module, None, outputs)
 
         if updated_outputs is None:
@@ -341,11 +378,19 @@ def pre_forward_module_apply_impl(
         input and output for torch.autograd.Function, so we do flatten and unflatten here.
 
         """
+        ## Handle `_post_backward_module_hook`
 
-        args_tensors, args_schema = extract_data_and_schema(args)
-        kwargs_tensors, kwargs_schema = extract_data_and_schema(kwargs)
+        # Put `_post_backward_module_hook` first because in backward, it is responsible for unloading parameters,
+        # and we want ORTZeROOffloadPreForwardFunction's backward to still be able to access the full-sized parameters.
+        _post_backward_module_hook = self._functions.get("_post_backward_module_hook")
+        # STAGE3WARN#4: most logic in _post_backward_module_hook can be traced correctly so we don't need to
+        # wrap with PythonOp. For those cannot be traced, we handle them in STAGE3WARN#5.
+        updated_args = _post_backward_module_hook(module, args)
 
-        partitioned_params = _get_params_for_current_module(module)
+        ## Handle `_pre_forward_module_hook`
+
+        args_tensors, args_schema = extract_data_and_schema(updated_args)
+        kwargs_tensors, kwargs_schema = extract_data_and_schema(kwargs)
 
         _pre_forward_module_hook = self._functions.get("_pre_forward_module_hook")
 
@@ -358,18 +403,29 @@ def _wrap_pre_forward_module_hook(module, args, kwargs):
             if rets is not None:
                 updated_args = rets
 
-            # STAGE3WARN: Moved from _post_backward_module_hook to make sure ORT run will trigger every iteration.
+            # STAGE3WARN#5: Moved from _post_backward_module_hook to make sure ORT run will trigger every iteration.
             module.ds_grads_remaining = 0
+
             return updated_args, updated_kwargs
 
-        all_tensors = args_tensors + kwargs_tensors + partitioned_params
+        # Need to pass the parameters as input to let the exporter trace the related weights for
+        # current ORTZeROOffloadPreForwardFunction
+        partitioned_params = _get_params_for_current_module(module)
+        # Don't require grad for passed-in parameter, otherwise it will be treated as a leaf node, in backward
+        # returned 0-sized grad did not match the param's gradient accumulator function's input shape metadata,
+        # PyTorch run will fail during backward.
+        # This will not harm parameter gradient build either in ORT or PyTorch, imagine the weights are used by
+        # computation anyway, so the gradient will be built. This hook only references the parameter, but won't
+        # generate a gradient path for it.
+        detached_partitioned_params = [p.detach().requires_grad_(False) for p in partitioned_params]
+
+        all_tensors = args_tensors + kwargs_tensors + detached_partitioned_params
 
         self._check_all_tensor(all_tensors, module, "pre_forward_module_apply_impl input check")
 
         rets = ORTZeROOffloadPreForwardFunction.apply(
             module,
             _wrap_pre_forward_module_hook,
-            None,
             args_schema,
             kwargs_schema,
             args_tensor_count,
@@ -385,11 +441,6 @@ def _wrap_pre_forward_module_hook(module, args, kwargs):
         updated_args = unflatten_data_using_schema(updated_args_tensors, args_schema)
         updated_kwargs = unflatten_data_using_schema(updated_kwargs_tensors, kwargs_schema)
 
-        _post_backward_module_hook = self._functions.get("_post_backward_module_hook")
-        # STAGE3WARN: Other part of _post_backward_module_hook can be traced correctly so we don't need to
-        # wrap with PythonOp.
-        updated_args = _post_backward_module_hook(module, updated_args)
-
         return updated_args, updated_kwargs
 
     def post_forward_module_apply_impl(
@@ -411,7 +462,7 @@ def post_forward_module_apply_impl(
         _post_forward_module_hook = self._functions.get("_post_forward_module_hook")
 
         def _wrap_post_forward_module_hook(module, input, outputs):
-            # STAGE3WARN: _post_forward_module_hook applied this for each tensor output, so we do a simple wrap here.
+            # STAGE3WARN#6: _post_forward_module_hook applied this for each tensor output, so we do a simple wrap here.
             from deepspeed.runtime.zero.partition_parameters import is_zero_param
 
             updated_outputs = _post_forward_module_hook(module, input, outputs)
@@ -438,8 +489,8 @@ def _wrap_post_forward_module_hook(module, input, outputs):
         updated_outputs = unflatten_data_using_schema(updated_outputs_tensors, outputs_schema)
 
         _pre_backward_module_hook = self._functions.get("_pre_backward_module_hook")
-        # STAGE3WARN: _pre_backward_module_hook's second argument `input is not used, so we just pass a None here.
-        # STAGE3WARN: part of the original _pre_backward_module_hook can be traced correctly so we moved them into
+        # STAGE3WARN#7: _pre_backward_module_hook's second argument `input is not used, so we just pass a None here.
+        # STAGE3WARN#8: part of the original _pre_backward_module_hook can be traced correctly so we moved them into
         # _wrap_post_forward_module_hook above.
         updated_outputs = _pre_backward_module_hook(module, None, updated_outputs)
 
diff --git a/orttraining/orttraining/python/training/utils/torch_type_map.py b/orttraining/orttraining/python/training/utils/torch_type_map.py
index 699747723f457..bdacab8ad04fe 100644
--- a/orttraining/orttraining/python/training/utils/torch_type_map.py
+++ b/orttraining/orttraining/python/training/utils/torch_type_map.py
@@ -33,6 +33,8 @@
 
 _DTYPE_TO_ONNX = {torch_dtype: onnx_dtype for k, (onnx_dtype, torch_dtype) in _CAST_PYTORCH_TO_ONNX.items()}
 
+_ONNX_TO_DTYPE = {onnx_dtype: torch_dtype for torch_dtype, onnx_dtype in _DTYPE_TO_ONNX.items()}
+
 
 def pytorch_dtype_to_onnx(dtype_or_scalar_type: Union[torch.dtype, str]) -> torch.onnx.TensorProtoDataType:
     """Converts a pytorch dtype or scalar type string to an onnx dtype."""
@@ -45,3 +47,10 @@ def pytorch_dtype_to_onnx(dtype_or_scalar_type: Union[torch.dtype, str]) -> torc
     if dtype not in _DTYPE_TO_ONNX:
         raise RuntimeError(f"Unsupported dtype {dtype}")
     return _DTYPE_TO_ONNX[dtype]
+
+
+def onnx_dtype_to_pytorch(dtype: torch.onnx.TensorProtoDataType) -> torch.dtype:
+    """Converts an onnx dtype to a pytorch dtype."""
+    if dtype not in _ONNX_TO_DTYPE:
+        raise RuntimeError(f"Unsupported dtype {dtype}")
+    return _ONNX_TO_DTYPE[dtype]
diff --git a/orttraining/orttraining/training_ops/cpu/torch/torch_custom_function_kernel_base.cc b/orttraining/orttraining/training_ops/cpu/torch/torch_custom_function_kernel_base.cc
index 4e7fcbc95bb1d..e1d4be24861f5 100644
--- a/orttraining/orttraining/training_ops/cpu/torch/torch_custom_function_kernel_base.cc
+++ b/orttraining/orttraining/training_ops/cpu/torch/torch_custom_function_kernel_base.cc
@@ -153,8 +153,11 @@ void PythonOpBase::RunForward(OpKernelContext* context,
       inplace_ != 0,
       kernel_invoke_id_);
 
-  ORT_ENFORCE(1 + returned_ortvalues.size() == static_cast<size_t>(context->OutputCount()),
-              "Output count mismatch for PythonOp run");
+  const size_t returned_output_count = 1 + returned_ortvalues.size();
+  const size_t kernel_output_count = static_cast<size_t>(context->OutputCount());
+  ORT_ENFORCE(returned_output_count == kernel_output_count, "Output count mismatch for PythonOp run, ",
+              "returned_output_count: ", returned_output_count, ", expected kernel_output_count: ",
+              kernel_output_count);
 }
 
 void PythonOpBase::SetOutputs(OpKernelContext* context, void* diff_ctx, std::vector<OrtValue>& returned_args) const {

From 1bc215e1d1c1e3509a1dd0bc413b1537563dedb5 Mon Sep 17 00:00:00 2001
From: Yiming Hu <woinck@users.noreply.github.com>
Date: Thu, 21 Sep 2023 19:22:28 -0700
Subject: [PATCH 08/14] [VITISAI] add float16 and bfloat16 support (#17438)

### Description
Add float16 and bfloat16 data type support for VitisAI ep


### Motivation and Context
The VitisAI EP has added bfloat data type support, so we would like to
register these data types on the onnxruntime side to enable them.

---------

Signed-off-by: Yiming Hu <yiming.hu@amd.com>
---
 onnxruntime/core/providers/vitisai/README.md               | 2 +-
 onnxruntime/core/providers/vitisai/imp/register_xir_ops.cc | 7 +++++--
 2 files changed, 6 insertions(+), 3 deletions(-)

diff --git a/onnxruntime/core/providers/vitisai/README.md b/onnxruntime/core/providers/vitisai/README.md
index 15e0c804489c5..6ddb58b8d96ae 100644
--- a/onnxruntime/core/providers/vitisai/README.md
+++ b/onnxruntime/core/providers/vitisai/README.md
@@ -1,4 +1,4 @@
-VitsAI Execution Prividers
+VitisAI Execution Provider
 ============================
 
 
diff --git a/onnxruntime/core/providers/vitisai/imp/register_xir_ops.cc b/onnxruntime/core/providers/vitisai/imp/register_xir_ops.cc
index 544e18350635d..ee8dfc6d03d12 100644
--- a/onnxruntime/core/providers/vitisai/imp/register_xir_ops.cc
+++ b/onnxruntime/core/providers/vitisai/imp/register_xir_ops.cc
@@ -34,9 +34,12 @@ static void xir_shape_infer(ONNX_NAMESPACE::InferenceContext& ctx) {
     updateOutputElemType(ctx, 0, ONNX_NAMESPACE::TensorProto::INT64);
   } else if (data_type->s() == "int1") {
     updateOutputElemType(ctx, 0, ONNX_NAMESPACE::TensorProto::BOOL);
+  } else if (data_type->s() == "bfloat16") {
+    updateOutputElemType(ctx, 0, ONNX_NAMESPACE::TensorProto::BFLOAT16);
+  } else if (data_type->s() == "float16") {
+    updateOutputElemType(ctx, 0, ONNX_NAMESPACE::TensorProto::FLOAT16);
   } else {
-    std::cerr << "not supported data_type " << data_type->s();
-    abort();
+    vai_assert(false, ", not supported data_type: " + data_type->s());
   }
   if (shape != nullptr) {
     for (auto i = 0; i < shape->ints_size(); ++i) {

From cd3fb377ea867570796cf61bc420cd985129a2a0 Mon Sep 17 00:00:00 2001
From: Jiajia Qin <jiajia.qin@intel.com>
Date: Fri, 22 Sep 2023 11:55:08 +0800
Subject: [PATCH 09/14] [js/webgpu] Allow binary ops with scalar to use the
 vectorize path (#17589)

### Description
1. For binary ops, the number of components is always 4, so the
dispatchGroup should be `{x: Math.ceil(outputSize / 64 /* workgroup size
*/ / 4 /* component size */)}` instead of `{x: Math.ceil(outputSize / 64
/* workgroup size */ / (vectorize ? 4 : 1) /* vec size */)}`.

2. If either a or b has only one element, we can still use the vectorized
path, since the same value will be broadcast.
---
 js/web/lib/wasm/jsep/webgpu/ops/binary-op.ts | 23 +++++++++++++++-----
 1 file changed, 17 insertions(+), 6 deletions(-)

diff --git a/js/web/lib/wasm/jsep/webgpu/ops/binary-op.ts b/js/web/lib/wasm/jsep/webgpu/ops/binary-op.ts
index 13d3a91bb339e..9c05080f7e118 100644
--- a/js/web/lib/wasm/jsep/webgpu/ops/binary-op.ts
+++ b/js/web/lib/wasm/jsep/webgpu/ops/binary-op.ts
@@ -62,14 +62,24 @@ const createBinaryOpProgramShader =
       let assignment: string;
       if (vectorize) {
         if (doBroadcast) {
-          assignment = `
+          const isAOneElement = ShapeUtil.size(dimsA) === 1;
+          const isBOneElement = ShapeUtil.size(dimsB) === 1;
+          if (isAOneElement || isBOneElement) {
+            assignment = output.setByOffset(
+                'global_idx',
+                expressionVector(
+                    isAOneElement ? `${a.type.value}(${a.getByOffset('0')}.x)` : a.getByOffset('global_idx'),
+                    isBOneElement ? `${b.type.value}(${b.getByOffset('0')}.x)` : b.getByOffset('global_idx')));
+          } else {
+            assignment = `
             let outputIndices = ${output.offsetToIndices('global_idx * 4u')};
             let offsetA = calcOffsetA(outputIndices);
             let offsetB = calcOffsetB(outputIndices);
             ${
-              output.setByOffset(
-                  'global_idx', expressionVector(a.getByOffset('offsetA / 4u'), b.getByOffset('offsetB / 4u')))}
+                output.setByOffset(
+                    'global_idx', expressionVector(a.getByOffset('offsetA / 4u'), b.getByOffset('offsetB / 4u')))}
           `;
+          }
         } else {
           assignment = output.setByOffset(
               'global_idx', expressionVector(a.getByOffset('global_idx'), b.getByOffset('global_idx')));
@@ -141,6 +151,8 @@ const createBinaryOpProgramInfo =
         }
         outputShape = calculatedShape;
         outputSize = ShapeUtil.size(outputShape);
+        const isAOneElement = ShapeUtil.size(a.dims) === 1;
+        const isBOneElement = ShapeUtil.size(b.dims) === 1;
 
         // check whether vectorize can be enabled
         let sharedDimension = 1;
@@ -153,7 +165,7 @@ const createBinaryOpProgramInfo =
             break;
           }
         }
-        if (sharedDimension % 4 === 0) {
+        if (sharedDimension % 4 === 0 || isAOneElement || isBOneElement) {
           vectorize = true;
         }
       } else {
@@ -167,8 +179,7 @@ const createBinaryOpProgramInfo =
             shaderHelper, a.dims, b.dims, outputShape, vectorize, isBroadcast, funcCall, a.dataType, b.dataType,
             outputDataType, additionalImplementation),
         outputs: [{dims: outputShape, dataType: outputDataType, gpuDataType: GpuDataType.default}],
-        dispatchGroup: () =>
-            ({x: Math.ceil(outputSize / 64 /* workgroup size */ / (vectorize ? 4 : 1) /* vec size */)})
+        dispatchGroup: () => ({x: Math.ceil(outputSize / 64 /* workgroup size */ / 4 /* component size */)})
       };
     };
 

From 891fba3b9cd71e2e1afdeab9fb3c5b5497db20cf Mon Sep 17 00:00:00 2001
From: Jiajia Qin <jiajia.qin@intel.com>
Date: Fri, 22 Sep 2023 12:00:36 +0800
Subject: [PATCH 10/14] [js/webgpu] Optimize Gather op (#17625)

### Description
This PR optimizes the Gather op, improving it by ~6 ms in the Segment
Anything model on ADL.
The problem with the original algorithm is that it uses a for loop to
process a whole block of data. However, the block size may be very
large, e.g. `65536`. In a GPU shader, we should avoid large loops and
instead use more threads to do the work in parallel.

Before:
```
[profiling] kernel "41771992|[Gather] 41771992" input[0]: [4,65536] | float32, input[1]: [1] | int64, output[0]: [1,65536] | float32, execution time: 6886207 ns
```
After:
```
[profiling] kernel "41771992|[Gather] 41771992" input[0]: [4,65536] | float32, input[1]: [1] | int64, output[0]: [1,65536] | float32, execution time: 11719 ns
```
---
 js/web/lib/wasm/jsep/webgpu/ops/common.ts |  2 +-
 js/web/lib/wasm/jsep/webgpu/ops/gather.ts | 91 ++++++++++-------------
 2 files changed, 42 insertions(+), 51 deletions(-)

diff --git a/js/web/lib/wasm/jsep/webgpu/ops/common.ts b/js/web/lib/wasm/jsep/webgpu/ops/common.ts
index c054da51a3098..0ab777bfbdee9 100644
--- a/js/web/lib/wasm/jsep/webgpu/ops/common.ts
+++ b/js/web/lib/wasm/jsep/webgpu/ops/common.ts
@@ -366,7 +366,7 @@ const createIndicesHelper =
 
       const getByIndicesImplementation = rank < 2 ? '' : `
   fn get_${name}ByIndices(indices: ${type.indices}) -> ${valueType} {
-    return ${name}[i2o_${name}(indices)];
+    return ${getByOffset(`i2o_${name}(indices)`)};
   }`;
 
       const getImplementation = rank < 2 ? '' : (() => {
diff --git a/js/web/lib/wasm/jsep/webgpu/ops/gather.ts b/js/web/lib/wasm/jsep/webgpu/ops/gather.ts
index 0db060dbec54a..47aae13d6799d 100644
--- a/js/web/lib/wasm/jsep/webgpu/ops/gather.ts
+++ b/js/web/lib/wasm/jsep/webgpu/ops/gather.ts
@@ -1,13 +1,12 @@
 // Copyright (c) Microsoft Corporation. All rights reserved.
 // Licensed under the MIT License.
 
-import {DataType} from '../../../wasm-common';
 import {TensorView} from '../../tensor-view';
 import {ShapeUtil} from '../../util';
 import {AttributeWithCacheKey, createAttributeWithCacheKey} from '../attribute-with-cache-key';
 import {ComputeContext, GpuDataType, ProgramInfo, ProgramMetadata} from '../types';
 
-import {ShaderHelper} from './common';
+import {inputVariable, outputVariable, ShaderHelper} from './common';
 
 export interface GatherAttributes extends AttributeWithCacheKey {
   axis: number;
@@ -30,63 +29,55 @@ const createGatherProgramInfo =
       const outputShape = inputShape.slice(0);
       outputShape.splice(axis, 1, ...indicesShape);
 
-      const inputDataType = inputs[0].dataType;
-      const block = ShapeUtil.sizeFromDimension(inputShape, axis + 1);
-      const elementSize = [DataType.int64, DataType.uint64, DataType.double].includes(inputDataType) ? 2 : 1;
-      const indicesElementSize = inputs[1].dataType === DataType.int64 ? 2 : 1;
-      const blockSize = elementSize * block;
-      const M = ShapeUtil.sizeToDimension(inputShape, axis);
-      const N = ShapeUtil.size(indicesShape);
-      const dataBatchElements = ShapeUtil.sizeFromDimension(inputShape, axis) * elementSize;
-      const gatheredBatchElements = N * block * elementSize;
       const axisDimLimit = inputShape[axis];
+      const outputSize = ShapeUtil.size(outputShape);
+
+      const data = inputVariable('data', inputs[0].dataType, inputs[0].dims);
+      const indices = inputVariable('inputIndices', inputs[1].dataType, inputs[1].dims);
+      const output = outputVariable('output', inputs[0].dataType, outputShape);
+      const calcDataIndices = (): string => {
+        const indicesRank = indicesShape.length;
+        let calcStr = `var indicesIndices  = ${indices.type.indices}(0);`;
+        for (let i = 0; i < indicesRank; i++) {
+          calcStr += `${indicesRank > 1 ? `indicesIndices[${i}]` : 'indicesIndices'} = ${
+              outputShape.length > 1 ? `outputIndices[${axis + i}]` : 'outputIndices'};`;
+        }
+        calcStr += `
+        var idx = ${indices.getByIndices('indicesIndices')};
+        if (idx < 0) {
+          idx = idx + ${axisDimLimit};
+        }
+        var dataIndices = ${data.type.indices}(0);
+      `;
+        for (let i = 0, j = 0; i < inputRank; i++) {
+          if (i === axis) {
+            calcStr += `${inputRank > 1 ? `dataIndices[${i}]` : 'dataIndices'} = u32(idx);`;
+            j += indicesRank;
+          } else {
+            calcStr += `${inputRank > 1 ? `dataIndices[${i}]` : 'dataIndices'} = ${
+                outputShape.length > 1 ? `outputIndices[${j}]` : 'outputIndices'};`;
+            j++;
+          }
+        }
+        return calcStr;
+      };
 
-      const inputSize = ShapeUtil.size(inputShape) * elementSize;
-      const outputSize = ShapeUtil.size(outputShape) * elementSize;
-
-      const totalGathers = M * N;
-      // int64 indices would be treated as little endian i32 with assumption they fall in i32 limits
-      // That assumption is safe as it's not possible to allocate >2gb buffer for input tensor
-      // Input data will be treated as u32 or two u32 for 8-byte tensors
       const getShaderSource = (shaderHelper: ShaderHelper) => `
-  const N: u32 = ${N};
-  const elementSize: u32 = ${elementSize};
-  const indicesElementSize: u32 = ${indicesElementSize};
-
-  @group(0) @binding(0) var<storage, read> input : array<u32>;
-  @group(0) @binding(1) var<storage, read> inputIndices : array<i32>;
-  @group(0) @binding(2) var<storage, read_write> output: array<u32>;
-
-  ${shaderHelper.mainStart()}
-    let batch: u32 = global_idx / N;
-    let i: u32 = global_idx % N;
-
-    let srcOffsetBatch: u32 = batch * ${dataBatchElements};
-    let dstOffsetBatch: u32 = batch * ${gatheredBatchElements};
-    var idx = inputIndices[i * indicesElementSize];
-    if (idx < 0) {
-        idx = idx + ${axisDimLimit};
-    }
-
-    let srcOffset = srcOffsetBatch + u32(idx) * ${blockSize};
-    let dstOffset = dstOffsetBatch + i * ${blockSize};
-    if (srcOffset >= ${inputSize}) {
-        return;
-    }
-    if (dstOffset >= ${outputSize}) {
-        return;
-    }
-    for (var j: u32 = 0; j < ${blockSize}; j++) {
-        output[dstOffset + j] = input[srcOffset + j];
-    }
-  }`;
+      ${shaderHelper.declareVariables(data, indices, output)}
+      ${shaderHelper.mainStart()}
+        ${shaderHelper.guardAgainstOutOfBoundsWorkgroupSizes(outputSize)}
+        let outputIndices = ${output.offsetToIndices('global_idx')};
+        ${calcDataIndices()};
+        let value = ${data.getByIndices('dataIndices')};
+        ${output.setByOffset('global_idx', 'value')};
+      }`;
       return {
         ...metadata,
         outputs: [
           {dims: outputShape, dataType: inputs[0].dataType, gpuDataType: GpuDataType.default},
         ],
         getShaderSource,
-        dispatchGroup: () => ({x: Math.ceil(totalGathers / 64 /* workgroup size */)})
+        dispatchGroup: () => ({x: Math.ceil(outputSize / 64 /* workgroup size */)})
       };
     };
 

From 55b16d347cbcde41b35c3ed12f34eeca1a1b05d6 Mon Sep 17 00:00:00 2001
From: Yi Zhang <zhanyi@microsoft.com>
Date: Sat, 23 Sep 2023 00:50:36 +0800
Subject: [PATCH 11/14] Read model zoo test (#17666)

---
 onnxruntime/test/providers/cpu/model_tests.cc | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/onnxruntime/test/providers/cpu/model_tests.cc b/onnxruntime/test/providers/cpu/model_tests.cc
index ef2d7e31654ba..9b41ba8c0d2ba 100644
--- a/onnxruntime/test/providers/cpu/model_tests.cc
+++ b/onnxruntime/test/providers/cpu/model_tests.cc
@@ -1133,11 +1133,15 @@ ::std::vector<::std::basic_string<ORTCHAR_T>> GetParameterStrings() {
 #if defined(NDEBUG) || defined(RUN_MODELTEST_IN_DEBUG_MODE)
 #ifdef _WIN32
     ORT_STRING_VIEW model_test_root_path = ORT_TSTR("..\\models");
+    // thus, only the root path should be mounted.
+    ORT_STRING_VIEW model_zoo_path = ORT_TSTR("..\\models\\zoo");
 #else
     ORT_STRING_VIEW model_test_root_path = ORT_TSTR("../models");
+    ORT_STRING_VIEW model_zoo_path = ORT_TSTR("../models/zoo");
 #endif
     for (auto p : kvp.second) {
       paths.push_back(ConcatPathComponent(model_test_root_path, p));
+      paths.push_back(ConcatPathComponent(model_zoo_path, p));
     }
 #endif
 

From 6d7bc2a097a1a08541cd0d4628831c79ab8092d5 Mon Sep 17 00:00:00 2001
From: Lukas Berbuer <36054362+lukasberbuer@users.noreply.github.com>
Date: Fri, 22 Sep 2023 18:54:38 +0200
Subject: [PATCH 12/14] Fix ARMv7 build (#13891)

Fix ARMv7 build error on Linux.

### Description

`cpuinfo_*` functions are only available if `CPUINFO_SUPPORTED` is set and
`"cpuinfo.h"` is therefore included.
Fixed by extending the conditional compilation block to cover the
`cpuinfo_*` calls.

### Motivation and Context
Compilation for ARMv7 on Linux systems fails.
---
 onnxruntime/core/common/cpuid_info.cc | 54 +++++++++++++--------------
 1 file changed, 25 insertions(+), 29 deletions(-)

diff --git a/onnxruntime/core/common/cpuid_info.cc b/onnxruntime/core/common/cpuid_info.cc
index a23409292bb74..6a82b3fcc734d 100644
--- a/onnxruntime/core/common/cpuid_info.cc
+++ b/onnxruntime/core/common/cpuid_info.cc
@@ -135,38 +135,34 @@ void CPUIDInfo::ArmLinuxInit() {
     LOGS_DEFAULT(WARNING) << "Failed to init pytorch cpuinfo library, may cause CPU EP performance degradation due to undetected CPU features.";
     return;
   }
+  is_hybrid_ = cpuinfo_get_uarchs_count() > 1;
+  has_arm_neon_dot_ = cpuinfo_has_arm_neon_dot();
+  has_fp16_ = cpuinfo_has_arm_neon_fp16_arith();
+  const uint32_t core_cnt = cpuinfo_get_cores_count();
+  core_uarchs_.resize(core_cnt, cpuinfo_uarch_unknown);
+  is_armv8_narrow_ld_.resize(core_cnt, false);
+  for (uint32_t c = 0; c < core_cnt; c++) {
+    const struct cpuinfo_processor* proc = cpuinfo_get_processor(c);
+    if (proc == nullptr) {
+      continue;
+    }
+    const struct cpuinfo_core* corep = proc->core;
+    if (corep == nullptr) {
+      continue;
+    }
+    auto coreid = proc->linux_id;
+    auto uarch = corep->uarch;
+    core_uarchs_[coreid] = uarch;
+    if (uarch == cpuinfo_uarch_cortex_a53 || uarch == cpuinfo_uarch_cortex_a55r0 ||
+        uarch == cpuinfo_uarch_cortex_a55) {
+      is_armv8_narrow_ld_[coreid] = true;
+    }
+  }
 #else
   pytorch_cpuinfo_init_ = false;
+  has_arm_neon_dot_ = ((getauxval(AT_HWCAP) & HWCAP_ASIMDDP) != 0);
+  has_fp16_ |= has_arm_neon_dot_;
 #endif
-
-  if (pytorch_cpuinfo_init_) {
-    is_hybrid_ = cpuinfo_get_uarchs_count() > 1;
-    has_arm_neon_dot_ = cpuinfo_has_arm_neon_dot();
-    has_fp16_ = cpuinfo_has_arm_neon_fp16_arith();
-    const uint32_t core_cnt = cpuinfo_get_cores_count();
-    core_uarchs_.resize(core_cnt, cpuinfo_uarch_unknown);
-    is_armv8_narrow_ld_.resize(core_cnt, false);
-    for (uint32_t c = 0; c < core_cnt; c++) {
-      const struct cpuinfo_processor* proc = cpuinfo_get_processor(c);
-      if (proc == nullptr) {
-        continue;
-      }
-      const struct cpuinfo_core* corep = proc->core;
-      if (corep == nullptr) {
-        continue;
-      }
-      auto coreid = proc->linux_id;
-      auto uarch = corep->uarch;
-      core_uarchs_[coreid] = uarch;
-      if (uarch == cpuinfo_uarch_cortex_a53 || uarch == cpuinfo_uarch_cortex_a55r0 ||
-          uarch == cpuinfo_uarch_cortex_a55) {
-        is_armv8_narrow_ld_[coreid] = true;
-      }
-    }
-  } else {
-    has_arm_neon_dot_ = ((getauxval(AT_HWCAP) & HWCAP_ASIMDDP) != 0);
-    has_fp16_ |= has_arm_neon_dot_;
-  }
 }
 
 #elif defined(_WIN32)

From e70a23f8dc6fc181218106f0e12730f980cc867e Mon Sep 17 00:00:00 2001
From: Adrian Lizarraga <adlizarraga@microsoft.com>
Date: Fri, 22 Sep 2023 10:52:47 -0700
Subject: [PATCH 13/14] [QNN EP] Integrate Resize op fixes from QNN 2.14.1
 (#17641)

### Description
QNN SDK version 2.14.1 fixed several issues with the QNN Resize
operator. This PR integrates the fixes and simplifies the
implementation.

### Motivation and Context
Improve Resize operator and test coverage.
---
 .../builder/opbuilder/resize_op_builder.cc    | 379 ++++++------------
 .../providers/cpu/tensor/resize_op_test.cc    |  38 +-
 onnxruntime/test/providers/qnn/resize_test.cc | 224 ++++++++---
 3 files changed, 308 insertions(+), 333 deletions(-)

diff --git a/onnxruntime/core/providers/qnn/builder/opbuilder/resize_op_builder.cc b/onnxruntime/core/providers/qnn/builder/opbuilder/resize_op_builder.cc
index 511f2a5149f2e..4039c4fbf8d70 100644
--- a/onnxruntime/core/providers/qnn/builder/opbuilder/resize_op_builder.cc
+++ b/onnxruntime/core/providers/qnn/builder/opbuilder/resize_op_builder.cc
@@ -2,7 +2,8 @@
 // Licensed under the MIT License.
 
 #include <array>
-#include <string_view>
+#include <cassert>
+#include <unordered_map>
 
 #include "core/providers/common.h"
 #include "core/providers/shared/utils/utils.h"
@@ -42,76 +43,6 @@ class ResizeOpBuilder : public BaseOpBuilder {
                                      bool do_op_validation) const override ORT_MUST_USE_RESULT;
 
  private:
-  /**
-   * Returns the QNN integer value that corresponds to the given ONNX mode (string).
-   *
-   * /param onnx_modes Array of ONNX modes supported by QNN. The index of each mode corresponds to the QNN value.
-   * /param onnx_mode The ONNX mode for which to get the corresponding QNN value.
-   * /param onnx_model_label Mode label to print out in case of error (e.g., "nearest_mode").
-   * /param qnn_mode Output parameter that is set to the appropriate QNN value from the given ONNX mode.
-   *
-   * /returns A status indicating failure or success.
-   */
-  template <typename QnnValType, std::size_t N>
-  Status GetQnnModeFromString(const std::array<std::string_view, N>& onnx_modes, std::string_view onnx_mode,
-                              const char* onnx_mode_label, QnnValType& qnn_mode) const ORT_MUST_USE_RESULT;
-
-  /**
-   * Called by IsOpSupported to validate the op for non-quantized models.
-   *
-   * /param qnn_model_wrapper The QNN model wrapper instance.
-   * /param node_unit The node unit containing metadata for the ONNX Resize operator.
-   *
-   * /returns A status indicating failure or success.
-   */
-  Status ValidateOp(QnnModelWrapper& qnn_model_wrapper, const NodeUnit& node_unit) const ORT_MUST_USE_RESULT;
-
-  /**
-   * Called by IsOpSupported to validate the op for quantized models.
-   *
-   * /param qnn_model_wrapper The QNN model wrapper instance.
-   * /param node_unit The node unit containing metadata for the ONNX Resize operator and its Q/DQ nodes.
-   *
-   * /returns A status indicating failure or success.
-   */
-  Status ValidateQDQOp(QnnModelWrapper& qnn_model_wrapper, const NodeUnit& node_unit) const ORT_MUST_USE_RESULT;
-
-  /**
-   * Called by ProcessAttributesAndOutputs to process the op's attributes and outputs
-   * for non-quantized models.
-   *
-   * /param qnn_model_wrapper The QNN model wrapper instance.
-   * /param node_unit The node unit containing metadata for the ONNX Resize operator.
-   * /param input_names The operator's input names.
-   * /param logger A logger.
-   * /param do_op_validation Set to true if the op should be validated using QNN's validation API.
-   *
-   * /returns A status indicating failure or success.
-   */
-  Status ProcessOpAttrsAndOutputs(QnnModelWrapper& qnn_model_wrapper,
-                                  const NodeUnit& node_unit,
-                                  std::vector<std::string>&& input_names,
-                                  const logging::Logger& logger,
-                                  bool do_op_validation) const ORT_MUST_USE_RESULT;
-
-  /**
-   * Called by ProcessAttributesAndOutputs to process the op's attributes and outputs
-   * for quantized models.
-   *
-   * /param qnn_model_wrapper The QNN model wrapper instance.
-   * /param node_unit The node unit containing metadata for the ONNX Resize operator and its Q/DQ nodes.
-   * /param input_names The operator's input names.
-   * /param logger A logger.
-   * /param do_op_validation Set to true if the op should be validated using QNN's validation API.
-   *
-   * /returns A status indicating failure or success.
-   */
-  Status ProcessQDQOpAttrsAndOutputs(QnnModelWrapper& qnn_model_wrapper,
-                                     const NodeUnit& node_unit,
-                                     std::vector<std::string>&& input_names,
-                                     const logging::Logger& logger,
-                                     bool do_op_validation) const ORT_MUST_USE_RESULT;
-
   // Info for each ONNX attribute of interest (attribute name + default value)
   static const OnnxAttrInfo<std::string> onnx_mode_attr;
   static const OnnxAttrInfo<std::string> onnx_coord_transf_mode_attr;
@@ -119,21 +50,29 @@ class ResizeOpBuilder : public BaseOpBuilder {
   static const OnnxAttrInfo<int64_t> onnx_antialias_attr;
   static const OnnxAttrInfo<int64_t> onnx_exclude_outside_attr;
 
-  // Arrays of supported QNN modes for QNN's Resize op. The index of each mode is used as the corresponding
-  // QNN parameter value. Ex: The "nearest" mode is represented as the value 0 in QNN. Note, that
-  // not all modes are supported by every QNN backend.
+  // Tables that map an ONNX attribute value (string) to the corresponding integer (enum) QNN parameter value.
+  // Ex: The "half_pixel" coordinate_transformation_mode is represented as the value 0 in QNN.
+  // Only the modes supported by QNN Resize are mapped by these tables.
+  static const std::unordered_map<std::string, uint32_t> supported_modes;
+  static const std::unordered_map<std::string, uint32_t> supported_coord_transf_modes;
+  static const std::unordered_map<std::string, uint32_t> supported_nearest_modes;
+};
 
-  // QNN values: NEAREST = 0, LINEAR = 1
-  static constexpr std::array<std::string_view, 2> supported_modes = {"nearest", "linear"};
+const std::unordered_map<std::string, uint32_t> ResizeOpBuilder::supported_modes = {
+    {"nearest", QNN_OP_RESIZE_INTERPOLATION_MODE_NEAREST},
+    {"linear", QNN_OP_RESIZE_INTERPOLATION_MODE_LINEAR}};
 
-  // QNN values: HALF_PIXEL = 0, PYTORCH_HALF_PIXEL = 1, ALIGN_CORNERS = 2, ASYMMETRIC = 3
-  static constexpr std::array<std::string_view, 4> supported_coord_transf_modes = {"half_pixel", "pytorch_half_pixel",
-                                                                                   "align_corners", "asymmetric"};
+const std::unordered_map<std::string, uint32_t> ResizeOpBuilder::supported_coord_transf_modes = {
+    {"half_pixel", QNN_OP_RESIZE_TRANSFORMATION_MODE_HALF_PIXEL},
+    {"pytorch_half_pixel", QNN_OP_RESIZE_TRANSFORMATION_MODE_PYTORCH_HALF_PIXEL},
+    {"align_corners", QNN_OP_RESIZE_TRANSFORMATION_MODE_ALIGN_CORNERS},
+    {"asymmetric", QNN_OP_RESIZE_TRANSFORMATION_MODE_ASYMMETRIC}};
 
-  // QNN values: ROUND_PREFER_FLOOR = 0, ROUND_PREFER_CEIL = 1, FLOOR = 2, CEIL = 3
-  static constexpr std::array<std::string_view, 4> supported_nearest_modes = {"round_prefer_floor", "round_prefer_ceil",
-                                                                              "floor", "ceil"};
-};
+const std::unordered_map<std::string, uint32_t> ResizeOpBuilder::supported_nearest_modes = {
+    {"round_prefer_floor", QNN_OP_RESIZE_NEAREST_MODE_ROUND_PREFER_FLOOR},
+    {"round_prefer_ceil", QNN_OP_RESIZE_NEAREST_MODE_ROUND_PREFER_CEIL},
+    {"floor", QNN_OP_RESIZE_NEAREST_MODE_FLOOR},
+    {"ceil", QNN_OP_RESIZE_NEAREST_MODE_CEIL}};
 
 const OnnxAttrInfo<std::string> ResizeOpBuilder::onnx_mode_attr = {"mode", "nearest"};
 const OnnxAttrInfo<std::string> ResizeOpBuilder::onnx_coord_transf_mode_attr = {"coordinate_transformation_mode",
@@ -143,19 +82,26 @@ const OnnxAttrInfo<std::string> ResizeOpBuilder::onnx_nearest_mode_attr = {"near
 const OnnxAttrInfo<int64_t> ResizeOpBuilder::onnx_antialias_attr = {"antialias", 0};
 const OnnxAttrInfo<int64_t> ResizeOpBuilder::onnx_exclude_outside_attr = {"exclude_outside", 0};
 
-template <typename QnnValType, std::size_t N>
-Status ResizeOpBuilder::GetQnnModeFromString(const std::array<std::string_view, N>& onnx_modes,
-                                             std::string_view onnx_mode, const char* onnx_mode_label,
-                                             QnnValType& qnn_mode) const {
-  for (size_t i = 0; i < onnx_modes.size(); ++i) {
-    if (onnx_modes[i] == onnx_mode) {
-      qnn_mode = SafeInt<QnnValType>(i);
-      return Status::OK();
-    }
+// Returns the QNN parameter integer value that corresponds to the given ONNX attribute mode string value.
+static Status GetQnnModeValFromOnnxString(const std::unordered_map<std::string, uint32_t>& supported_qnn_modes,
+                                          const std::string& onnx_attr_value,
+                                          const char* onnx_attr_name,
+                                          uint32_t& qnn_mode_value) {
+  auto it = supported_qnn_modes.find(onnx_attr_value);
+  if (it != supported_qnn_modes.end()) {
+    qnn_mode_value = it->second;
+    return Status::OK();
   }
 
-  return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "QNN EP: Resize operator does not support ", onnx_mode_label,
-                         " ", std::string(onnx_mode));
+  return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "QNN EP: Resize operator does not support ", onnx_attr_name,
+                         " ", std::string(onnx_attr_value));
+}
+
+// Returns true if the given ONNX attribute mode value is generally supported on QNN. Note that
+// different QNN backends may support a smaller subset of modes.
+static bool IsOnnxAttrModeSupported(const std::unordered_map<std::string, uint32_t>& supported_qnn_modes,
+                                    const std::string& onnx_attr_value) {
+  return supported_qnn_modes.find(onnx_attr_value) != supported_qnn_modes.end();
 }
 
 // Resize ops are sensitive with data layout, no special validation so far
@@ -169,118 +115,95 @@ Status ResizeOpBuilder::IsOpSupported(QnnModelWrapper& qnn_model_wrapper,
     return AddToModelBuilder(qnn_model_wrapper, node_unit, logger, true);
   }
 
+  const bool is_npu_backend = IsNpuBackend(qnn_model_wrapper.GetQnnBackendType());
+  NodeAttrHelper node_helper(node_unit);
+
   // QNN doesn't support anti-aliasing (added in opset 18)
   if (node_unit.SinceVersion() >= 18) {
-    NodeAttrHelper node_helper(node_unit);
     const bool antialias = GetOnnxAttr(node_helper, onnx_antialias_attr) != 0;
     ORT_RETURN_IF(antialias, "QNN EP: Resize doesn't support anti-aliasing.");
   }
 
-  // The QNN Resize op does not currently work with the QNN cpu backend, but works with the HTP backend. Therefore, we
-  // currently use QNN's Resize op for quantized models and either ResizeBilinear or ResizeNearestNeighbor for
-  // non-quantized models. This requires separate validation for quantized models.
-  // TODO: Use only Resize once QNN's Resize op works in the QNN cpu backend.
-  bool is_npu_backend = IsNpuBackend(qnn_model_wrapper.GetQnnBackendType());
-  return is_npu_backend ? ValidateQDQOp(qnn_model_wrapper, node_unit) : ValidateOp(qnn_model_wrapper, node_unit);
-}
-
-Status ResizeOpBuilder::ValidateOp(QnnModelWrapper& qnn_model_wrapper, const NodeUnit& node_unit) const {
-  NodeAttrHelper node_helper(node_unit);
-  const std::string resize_mode = GetOnnxAttr(node_helper, onnx_mode_attr);
-  ORT_RETURN_IF((resize_mode != "nearest") && (resize_mode != "linear"),
-                "QNN EP: Resize doesn't support mode '", resize_mode.c_str(), "'.",
-                "Only 'nearest' and 'linear' are supported.");
-
-  const std::string coordinate_mode = GetOnnxAttr(node_helper, onnx_coord_transf_mode_attr);
-  ORT_RETURN_IF((coordinate_mode != "half_pixel") && (coordinate_mode != "align_corners"),
-                "QNN EP: coordinate transformation mode '", coordinate_mode.c_str(), "' not supported for Resize op.",
-                "Only 'align_corners' and 'half_pixel' are supported.");
-
-  // Check for a valid "nearest_mode" if the mode is "nearest".
-  if (resize_mode == "nearest") {
-    // NOTE: QNN's ResizeNearestNeighbor operator does not have a way to specify rounding (i.e., "nearest_mode").
-    // The output of the QNN ResizeNearestNeighbor operator is not always equivalent to ONNX's Resize
-    // operator with any single specific "nearest_mode".
-    //
-    // For some input/output shapes, QNN's ResizeNearestNeighbor is equivalent to ONNX's Resize with "round_prefer_floor".
-    // For other shapes, QNN's ResizeNearestNeighbor is equivalent to ONNX Resize with "round_prefer_ceil".
-    //
-    // From unit tests, I've found a relationship between input/output shapes and the equivalent ONNX "nearest_mode".
-    // If the new and old spatial dimensions are evenly divisible, the "nearest_mode" is "round_prefer_floor".
-    // Otherwise, the "nearest_mode" is "round_prefer_ceil".
-    //
-    // This relationship is probably incomplete/wrong.
-    //
-    // TODO: Ask Qualcomm what the correct "nearest_mode" should be,
-    // OR use QNN's own Resize operator once it works on QnnCpu.
-    const std::string& nearest_mode = GetOnnxAttr(node_helper, onnx_nearest_mode_attr);
-    ORT_RETURN_IF_NOT("floor" == nearest_mode, "QNN Resize only supports nearest_mode: floor!");  // This is wrong!
-  }
-
-  auto& input_0 = node_unit.Inputs()[0];
-  std::vector<uint32_t> input_shape;
-  ORT_RETURN_IF_NOT(qnn_model_wrapper.GetOnnxShape(input_0.node_arg, input_shape),
-                    "QNN EP: Cannot get input shape for Resize op");
-
-  const auto& output_0 = node_unit.Outputs()[0];
-  std::vector<uint32_t> output_shape;
-  ORT_RETURN_IF_NOT(qnn_model_wrapper.GetOnnxShape(output_0.node_arg, output_shape),
-                    "QNN EP: Cannot get output shape for Resize op");
-
-  ORT_RETURN_IF(input_shape.size() != 4 || output_shape.size() != 4, "QNN Resize only supports 4D!");
-
-  ONNX_NAMESPACE::DataType input_data_type = input_0.node_arg.Type();
-  ORT_RETURN_IF(input_data_type != ONNX_NAMESPACE::Utils::DataTypeUtils::ToType("float"),
-                "QNN EP: Data type ", input_data_type->c_str(),
-                " is not supported for Resize operator in CPU backend.");
-
-  return Status::OK();
-}
-
-Status ResizeOpBuilder::ValidateQDQOp(QnnModelWrapper& qnn_model_wrapper, const NodeUnit& node_unit) const {
-  NodeAttrHelper node_helper(node_unit);
-
-  using namespace onnxruntime::qnn::utils;
   // Check mode
   const std::string interp_mode = GetOnnxAttr(node_helper, onnx_mode_attr);
-  ORT_RETURN_IF_NOT(ArrayHasString(supported_modes, interp_mode), "QNN EP: Resize does not support mode ",
+  ORT_RETURN_IF_NOT(IsOnnxAttrModeSupported(supported_modes, interp_mode), "QNN EP: Resize does not support mode ",
                     interp_mode.c_str());
 
   // Check coordinate transformation mode
   const std::string transformation_mode = GetOnnxAttr(node_helper, onnx_coord_transf_mode_attr);
-  ORT_RETURN_IF_NOT(ArrayHasString(supported_coord_transf_modes, transformation_mode),
+  ORT_RETURN_IF_NOT(IsOnnxAttrModeSupported(supported_coord_transf_modes, transformation_mode),
                     "QNN EP: Resize does not support coordinate_transformation_mode ", transformation_mode.c_str());
 
-  // Check nearest mode
+  const auto& input_0 = node_unit.Inputs()[0];
+  std::vector<uint32_t> input_shape;
+  ORT_RETURN_IF_NOT(qnn_model_wrapper.GetOnnxShape(input_0.node_arg, input_shape),
+                    "QNN EP: Cannot get shape for Resize input");
+  const size_t input_rank = input_shape.size();
+
+  // Validate Resize w/ "nearest" mode.
+  // Translation matrix of ONNX Resize w/ "nearest" mode on HTP backend.
+  // Table entries correspond to the QNN operator used for the given configuration
+  // (Resize = QNN Resize op, RNN = QNN ResizeNearestNeighbor op, X = Unsupported).
+  //
+  //                                                   nearest_mode:
+  // coordinate_transformation_mode: | round_prefer_floor  round_prefer_ceil  floor  ceil
+  // -----------------------------------------------------------------------------------------
+  //                      half_pixel |     Resize               X              RNN     X
+  //              pytorch_half_pixel |     Resize               X               X      X
+  //                   align_corners |     Resize               X              RNN     X
+  //                      asymmetric |     Resize               X              RNN     X
+
   if (interp_mode == "nearest") {
     const std::string nearest_mode = GetOnnxAttr(node_helper, onnx_nearest_mode_attr);
-    ORT_RETURN_IF_NOT(ArrayHasString(supported_nearest_modes, nearest_mode),
+    ORT_RETURN_IF_NOT(IsOnnxAttrModeSupported(supported_nearest_modes, nearest_mode),
                       "QNN EP: Resize does not support nearest_mode ", nearest_mode.c_str());
 
-    // TODO: Support 'asymmetric' transformation mode with nearest_mode != 'floor'.
-    //
-    // QNN's ONNX converter tool translates 'nearest' + 'asymmetric' (regardless of rounding mode)
-    // to QNN's ResizeNearestNeighbor with {align_corners: 0, half_pixel: 0}.
-    // This is only accurate if the rounding mode is "floor". Need to investigate how to handle
-    // other rounding modes with Qualcomm. Ideally, we would use QNN's Resize operator, but it doesn't support
-    // the "asymmetric" coordinate transformation mode on HTP.
-    ORT_RETURN_IF(transformation_mode == "asymmetric" && nearest_mode != "floor",
-                  "QNN EP: Resize with coordinate_transformation_mode 'asymmetric' and nearest_mode '", nearest_mode,
-                  "' is not currently supported on the HTP backend.");
+    if (is_npu_backend) {
+      // QNN only supports the following nearest_mode values on HTP:
+      // - "round_prefer_floor" via QNN's Resize operator
+      // - "floor" via QNN's ResizeNearestNeighbor operator
+      //
+      // QNN validation does not throw an error if unsupported nearest_mode values are used, so we have to
+      // catch them here. Otherwise, accuracy is significantly degraded.
+      ORT_RETURN_IF_NOT(nearest_mode == "round_prefer_floor" || nearest_mode == "floor",
+                        "QNN EP: Resize on the NPU does not support nearest_mode ", nearest_mode.c_str());
+
+      const bool use_resize_nn_op = nearest_mode == "floor";
+
+      // If HTP uses ResizeNearestNeighbor ("floor"), then the "pytorch_half_pixel" coordinate_transformation_mode
+      // is not supported.
+      ORT_RETURN_IF(use_resize_nn_op && transformation_mode == "pytorch_half_pixel",
+                    "QNN EP: Resize on the NPU does not support the combination of nearest_mode == 'floor'",
+                    " and coordinate_transformation_mode == 'pytorch_half_pixel'.");
+
+      // QNN's ResizeNearestNeighbor requires rank 4 inputs.
+      ORT_RETURN_IF(use_resize_nn_op && input_rank != 4,
+                    "QNN EP: Resize on the NPU with nearest_mode == 'floor' requires an input with rank 4.");
+    }
   }
 
-  // Check that input shape has at least a rank of 3.
-  const auto& input_0 = node_unit.Inputs()[0];
-  std::vector<uint32_t> input_shape;
-  ORT_RETURN_IF_NOT(qnn_model_wrapper.GetOnnxShape(input_0.node_arg, input_shape),
-                    "QNN EP: Cannot get shape for Resize input");
-  ORT_RETURN_IF(input_shape.size() < 3, "QNN EP: Resize input must have a rank >= 3.");
+  // Check that the input shape has at least a rank of 3 (and a max of 5 on HTP).
+  ORT_RETURN_IF(input_rank < 3 || (is_npu_backend && input_rank > 5),
+                "QNN EP: Resize input must have a rank >= 3. The maximum rank is 5 on the NPU.");
 
   const auto& output_0 = node_unit.Outputs()[0];
   std::vector<uint32_t> output_shape;
   ORT_RETURN_IF_NOT(qnn_model_wrapper.GetOnnxShape(output_0.node_arg, output_shape),
                     "QNN EP: Cannot get shape for Resize output");
-  ORT_RETURN_IF(output_shape.size() < 3, "QNN EP: Resize output must have a rank >= 3.");
+
+  // Check that only the spatial dimensions (width, height) are resized. The batch_size (N) and channels (C) should
+  // be untouched. This code runs before layout transformation, so we know that the current layout is "channel first"
+  // (e.g., N, C, S1, S2, ..., SN), and that the minimum rank is 3.
+  assert(node_unit.Domain() != kMSInternalNHWCDomain);
+  ORT_RETURN_IF_NOT(input_shape[0] == output_shape[0] && input_shape[1] == output_shape[1],
+                    "QNN EP: Resize may only change the spatial dimensions.");
+
+  if (!is_npu_backend) {
+    ONNX_NAMESPACE::DataType input_data_type = input_0.node_arg.Type();
+    ORT_RETURN_IF(input_data_type != ONNX_NAMESPACE::Utils::DataTypeUtils::ToType("float"),
+                  "QNN EP: Data type ", input_data_type->c_str(),
+                  " is not supported for Resize operator in CPU backend.");
+  }
 
   return Status::OK();
 }
@@ -305,92 +228,34 @@ Status ResizeOpBuilder::ProcessAttributesAndOutputs(QnnModelWrapper& qnn_model_w
                                                     std::vector<std::string>&& input_names,
                                                     const logging::Logger& logger,
                                                     bool do_op_validation) const {
-  // The QNN Resize op does not currently work with the QNN cpu backend, but works with the HTP backend. Therefore, we
-  // currently use QNN's Resize op for quantized models and either ResizeBilinear or ResizeNearestNeighbor for
-  // non-quantized models. This requires separate handling for quantized models.
-  // TODO: Use only Resize once QNN's Resize op works in the QNN cpu backend.
-  bool is_quantized_node = NodeUnit::Type::QDQGroup == node_unit.UnitType();
-  return is_quantized_node ? ProcessQDQOpAttrsAndOutputs(qnn_model_wrapper, node_unit, std::move(input_names), logger, do_op_validation) : ProcessOpAttrsAndOutputs(qnn_model_wrapper, node_unit, std::move(input_names), logger, do_op_validation);
-}
-
-Status ResizeOpBuilder::ProcessOpAttrsAndOutputs(QnnModelWrapper& qnn_model_wrapper,
-                                                 const NodeUnit& node_unit,
-                                                 std::vector<std::string>&& input_names,
-                                                 const logging::Logger& logger,
-                                                 bool do_op_validation) const {
-  ORT_UNUSED_PARAMETER(logger);
-  NodeAttrHelper node_helper(node_unit);
-  const std::string resize_mode = GetOnnxAttr(node_helper, onnx_mode_attr);
-  std::string qnn_node_type = "ResizeNearestNeighbor";
-  if ("linear" == resize_mode) {
-    qnn_node_type = "ResizeBilinear";
-  }
-
-  const std::string coordinate_mode = GetOnnxAttr(node_helper, onnx_coord_transf_mode_attr);
-
-  Qnn_Scalar_t qnn_align_corners = QNN_SCALAR_INIT;
-  qnn_align_corners.dataType = QNN_DATATYPE_BOOL_8;
-  qnn_align_corners.bool8Value = static_cast<uint8_t>(0);
-
-  Qnn_Scalar_t qnn_half_pixel = QNN_SCALAR_INIT;
-  qnn_half_pixel.dataType = QNN_DATATYPE_BOOL_8;
-  qnn_half_pixel.bool8Value = static_cast<uint8_t>(0);
-
-  if ("align_corners" == coordinate_mode) {
-    qnn_align_corners.bool8Value = static_cast<uint8_t>(1);
-  } else if ("half_pixel" == coordinate_mode) {
-    qnn_half_pixel.bool8Value = static_cast<uint8_t>(1);
-  }
-  QnnParamWrapper qnn_align_corners_param(node_unit.Index(), node_unit.Name(),
-                                          QNN_OP_RESIZE_BILINEAR_PARAM_ALIGN_CORNERS, qnn_align_corners);
-  QnnParamWrapper qnn_half_pixel_param(node_unit.Index(), node_unit.Name(),
-                                       QNN_OP_RESIZE_BILINEAR_PARAM_HALF_PIXEL_CENTERS, qnn_half_pixel);
-
-  std::vector<std::string> param_tensor_names;
-  param_tensor_names.push_back(qnn_align_corners_param.GetParamTensorName());
-  qnn_model_wrapper.AddParamWrapper(std::move(qnn_align_corners_param));
-  param_tensor_names.push_back(qnn_half_pixel_param.GetParamTensorName());
-  qnn_model_wrapper.AddParamWrapper(std::move(qnn_half_pixel_param));
-
-  return ProcessOutputs(qnn_model_wrapper, node_unit, std::move(input_names), std::move(param_tensor_names),
-                        logger, do_op_validation, qnn_node_type);
-}
-
-Status ResizeOpBuilder::ProcessQDQOpAttrsAndOutputs(QnnModelWrapper& qnn_model_wrapper,
-                                                    const NodeUnit& node_unit,
-                                                    std::vector<std::string>&& input_names,
-                                                    const logging::Logger& logger,
-                                                    bool do_op_validation) const {
   std::vector<std::string> param_tensor_names;
   NodeAttrHelper node_helper(node_unit);
 
   const std::string interp_mode = GetOnnxAttr(node_helper, onnx_mode_attr);
   const std::string transformation_mode = GetOnnxAttr(node_helper, onnx_coord_transf_mode_attr);
+  const std::string nearest_mode = GetOnnxAttr(node_helper, onnx_nearest_mode_attr);
+  const bool is_npu_backend = IsNpuBackend(qnn_model_wrapper.GetQnnBackendType());
   std::string qnn_op_type = "Resize";
 
-  // Handle Resize with {mode: "nearest", coordinate_transformation_mode: "asymmetric"} uniquely.
-  // QNN's ONNX converter tool translates this configuration (regardless of rounding mode)
-  // to QNN's ResizeNearestNeighbor with {align_corners: 0, half_pixel: 0}.
-  //
-  // NOTE: This is only accurate if the rounding mode is "floor". Need to investigate how to handle
-  // other rounding modes with Qualcomm. Ideally, we would use QNN's Resize operator, but it doesn't support
-  // the "asymmetric" coordinate transformation mode on HTP.
-  if (interp_mode == "nearest" && transformation_mode == "asymmetric") {
+  // Translate Resize with {mode: "nearest", nearest_mode: "floor", coordinate_transformation_mode: XXX} to
+  // QNN's ResizeNearestNeighbor operator on the HTP backend. This combination of parameters is not supported on HTP
+  // via QNN's Resize operator. Note that QNN's ResizeNearestNeighbor operator always uses "floor" rounding.
+  if (is_npu_backend && interp_mode == "nearest" && nearest_mode == "floor") {
     qnn_op_type = "ResizeNearestNeighbor";
 
-    // Set parameter 'align_corners' to 0
+    // Parameter 'align_corners'
     Qnn_Scalar_t qnn_align_corners = QNN_SCALAR_INIT;
     qnn_align_corners.dataType = QNN_DATATYPE_BOOL_8;
-    qnn_align_corners.bool8Value = static_cast<uint8_t>(0);
+    qnn_align_corners.bool8Value = static_cast<uint8_t>(transformation_mode == "align_corners");
     QnnParamWrapper qnn_align_corners_param(node_unit.Index(), node_unit.Name(),
                                             QNN_OP_RESIZE_BILINEAR_PARAM_ALIGN_CORNERS, qnn_align_corners);
     param_tensor_names.push_back(qnn_align_corners_param.GetParamTensorName());
     qnn_model_wrapper.AddParamWrapper(std::move(qnn_align_corners_param));
 
-    // Set parameter 'half_pixel_centers' to 0
+    // Parameter 'half_pixel_centers'
     Qnn_Scalar_t qnn_half_pixel = QNN_SCALAR_INIT;
     qnn_half_pixel.dataType = QNN_DATATYPE_BOOL_8;
-    qnn_half_pixel.bool8Value = static_cast<uint8_t>(0);
+    qnn_half_pixel.bool8Value = static_cast<uint8_t>(transformation_mode == "half_pixel");
     QnnParamWrapper qnn_half_pixel_param(node_unit.Index(), node_unit.Name(),
                                          QNN_OP_RESIZE_BILINEAR_PARAM_HALF_PIXEL_CENTERS, qnn_half_pixel);
     param_tensor_names.push_back(qnn_half_pixel_param.GetParamTensorName());
@@ -399,11 +264,12 @@ Status ResizeOpBuilder::ProcessQDQOpAttrsAndOutputs(QnnModelWrapper& qnn_model_w
     // Parameter 'transformation_mode'
     Qnn_Scalar_t qnn_transformation_mode = QNN_SCALAR_INIT;
     qnn_transformation_mode.dataType = QNN_DATATYPE_UINT_32;
-    ORT_RETURN_IF_ERROR(GetQnnModeFromString(supported_coord_transf_modes, transformation_mode,
-                                             "coordinate_transformation_mode", qnn_transformation_mode.uint32Value));
+    ORT_RETURN_IF_ERROR(GetQnnModeValFromOnnxString(supported_coord_transf_modes, transformation_mode,
+                                                    "coordinate_transformation_mode",
+                                                    qnn_transformation_mode.uint32Value));
 
-    QnnParamWrapper qnn_transformation_mode_param(node_unit.Index(), node_unit.Name(), QNN_OP_RESIZE_PARAM_TRANSFORMATION_MODE,
-                                                  qnn_transformation_mode);
+    QnnParamWrapper qnn_transformation_mode_param(node_unit.Index(), node_unit.Name(),
+                                                  QNN_OP_RESIZE_PARAM_TRANSFORMATION_MODE, qnn_transformation_mode);
     param_tensor_names.push_back(qnn_transformation_mode_param.GetParamTensorName());
     qnn_model_wrapper.AddParamWrapper(std::move(qnn_transformation_mode_param));
 
@@ -420,7 +286,7 @@ Status ResizeOpBuilder::ProcessQDQOpAttrsAndOutputs(QnnModelWrapper& qnn_model_w
     // Parameter 'interpolation_mode'
     Qnn_Scalar_t qnn_interp_mode = QNN_SCALAR_INIT;
     qnn_interp_mode.dataType = QNN_DATATYPE_UINT_32;
-    ORT_RETURN_IF_ERROR(GetQnnModeFromString(supported_modes, interp_mode, "mode", qnn_interp_mode.uint32Value));
+    ORT_RETURN_IF_ERROR(GetQnnModeValFromOnnxString(supported_modes, interp_mode, "mode", qnn_interp_mode.uint32Value));
 
     QnnParamWrapper qnn_interp_mode_param(node_unit.Index(), node_unit.Name(), QNN_OP_RESIZE_PARAM_INTERPOLATION_MODE,
                                           qnn_interp_mode);
@@ -429,11 +295,10 @@ Status ResizeOpBuilder::ProcessQDQOpAttrsAndOutputs(QnnModelWrapper& qnn_model_w
 
     // Parameter 'nearest_mode'. Processed only when 'interpolation_mode' is NEAREST(0).
     if (qnn_interp_mode.uint32Value == 0) {
-      const std::string nearest_mode = GetOnnxAttr(node_helper, onnx_nearest_mode_attr);
       Qnn_Scalar_t qnn_nearest_mode = QNN_SCALAR_INIT;
       qnn_nearest_mode.dataType = QNN_DATATYPE_UINT_32;
-      ORT_RETURN_IF_ERROR(GetQnnModeFromString(supported_nearest_modes, nearest_mode, "nearest_mode",
-                                               qnn_nearest_mode.uint32Value));
+      ORT_RETURN_IF_ERROR(GetQnnModeValFromOnnxString(supported_nearest_modes, nearest_mode, "nearest_mode",
+                                                      qnn_nearest_mode.uint32Value));
 
       QnnParamWrapper qnn_nearest_mode_param(node_unit.Index(), node_unit.Name(), QNN_OP_RESIZE_PARAM_NEAREST_MODE,
                                              qnn_nearest_mode);
diff --git a/onnxruntime/test/providers/cpu/tensor/resize_op_test.cc b/onnxruntime/test/providers/cpu/tensor/resize_op_test.cc
index 832a8a744c08b..0434b16dc66ce 100644
--- a/onnxruntime/test/providers/cpu/tensor/resize_op_test.cc
+++ b/onnxruntime/test/providers/cpu/tensor/resize_op_test.cc
@@ -99,9 +99,8 @@ TEST(ResizeOpTest, NhwcResizeOpLinearDownSampleTest_tf_crop_and_resize_with_extr
   // CUDA: result mismatch due to not implementing NHWC support
   // TensorRT: results mismatch
   // ROCm: results mismatch
-  // QNN: conflict with layout transformer, need furture investigation
   test.Run(OpTester::ExpectResult::kExpectSuccess, "",
-           {kCudaExecutionProvider, kTensorrtExecutionProvider, kRocmExecutionProvider, kQnnExecutionProvider});
+           {kCudaExecutionProvider, kTensorrtExecutionProvider, kRocmExecutionProvider});
 }
 
 TEST(ResizeOpTest, NhwcResizeOpLinearDownSampleTest_tf_crop_and_resize_with_extrapolation_uint8) {
@@ -131,7 +130,7 @@ TEST(ResizeOpTest, NhwcResizeOpLinearDownSampleTest_tf_crop_and_resize_with_extr
   test.AddOutput<uint8_t>("Y", {N, static_cast<int64_t>(H * scales[1]), static_cast<int64_t>(W * scales[2]), C}, Y);
   // CUDA: result mismatch due to not implementing NHWC support
   // ROCm: results mismatch
-  test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kCudaExecutionProvider, kRocmExecutionProvider, kQnnExecutionProvider});
+  test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kCudaExecutionProvider, kRocmExecutionProvider});
 }
 
 TEST(ResizeOpTest, NhwcResizeOpLinearDownSampleTest_tf_crop_and_resize_with_extrapolation_int8) {
@@ -159,7 +158,7 @@ TEST(ResizeOpTest, NhwcResizeOpLinearDownSampleTest_tf_crop_and_resize_with_extr
                            10, 10, 10};
 
   test.AddOutput<int8_t>("Y", {N, static_cast<int64_t>(H * scales[1]), static_cast<int64_t>(W * scales[2]), C}, Y);
-  test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kQnnExecutionProvider});
+  test.Run();
 }
 
 TEST(ResizeOpTest, NhwcResizeOpLinearDownSampleTest_tf_crop_and_resize_without_extrapolation_uint8) {
@@ -188,7 +187,7 @@ TEST(ResizeOpTest, NhwcResizeOpLinearDownSampleTest_tf_crop_and_resize_without_e
   test.AddOutput<uint8_t>("Y", {N, static_cast<int64_t>(H * scales[1]), static_cast<int64_t>(W * scales[2]), C}, Y);
   // CUDA: result mismatch due to not implementing NHWC support
   // ROCm: results mismatch
-  test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kCudaExecutionProvider, kRocmExecutionProvider, kQnnExecutionProvider});
+  test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kCudaExecutionProvider, kRocmExecutionProvider});
 }
 
 TEST(ResizeOpTest, NhwcResizeOpLinearDownSampleTest_tf_crop_and_resize_without_extrapolation_int8) {
@@ -215,7 +214,7 @@ TEST(ResizeOpTest, NhwcResizeOpLinearDownSampleTest_tf_crop_and_resize_without_e
                            0, 0, 0};
 
   test.AddOutput<int8_t>("Y", {N, static_cast<int64_t>(H * scales[1]), static_cast<int64_t>(W * scales[2]), C}, Y);
-  test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kQnnExecutionProvider});
+  test.Run();
 }
 
 TEST(ResizeOpTest, ResizeOpLinearDownSampleTest_4DBilinear) {
@@ -261,9 +260,8 @@ TEST(ResizeOpTest, NhwcResizeOpLinearDownSampleTest_4DBilinear) {
   test.AddOutput<float>("Y", {N, static_cast<int64_t>(H * scales[1]), static_cast<int64_t>(W * scales[2]), C}, Y);
   // CUDA: result mismatch due to not implementing NHWC support
   // ROCm: results mismatch
-  // QNN: conflict with layout transformer, need furture investigation
   test.Run(OpTester::ExpectResult::kExpectSuccess, "",
-           {kCudaExecutionProvider, kRocmExecutionProvider, kQnnExecutionProvider});
+           {kCudaExecutionProvider, kRocmExecutionProvider});
 }
 
 TEST(ResizeOpTest, NhwcResizeOpLinearDownSampleTest_4DBilinear_uint8) {
@@ -287,7 +285,7 @@ TEST(ResizeOpTest, NhwcResizeOpLinearDownSampleTest_4DBilinear_uint8) {
   test.AddOutput<uint8_t>("Y", {N, static_cast<int64_t>(H * scales[1]), static_cast<int64_t>(W * scales[2]), C}, Y);
   // CUDA: result mismatch due to not implementing NHWC support
   // ROCm: results mismatch
-  test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kCudaExecutionProvider, kRocmExecutionProvider, kQnnExecutionProvider});
+  test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kCudaExecutionProvider, kRocmExecutionProvider});
 }
 
 TEST(ResizeOpTest, NhwcResizeOpLinearDownSampleTest_4DBilinear_int8) {
@@ -309,7 +307,7 @@ TEST(ResizeOpTest, NhwcResizeOpLinearDownSampleTest_4DBilinear_int8) {
   std::vector<int8_t> Y = {0, 0};
 
   test.AddOutput<int8_t>("Y", {N, static_cast<int64_t>(H * scales[1]), static_cast<int64_t>(W * scales[2]), C}, Y);
-  test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kQnnExecutionProvider});
+  test.Run();
 }
 
 // Since NNAPI(TFLite) only using the scale calculate using the input/output size
@@ -399,7 +397,9 @@ TEST(ResizeOpTest, ResizeOpLinearDownSampleTest_4DBilinear_align_corners) {
     std::vector<float> Y = {1.0f, 4.0f};
 
     test.AddOutput<float>("Y", {N, C, static_cast<int64_t>(H * scales[2]), static_cast<int64_t>(W * scales[3])}, Y);
-    test.Run();
+
+    // QNN: result mismatch ("NaN" instead of 1.0f on QNN CPU backend)
+    test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kQnnExecutionProvider});
   };
 
   run_test(false);
@@ -435,7 +435,7 @@ TEST(ResizeOpTest, NhwcResizeOpLinearDownSampleTest_4DBilinear_align_corners_uin
     test.AddOutput<uint8_t>("Y", {N, static_cast<int64_t>(H * scales[1]), static_cast<int64_t>(W * scales[2]), C}, Y);
     // CUDA: result mismatch due to not implementing NHWC support
     // ROCm: results mismatch
-    test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kCudaExecutionProvider, kRocmExecutionProvider, kQnnExecutionProvider});
+    test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kCudaExecutionProvider, kRocmExecutionProvider});
   };
 
   run_test(false);
@@ -465,7 +465,7 @@ TEST(ResizeOpTest, NhwcResizeOpLinearDownSampleTest_4DBilinear_align_corners_int
 
     test.AddOutput<int8_t>("Y", {N, static_cast<int64_t>(H * scales[1]), static_cast<int64_t>(W * scales[2]), C}, Y);
     // TensorRT: results mismatch
-    test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider, kQnnExecutionProvider});
+    test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider});
   };
 
   run_test(false);
@@ -532,7 +532,7 @@ TEST(ResizeOpTest, NhwcResizeOpLinearDownSampleTest_4DBilinear_pytorch_half_pixe
   test.AddOutput<uint8_t>("Y", {N, sizes[1], sizes[2], C}, Y);
   // CUDA: result mismatch due to not implementing NHWC support
   // ROCm: results mismatch
-  test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kCudaExecutionProvider, kRocmExecutionProvider, kQnnExecutionProvider});
+  test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kCudaExecutionProvider, kRocmExecutionProvider});
 }
 
 TEST(ResizeOpTest, NhwcResizeOpLinearDownSampleTest_4DBilinear_pytorch_half_pixel_int8) {
@@ -560,7 +560,7 @@ TEST(ResizeOpTest, NhwcResizeOpLinearDownSampleTest_4DBilinear_pytorch_half_pixe
   std::vector<int8_t> Y = {0, 2, -9};
 
   test.AddOutput<int8_t>("Y", {N, sizes[1], sizes[2], C}, Y);
-  test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider, kQnnExecutionProvider});  // TensorRT: results mismatch
+  test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider});  // TensorRT: results mismatch
 }
 
 TEST(ResizeOpTest, ResizeOpLinearUpSampleTest_4DBilinear_asymmetric) {
@@ -641,7 +641,7 @@ TEST(ResizeOpTest, NhwcResizeOpLinearUpSampleTest_4DBilinear_asymmetric_uint8) {
                             Y, false, .0f, 1.0f);
     // CUDA: result mismatch due to not implementing NHWC support
     // ROCm: results mismatch
-    test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kCudaExecutionProvider, kRocmExecutionProvider, kQnnExecutionProvider});
+    test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kCudaExecutionProvider, kRocmExecutionProvider});
   };
 
   run_test(false);
@@ -683,7 +683,7 @@ TEST(ResizeOpTest, NhwcResizeOpLinearUpSampleTest_4DBilinear_asymmetric_int8) {
     test.AddOutput<int8_t>("Y", {N, static_cast<int64_t>(H * scales[1]), static_cast<int64_t>(W * scales[2]), C},
                            Y, false, .0f, 1.0f);
     // TensorRT: results mismatch
-    test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider, kQnnExecutionProvider});
+    test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider});
   };
 
   run_test(false);
@@ -1079,7 +1079,7 @@ TEST(ResizeOpTest, ResizeOpNearestUpSample_Floor_Align_Corners) {
                           13.0f, 13.0f, 13.0f, 14.0f, 14.0f, 15.0f, 15.0f, 16.0f};
 
   test.AddOutput<float>("Y", {N, C, static_cast<int64_t>(H * scales[2]), static_cast<int64_t>(W * scales[3])}, Y);
-  test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kQnnExecutionProvider});  // QNN: result diff
+  test.Run();
 }
 
 TEST(ResizeOpTest, ResizeOpNearest_OneToOneMappingBetweenInputAndOutputDataDims) {
@@ -1887,7 +1887,7 @@ void TestAntialiasing(std::map<std::string, std::string> attributes,
 
   test.AddOutput<T>("Y", output_shape, output_data);
   // TensorRT 8.5 supports operators up to Opset 17. Temporarily exclude TensorRT EP due to accurarcy issue.
-  test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider, kQnnExecutionProvider});
+  test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider});
 }
 
 TEST(ResizeOpTest, Antialias_Bilinear_No_ExcludeOutside) {
diff --git a/onnxruntime/test/providers/qnn/resize_test.cc b/onnxruntime/test/providers/qnn/resize_test.cc
index cf336ca9eeb8b..cd6865d443cc0 100644
--- a/onnxruntime/test/providers/qnn/resize_test.cc
+++ b/onnxruntime/test/providers/qnn/resize_test.cc
@@ -120,7 +120,7 @@ static void RunCPUResizeOpTest(const TestInputDef<float>& input_def, const std::
                                const std::string& mode, const std::string& coordinate_transformation_mode,
                                const std::string& nearest_mode,
                                ExpectedEPNodeAssignment expected_ep_assignment,
-                               int opset = 11) {
+                               int opset = 19) {
   ProviderOptions provider_options;
 #if defined(_WIN32)
   provider_options["backend_path"] = "QnnCpu.dll";
@@ -138,7 +138,7 @@ static void RunCPUResizeOpTestWithScales(const TestInputDef<float>& input_def, c
                                          const std::string& mode, const std::string& coordinate_transformation_mode,
                                          const std::string& nearest_mode,
                                          ExpectedEPNodeAssignment expected_ep_assignment,
-                                         int opset = 11) {
+                                         int opset = 19) {
   ProviderOptions provider_options;
 #if defined(_WIN32)
   provider_options["backend_path"] = "QnnCpu.dll";
@@ -157,7 +157,8 @@ static void RunQDQResizeOpTest(const TestInputDef<float>& input_def,
                                const std::vector<int64_t>& sizes_data,
                                const std::string& mode, const std::string& coordinate_transformation_mode,
                                const std::string& nearest_mode,
-                               ExpectedEPNodeAssignment expected_ep_assignment) {
+                               ExpectedEPNodeAssignment expected_ep_assignment,
+                               int opset = 19) {
   ProviderOptions provider_options;
 #if defined(_WIN32)
   provider_options["backend_path"] = "QnnHtp.dll";
@@ -169,27 +170,20 @@ static void RunQDQResizeOpTest(const TestInputDef<float>& input_def,
                        GetQDQResizeModelBuilder<QuantType>(input_def, sizes_data, mode, coordinate_transformation_mode,
                                                            nearest_mode),
                        provider_options,
-                       18,  // opset
-                       expected_ep_assignment,
-                       1e-5f);
+                       opset,
+                       expected_ep_assignment);
 }
 
 //
 // CPU tests:
 //
 
-// TODO: Our QNN CPU translation of ONNX Resize with "nearest" mode uses QNN's ResizeNearestNeighbor
-// operator, which does not have a way to specify rounding (i.e., "nearest_mode" in ONNX). It is not clear
-// what kind of rounding QNN's ResizeNearestNeighbor uses. Therefore, we do not yet know how to compare
-// ONNX Resize to QNN ResizeNearestNeighbor. These tests should remain disabled until this behavior is
-// clarified. If, for example, it turns out that ResizeNearestNeighbor uses "floor" rounding, then we should
-// only compare against ONNX resize with "floor" rounding.
-
 // Upsample that uses "round_prefer_floor" as the "nearest_mode".
 // coordinate_transformation_mode: "half_pixel"
-TEST_F(QnnCPUBackendTests, DISABLED_ResizeUpsampleNearestHalfPixel_rpf) {
-  RunCPUResizeOpTest(TestInputDef<float>({1, 2, 7, 5}, false, -10.0f, 10.0f),  // Random input w/ range [-10, 10]
-                     {1, 2, 21, 10},                                           // Sizes
+TEST_F(QnnCPUBackendTests, ResizeUpsampleNearestHalfPixel_rpf) {
+  std::vector<float> input_data = GetFloatDataInRange(-10.0f, 10.0f, 70);
+  RunCPUResizeOpTest(TestInputDef<float>({1, 2, 7, 5}, false, input_data),
+                     {1, 2, 21, 10},  // Sizes
                      "nearest",
                      "half_pixel",
                      "round_prefer_floor",
@@ -198,57 +192,72 @@ TEST_F(QnnCPUBackendTests, DISABLED_ResizeUpsampleNearestHalfPixel_rpf) {
 
 // Upsample that uses "round_prefer_ceil" as the "nearest_mode".
 // coordinate_transformation_mode: "half_pixel"
-TEST_F(QnnCPUBackendTests, DISABLED_ResizeUpsampleNearestHalfPixel_rpc) {
-  RunCPUResizeOpTest(TestInputDef<float>({1, 1, 2, 4}, false, -10.0f, 10.0f),
+TEST_F(QnnCPUBackendTests, ResizeUpsampleNearestHalfPixel_rpc) {
+  std::vector<float> input_data = GetFloatDataInRange(-10.0f, 10.0f, 8);
+  RunCPUResizeOpTest(TestInputDef<float>({1, 1, 2, 4}, false, input_data),
                      {1, 1, 7, 5}, "nearest", "half_pixel", "round_prefer_ceil",
                      ExpectedEPNodeAssignment::All);
 }
 
 // Downsample that uses "round_prefer_ceil" as the "nearest_mode".
 // coordinate_transformation_mode: "half_pixel"
-TEST_F(QnnCPUBackendTests, DISABLED_ResizeDownsampleNearestHalfPixel_rpc) {
-  RunCPUResizeOpTest(TestInputDef<float>({1, 1, 2, 4}, false, -10.0f, 10.0f),
+TEST_F(QnnCPUBackendTests, ResizeDownsampleNearestHalfPixel_rpc) {
+  std::vector<float> input_data = GetFloatDataInRange(-10.0f, 10.0f, 8);
+  RunCPUResizeOpTest(TestInputDef<float>({1, 1, 2, 4}, false, input_data),
                      {1, 1, 1, 3}, "nearest", "half_pixel", "round_prefer_ceil",
                      ExpectedEPNodeAssignment::All);
 }
 
 // Downsample that uses "round_prefer_floor" as the "nearest_mode".
 // coordinate_transformation_mode: "half_pixel"
-TEST_F(QnnCPUBackendTests, DISABLED_ResizeDownsampleNearestHalfPixel_rpf) {
-  RunCPUResizeOpTest(TestInputDef<float>({1, 1, 2, 4}, false, -10.0f, 10.0f),
+TEST_F(QnnCPUBackendTests, ResizeDownsampleNearestHalfPixel_rpf) {
+  std::vector<float> input_data = GetFloatDataInRange(-10.0f, 10.0f, 8);
+  RunCPUResizeOpTest(TestInputDef<float>({1, 1, 2, 4}, false, input_data),
                      {1, 1, 1, 2}, "nearest", "half_pixel", "round_prefer_ceil",
                      ExpectedEPNodeAssignment::All);
 }
 
 // Upsample that uses "round_prefer_floor" as the "nearest_mode".
 // coordinate_transformation_mode: "align_corners"
-// QNN v2.13: index #50 don't match, which is 4.67152 from -1.93515
-TEST_F(QnnCPUBackendTests, DISABLED_ResizeUpsampleNearestAlignCorners_rpf) {
-  RunCPUResizeOpTest(TestInputDef<float>({1, 2, 7, 5}, false, -10.0f, 10.0f),
+TEST_F(QnnCPUBackendTests, ResizeUpsampleNearestAlignCorners_rpf) {
+  std::vector<float> input_data = GetFloatDataInRange(-10.0f, 10.0f, 70);
+  RunCPUResizeOpTest(TestInputDef<float>({1, 2, 7, 5}, false, input_data),
                      {1, 2, 21, 10}, "nearest", "align_corners", "round_prefer_floor",
                      ExpectedEPNodeAssignment::All);
 }
 
+// Upsample that uses "round_prefer_floor" as the "nearest_mode".
+// coordinate_transformation_mode: "asymmetric"
+TEST_F(QnnCPUBackendTests, ResizeUpsampleNearestAsymmetric_rpf) {
+  std::vector<float> input_data = GetFloatDataInRange(-10.0f, 10.0f, 70);
+  RunCPUResizeOpTest(TestInputDef<float>({1, 2, 7, 5}, false, input_data),
+                     {1, 2, 21, 10}, "nearest", "asymmetric", "round_prefer_floor",
+                     ExpectedEPNodeAssignment::All);
+}
+
 // Upsample that uses "round_prefer_ceil" as the "nearest_mode".
 // coordinate_transformation_mode: "align_corners"
-TEST_F(QnnCPUBackendTests, DISABLED_ResizeUpsampleNearestAlignCorners_rpc) {
-  RunCPUResizeOpTest(TestInputDef<float>({1, 1, 2, 4}, false, -10.0f, 10.0f),
+TEST_F(QnnCPUBackendTests, ResizeUpsampleNearestAlignCorners_rpc) {
+  std::vector<float> input_data = GetFloatDataInRange(-10.0f, 10.0f, 8);
+  RunCPUResizeOpTest(TestInputDef<float>({1, 1, 2, 4}, false, input_data),
                      {1, 1, 7, 5}, "nearest", "align_corners", "round_prefer_ceil",
                      ExpectedEPNodeAssignment::All);
 }
 
 // Downsample that uses "round_prefer_ceil" as the "nearest_mode".
 // coordinate_transformation_mode: "align_corners"
-TEST_F(QnnCPUBackendTests, DISABLED_ResizeDownsampleNearestAlignCorners_rpc) {
-  RunCPUResizeOpTest(TestInputDef<float>({1, 1, 2, 4}, false, -10.0f, 10.0f),
+TEST_F(QnnCPUBackendTests, ResizeDownsampleNearestAlignCorners_rpc) {
+  std::vector<float> input_data = GetFloatDataInRange(-10.0f, 10.0f, 8);
+  RunCPUResizeOpTest(TestInputDef<float>({1, 1, 2, 4}, false, input_data),
                      {1, 1, 1, 3}, "nearest", "align_corners", "round_prefer_ceil",
                      ExpectedEPNodeAssignment::All);
 }
 
 // Downsample that uses "round_prefer_floor" as the "nearest_mode".
 // coordinate_transformation_mode: "align_corners"
-TEST_F(QnnCPUBackendTests, DISABLED_ResizeDownsampleNearestAlignCorners_rpf) {
-  RunCPUResizeOpTest(TestInputDef<float>({1, 1, 2, 4}, false, -10.0f, 10.0f),
+TEST_F(QnnCPUBackendTests, ResizeDownsampleNearestAlignCorners_rpf) {
+  std::vector<float> input_data = GetFloatDataInRange(-10.0f, 10.0f, 8);
+  RunCPUResizeOpTest(TestInputDef<float>({1, 1, 2, 4}, false, input_data),
                      {1, 1, 1, 2}, "nearest", "align_corners", "round_prefer_floor",
                      ExpectedEPNodeAssignment::All);
 }
@@ -258,76 +267,177 @@ TEST_F(QnnCPUBackendTests, DISABLED_ResizeDownsampleNearestAlignCorners_rpf) {
 //
 
 TEST_F(QnnCPUBackendTests, Resize2xLinearHalfPixel) {
-  RunCPUResizeOpTest(TestInputDef<float>({1, 3, 4, 5}, false, -10.0f, 10.0f),
+  std::vector<float> input_data = GetFloatDataInRange(-10.0f, 10.0f, 60);
+  RunCPUResizeOpTest(TestInputDef<float>({1, 3, 4, 5}, false, input_data),
                      {1, 3, 8, 10}, "linear", "half_pixel", "",
                      ExpectedEPNodeAssignment::All);
 }
 
 TEST_F(QnnCPUBackendTests, Resize2xLinearHalfPixel_scales) {
-  RunCPUResizeOpTestWithScales(TestInputDef<float>({1, 3, 4, 5}, false, -10.0f, 10.0f),
+  std::vector<float> input_data = GetFloatDataInRange(-10.0f, 10.0f, 60);
+  RunCPUResizeOpTestWithScales(TestInputDef<float>({1, 3, 4, 5}, false, input_data),
                                {1.0f, 1.0f, 2.0f, 2.0f}, "linear", "half_pixel", "",
                                ExpectedEPNodeAssignment::All);
 }
 
 TEST_F(QnnCPUBackendTests, Resize2xLinearAlignCorners) {
-  RunCPUResizeOpTest(TestInputDef<float>({1, 3, 4, 5}, false, -10.0f, 10.0f),
+  std::vector<float> input_data = GetFloatDataInRange(-10.0f, 10.0f, 60);
+  RunCPUResizeOpTest(TestInputDef<float>({1, 3, 4, 5}, false, input_data),
                      {1, 3, 8, 10}, "linear", "align_corners", "",
                      ExpectedEPNodeAssignment::All);
 }
 
 TEST_F(QnnCPUBackendTests, Resize2xLinearAlignCorners_scales) {
-  RunCPUResizeOpTestWithScales(TestInputDef<float>({1, 3, 4, 5}, false, -10.0f, 10.0f),
+  std::vector<float> input_data = GetFloatDataInRange(-10.0f, 10.0f, 60);
+  RunCPUResizeOpTestWithScales(TestInputDef<float>({1, 3, 4, 5}, false, input_data),
                                {1.0f, 1.0f, 2.0f, 2.0f}, "linear", "align_corners", "",
                                ExpectedEPNodeAssignment::All);
 }
 
+// Test Resize downsample with mode: "linear", coordinate_transformation_mode: "align_corners"
+// TODO: Enable ResizeOpTest.ResizeOpLinearDownSampleTest_4DBilinear_align_corners in cpu resize_op tests when fixed.
+//
+// Input f32[1,1,2,4]: 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0
+// Expected output f32[1, 1, 1, 2]: 1.0, 4.0
+// Actual output f32[1, 1, 1, 2]: NaN, NaN
+TEST_F(QnnCPUBackendTests, DISABLED_Resize_DownSample_Linear_AlignCorners_scales) {
+  std::vector<float> input_data = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f};
+  RunCPUResizeOpTestWithScales(TestInputDef<float>({1, 1, 2, 4}, false, input_data),
+                               {1.0f, 1.0f, 0.6f, 0.6f}, "linear", "align_corners", "",
+                               ExpectedEPNodeAssignment::All);
+}
+
+// Test Resize downsample with mode: "linear", coordinate_transformation_mode: "half_pixel"
+// TODO: Enable ResizeOpTest.ResizeOpLinearDownSampleTest_4DBilinear in cpu resize_op tests when fixed.
+//
+// Input f32[1,1,2,4]: 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0
+// Expected output f32[1, 1, 1, 2]: 2.6666, 4.3333
+// Actual output f32[1, 1, 1, 2]: NaN, NaN
+TEST_F(QnnCPUBackendTests, DISABLED_Resize_DownSample_Linear_HalfPixel_scales) {
+  std::vector<float> input_data = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f};
+  RunCPUResizeOpTestWithScales(TestInputDef<float>({1, 1, 2, 4}, false, input_data),
+                               {1.0f, 1.0f, 0.6f, 0.6f}, "linear", "half_pixel", "",
+                               ExpectedEPNodeAssignment::All);
+}
+
 #if defined(__aarch64__) || defined(_M_ARM64) || defined(__linux__)
 //
 // HTP tests:
 //
 
+// Test QDQ Resize downsample with mode: "linear", coordinate_transformation_mode: "align_corners"
+TEST_F(QnnHTPBackendTests, Resize_DownSample_Linear_AlignCorners) {
+  std::vector<float> input_data = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f};
+  RunQDQResizeOpTest<uint8_t>(TestInputDef<float>({1, 1, 2, 4}, false, input_data),
+                              {1, 1, 1, 2}, "linear", "align_corners", "",
+                              ExpectedEPNodeAssignment::All);
+}
+
+// Test QDQ Resize downsample with mode: "linear", coordinate_transformation_mode: "half_pixel"
+TEST_F(QnnHTPBackendTests, Resize_DownSample_Linear_HalfPixel) {
+  std::vector<float> input_data = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f};
+  RunQDQResizeOpTest<uint8_t>(TestInputDef<float>({1, 1, 2, 4}, false, input_data),
+                              {1, 1, 1, 2}, "linear", "half_pixel", "",
+                              ExpectedEPNodeAssignment::All);
+}
+
+// Test 2x QDQ Resize mode: "linear", coordinate_transformation_mode: "pytorch_half_pixel"
+// QNN EP uses QNN's Resize op.
 TEST_F(QnnHTPBackendTests, ResizeU8_2xLinearPytorchHalfPixel) {
-  RunQDQResizeOpTest<uint8_t>(TestInputDef<float>({1, 3, 4, 4}, false, -10.0f, 10.0f),
+  std::vector<float> input_data = GetFloatDataInRange(-10.0f, 10.0f, 48);
+  RunQDQResizeOpTest<uint8_t>(TestInputDef<float>({1, 3, 4, 4}, false, input_data),
                               {1, 3, 8, 8}, "linear", "pytorch_half_pixel", "",
                               ExpectedEPNodeAssignment::All);
 }
 
-TEST_F(QnnHTPBackendTests, ResizeU8_2xNearestHalfPixelRoundPreferFloor) {
-  RunQDQResizeOpTest<uint8_t>(TestInputDef<float>({1, 3, 4, 4}, false, -10.0f, 10.0f),
-                              {1, 3, 8, 8}, "nearest", "half_pixel", "round_prefer_floor",
+// Test 2x QDQ Resize mode: "linear", coordinate_transformation_mode: "half_pixel"
+// QNN EP uses QNN's Resize op.
+TEST_F(QnnHTPBackendTests, ResizeU8_2xLinearHalfPixel) {
+  std::vector<float> input_data = GetFloatDataInRange(-10.0f, 10.0f, 48);
+  RunQDQResizeOpTest<uint8_t>(TestInputDef<float>({1, 3, 4, 4}, false, input_data),
+                              {1, 3, 8, 8}, "linear", "half_pixel", "",
                               ExpectedEPNodeAssignment::All);
 }
 
-TEST_F(QnnHTPBackendTests, ResizeU8_2xNearestAsymmetricFloor) {
-  RunQDQResizeOpTest<uint8_t>(TestInputDef<float>({1, 3, 4, 4}, false, -10.0f, 10.0f),
-                              {1, 3, 8, 8}, "nearest", "asymmetric", "floor",
+// Test 2x QDQ Resize mode: "linear", coordinate_transformation_mode: "align_corners"
+// QNN EP uses QNN's Resize op.
+TEST_F(QnnHTPBackendTests, ResizeU8_2xLinearAlignCorners) {
+  std::vector<float> input_data = GetFloatDataInRange(-10.0f, 10.0f, 48);
+  RunQDQResizeOpTest<uint8_t>(TestInputDef<float>({1, 3, 4, 4}, false, input_data),
+                              {1, 3, 8, 8}, "linear", "align_corners", "",
                               ExpectedEPNodeAssignment::All);
 }
 
-// TODO: Investigate with Qualcomm. The qnn-onnx-converter tool translates ONNX Resize [nearest, asymmetric, ceil] to
-// QNN ResizeNearestNeighbor {align_corners: 0, half_pixel: 0}, which is NOT equivalent. It would be better to use
-// QNN's own Resize operator (instead of ResizeNearestNeighbor), but it doesn't support the "asymmetric" coordinate
-// transform mode.
-//
-// QNN v2.13: Inaccuracy detected for output 'output', element 189.
-// Output quant params: scale=0.078431375324726105, zero_point=127.
-// Expected val: -2.663428783416748
-// QNN QDQ val: 7.4509806632995605 (err 10.114409446716309)
-// CPU QDQ val: -2.6666667461395264 (err 0.0032379627227783203)
-TEST_F(QnnHTPBackendTests, DISABLED_ResizeU8_2xNearestAsymmetricCeil) {
-  RunQDQResizeOpTest<uint8_t>(TestInputDef<float>({1, 3, 4, 4}, false, -10.0f, 10.0f),
-                              {1, 3, 8, 8}, "nearest", "asymmetric", "ceil",
+// Test 2x QDQ Resize mode: "linear", coordinate_transformation_mode: "asymmetric"
+// QNN EP uses QNN's Resize op.
+TEST_F(QnnHTPBackendTests, ResizeU8_2xLinearAsymmetric) {
+  std::vector<float> input_data = GetFloatDataInRange(-10.0f, 10.0f, 48);
+  RunQDQResizeOpTest<uint8_t>(TestInputDef<float>({1, 3, 4, 4}, false, input_data),
+                              {1, 3, 8, 8}, "linear", "asymmetric", "",
                               ExpectedEPNodeAssignment::All);
 }
 
+// Test 2x QDQ Resize mode: "nearest", coordinate_transformation_mode: "half_pixel", nearest_mode: "round_prefer_floor"
+// QNN EP uses QNN's Resize op.
+TEST_F(QnnHTPBackendTests, ResizeU8_2xNearestHalfPixelRoundPreferFloor) {
+  std::vector<float> input_data = GetFloatDataInRange(-10.0f, 10.0f, 48);
+  RunQDQResizeOpTest<uint8_t>(TestInputDef<float>({1, 3, 4, 4}, false, input_data),
+                              {1, 3, 8, 8}, "nearest", "half_pixel", "round_prefer_floor",
+                              ExpectedEPNodeAssignment::All);
+}
+
+// Test that the nearest_mode "ceil" is not supported on the HTP backend.
+TEST_F(QnnHTPBackendTests, ResizeU8_NearestModeCeil_Unsupported) {
+  std::vector<float> input_data = GetFloatDataInRange(-10.0f, 10.0f, 48);
+  RunQDQResizeOpTest<uint8_t>(TestInputDef<float>({1, 3, 4, 4}, false, input_data),
+                              {1, 3, 8, 8}, "nearest", "asymmetric", "ceil",
+                              ExpectedEPNodeAssignment::None);
+}
+
+// Test 3x QDQ Resize mode: "nearest", coordinate_transformation_mode: "asymmetric", nearest_mode: "floor".
+// QNN EP uses QNN's ResizeNearestNeighbor op.
 TEST_F(QnnHTPBackendTests, ResizeU8_3xNearestAsymmetricFloor) {
-  RunQDQResizeOpTest<uint8_t>(TestInputDef<float>({1, 3, 4, 4}, false, -10.0f, 10.0f),
+  std::vector<float> input_data = GetFloatDataInRange(-10.0f, 10.0f, 48);
+  RunQDQResizeOpTest<uint8_t>(TestInputDef<float>({1, 3, 4, 4}, false, input_data),
                               {1, 3, 12, 12}, "nearest", "asymmetric", "floor",
                               ExpectedEPNodeAssignment::All);
 }
 
+// Test 2x QDQ Resize mode: "nearest", coordinate_transformation_mode: "asymmetric", nearest_mode: "round_prefer_floor"
+// QNN EP uses QNN's Resize op.
+TEST_F(QnnHTPBackendTests, ResizeU8_2xNearestAsymmetricRoundPreferFloor) {
+  std::vector<float> input_data = GetFloatDataInRange(-10.0f, 10.0f, 8);
+  RunQDQResizeOpTest<uint8_t>(TestInputDef<float>({1, 2, 2, 2}, false, input_data),
+                              {1, 2, 4, 4}, "nearest", "asymmetric", "round_prefer_floor",
+                              ExpectedEPNodeAssignment::All);
+}
+
+// Test 3x QDQ Resize mode: "nearest", coordinate_transformation_mode: "asymmetric", nearest_mode: "round_prefer_floor"
+// QNN EP uses QNN's Resize op.
+//
+// TODO: Inaccuracy detected for output 'output_0', element 2.
+// Output quant params: scale=0.078431375324726105, zero_point=127.
+// Expected val: -3.3333334922790527
+// QNN QDQ val: -9.960784912109375 (err 6.6274514198303223)
+// CPU QDQ val: -3.2941176891326904 (err 0.039215803146362305)
+//
+// More debugging info:
+// Input elements f32[1,1,2,2] = -10.0000000 -3.33333349 3.33333302 10.0000000
+// ORT CPU EP (f32 model) outputs: -10.0000000 -10.0000000 -3.33333349 -3.33333349 -3.33333349 -3.33333349 -10.00 ...
+// ORT CPU EP (qdq model) outputs: -9.96078491 -9.96078491 -3.29411769 -3.29411769 -3.29411769 -3.29411769 -9.961 ...
+// ORT QNN EP (qdq model) outputs: -9.96078491 -9.96078491 -9.96078491 -3.37254906 -3.37254906 -3.37254906 -9.961 ...
+TEST_F(QnnHTPBackendTests, DISABLED_ResizeU8_3xNearestAsymmetricRoundPreferFloor) {
+  std::vector<float> input_data = GetFloatDataInRange(-10.0f, 10.0f, 4);
+  RunQDQResizeOpTest<uint8_t>(TestInputDef<float>({1, 1, 2, 2}, false, input_data),
+                              {1, 1, 6, 6}, "nearest", "asymmetric", "round_prefer_floor",
+                              ExpectedEPNodeAssignment::All);
+}
+
+// Test 0.5x QDQ Resize mode: "nearest", coordinate_transformation_mode: "asymmetric", nearest_mode: "floor"
+// QNN EP uses QNN's ResizeNearestNeighbor op.
 TEST_F(QnnHTPBackendTests, ResizeU8_HalfNearestAsymmetricFloor) {
-  RunQDQResizeOpTest<uint8_t>(TestInputDef<float>({1, 3, 4, 4}, false, -10.0f, 10.0f),
+  std::vector<float> input_data = GetFloatDataInRange(-10.0f, 10.0f, 48);
+  RunQDQResizeOpTest<uint8_t>(TestInputDef<float>({1, 3, 4, 4}, false, input_data),
                               {1, 3, 2, 2}, "nearest", "asymmetric", "floor",
                               ExpectedEPNodeAssignment::All);
 }

From ce287a4e77895e7f6147a044ae5c723a48cb8277 Mon Sep 17 00:00:00 2001
From: Wanming Lin <wanming.lin@intel.com>
Date: Sat, 23 Sep 2023 07:06:04 +0800
Subject: [PATCH 14/14] [WebNN EP] Remove workaround for dynamic shape (#17644)

Now that the FreeDimensionOverrides option is available to support dynamic
shapes, we can remove the previous workaround.
---
 onnxruntime/core/providers/webnn/builders/helper.cc      | 7 +++++--
 .../core/providers/webnn/builders/model_builder.cc       | 9 +++------
 2 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/onnxruntime/core/providers/webnn/builders/helper.cc b/onnxruntime/core/providers/webnn/builders/helper.cc
index 31453e005272e..774df067fe347 100644
--- a/onnxruntime/core/providers/webnn/builders/helper.cc
+++ b/onnxruntime/core/providers/webnn/builders/helper.cc
@@ -53,9 +53,12 @@ bool IsInputSupported(const NodeArg& input, const std::string& parent_name, cons
   }
 
   for (const auto& dim : shape_proto->dim()) {
-    // For now we workaround dynamic shape support by assuming 1.
+    // WebNN doesn't support dynamic shape - use sessionOptions.freeDimensionOverrides to fix the shape.
     if (!dim.has_dim_value()) {
-      LOGS(logger, VERBOSE) << "Dynamic shape is not supported for now, assume to be 1, for input:" << input_name;
+      LOGS(logger, VERBOSE) << "Dynamic shape is not supported, "
+                            << "use sessionOptions.FreeDimensionOverrides to set a fixed shape for input: "
+                            << input_name;
+      return false;
     }
   }
 
diff --git a/onnxruntime/core/providers/webnn/builders/model_builder.cc b/onnxruntime/core/providers/webnn/builders/model_builder.cc
index 14ca4f1a1e674..2eae8cebbbd66 100644
--- a/onnxruntime/core/providers/webnn/builders/model_builder.cc
+++ b/onnxruntime/core/providers/webnn/builders/model_builder.cc
@@ -218,12 +218,9 @@ Status ModelBuilder::RegisterModelInputOutput(const NodeArg& node_arg, bool is_i
     } else {
       dims.reserve(shape.size());
       for (const auto& dim : shape) {
-        if (!dim.has_dim_value()) {
-          // FIXME: support dyanmic shape.
-          dims.push_back(1);
-        } else {
-          dims.push_back(SafeInt<int32_t>(dim.dim_value()));
-        }
+        // dim_param free dimensions should have already been excluded by IsInputSupported().
+        assert(dim.has_dim_value());
+        dims.push_back(SafeInt<int32_t>(dim.dim_value()));
       }
     }
   }