diff --git a/.github/actions/rust-toolchain-setup/action.yml b/.github/actions/rust-toolchain-setup/action.yml
deleted file mode 100644
index bf73fede16c7f..0000000000000
--- a/.github/actions/rust-toolchain-setup/action.yml
+++ /dev/null
@@ -1,44 +0,0 @@
-# yaml-language-server: $schema=https://json.schemastore.org/github-action.json
-
-name: 'Rust toolchain setup'
-description: 'Common setup steps for GitHub workflows for Rust projects'
-
-runs:
- using: composite
- steps:
- - uses: dtolnay/rust-toolchain@1.71.0
- with:
- components: clippy, rustfmt
- - uses: extractions/setup-just@v1
- with:
- just-version: '1.15.0' # optional semver specification, otherwise latest
-
- ###
- ### Linux setup
- ###
- - name: rustup
- # We need to use the nightly rust tool change to enable registry-auth / to connect to ADO feeds.
- if: ${{ (runner.os == 'Linux') }}
- run: |
- rustup set profile minimal
- rustup install
- shell: bash
- # - name: Cargo login
- # if: ${{ (runner.os == 'Linux') }}
- # run: just cargo-login-ci
- # shell: bash
-
- ###
- ### Windows setup
- ###
- - name: rustup
- # We need to use the nightly rust tool change to enable registry-auth / to connect to ADO feeds.
- if: ${{ (runner.os == 'Windows') }}
- run: |
- rustup set profile minimal
- rustup install
- shell: pwsh
- # - name: Cargo login
- # if: ${{ (runner.os == 'Windows') }}
- # run: just cargo-login-ci-windows
- # shell: pwsh
diff --git a/.github/workflows/rust-ci.yml b/.github/workflows/rust-ci.yml
deleted file mode 100644
index 725c40c2ded53..0000000000000
--- a/.github/workflows/rust-ci.yml
+++ /dev/null
@@ -1,132 +0,0 @@
-name: Rust
-
-on: [pull_request]
-
-env:
- CARGO_TERM_COLOR: always
- RUST_LOG: onnxruntime=debug,onnxruntime-sys=debug
- RUST_BACKTRACE: 1
- MANIFEST_PATH: ${{ github.workspace }}/rust/Cargo.toml
-
-jobs:
- fmt:
- name: Rustfmt
- runs-on: ubuntu-latest
- steps:
- - uses: actions/checkout@v4
- - uses: ./.github/actions/rust-toolchain-setup
- - name: vendor onnxruntime source
- run: just vendor
- - name: fmt
- run: cargo fmt --all -- --check
-
- download:
- name: Download prebuilt ONNX Runtime archive from build.rs
- runs-on: ubuntu-latest
- env:
- ORT_RUST_STRATEGY: download
- steps:
- - uses: actions/checkout@v4
- - uses: ./.github/actions/rust-toolchain-setup
- - run: rustup target install x86_64-unknown-linux-gnu
- - run: rustup target install x86_64-apple-darwin
- - run: rustup target install i686-pc-windows-msvc
- - run: rustup target install x86_64-pc-windows-msvc
- # ******************************************************************
- - name: Download prebuilt archive (CPU, x86_64-unknown-linux-gnu)
- run: cargo build --target x86_64-unknown-linux-gnu --manifest-path ${{ env.MANIFEST_PATH }}
- - name: Verify prebuilt archive downloaded (CPU, x86_64-unknown-linux-gnu)
- run: ls -lh target/x86_64-unknown-linux-gnu/debug/build/onnxruntime-sys-*/out/onnxruntime-linux-x64-1.*.tgz
- # ******************************************************************
- - name: Download prebuilt archive (CPU, x86_64-apple-darwin)
- run: cargo build --target x86_64-apple-darwin --manifest-path ${{ env.MANIFEST_PATH }}
- - name: Verify prebuilt archive downloaded (CPU, x86_64-apple-darwin)
- run: ls -lh target/x86_64-apple-darwin/debug/build/onnxruntime-sys-*/out/onnxruntime-osx-x64-1.*.tgz
- # ******************************************************************
- - name: Download prebuilt archive (CPU, i686-pc-windows-msvc)
- run: cargo build --target i686-pc-windows-msvc --manifest-path ${{ env.MANIFEST_PATH }}
- - name: Verify prebuilt archive downloaded (CPU, i686-pc-windows-msvc)
- run: ls -lh target/i686-pc-windows-msvc/debug/build/onnxruntime-sys-*/out/onnxruntime-win-x86-1.*.zip
- # ******************************************************************
- - name: Download prebuilt archive (CPU, x86_64-pc-windows-msvc)
- run: cargo build --target x86_64-pc-windows-msvc --manifest-path ${{ env.MANIFEST_PATH }}
- - name: Verify prebuilt archive downloaded (CPU, x86_64-pc-windows-msvc)
- run: ls -lh target/x86_64-pc-windows-msvc/debug/build/onnxruntime-sys-*/out/onnxruntime-win-x64-1.*.zip
- # ******************************************************************
- - name: Download prebuilt archive (GPU, x86_64-unknown-linux-gnu)
- env:
- ORT_USE_CUDA: "yes"
- run: cargo build --target x86_64-unknown-linux-gnu --manifest-path ${{ env.MANIFEST_PATH }}
- - name: Verify prebuilt archive downloaded (GPU, x86_64-unknown-linux-gnu)
- run: ls -lh target/x86_64-unknown-linux-gnu/debug/build/onnxruntime-sys-*/out/onnxruntime-linux-x64-gpu-1.*.tgz
- # ******************************************************************
- - name: Download prebuilt archive (GPU, x86_64-pc-windows-msvc)
- env:
- ORT_USE_CUDA: "yes"
- run: cargo build --target x86_64-pc-windows-msvc --manifest-path ${{ env.MANIFEST_PATH }}
- - name: Verify prebuilt archive downloaded (GPU, x86_64-pc-windows-msvc)
- run: ls -lh target/x86_64-pc-windows-msvc/debug/build/onnxruntime-sys-*/out/onnxruntime-win-gpu-x64-1.*.zip
-
- test:
- name: Test Suite
- runs-on: ${{ matrix.os }}
- strategy:
- fail-fast: false
- matrix:
- target:
- [
- x86_64-unknown-linux-gnu,
- x86_64-apple-darwin,
- x86_64-pc-windows-msvc,
- i686-pc-windows-msvc,
- ]
- include:
- - target: x86_64-unknown-linux-gnu
- os: ubuntu-latest
- - target: x86_64-apple-darwin
- os: macos-latest
- - target: x86_64-pc-windows-msvc
- os: windows-latest
- - target: i686-pc-windows-msvc
- os: windows-latest
- env:
- CARGO_BUILD_TARGET: ${{ matrix.target }}
- steps:
- - uses: actions/checkout@v4
- - uses: ./.github/actions/rust-toolchain-setup
- - name: vendor onnxruntime source
- run: just vendor
- - run: rustup target install ${{ matrix.target }}
- - name: Install additional packages (macOS)
- if: contains(matrix.target, 'x86_64-apple-darwin')
- run: brew install libomp
- - name: Build (cargo build)
- run: cargo build --all --manifest-path ${{ env.MANIFEST_PATH }}
- - name: Build tests (cargo test)
- run: cargo test --no-run --manifest-path ${{ env.MANIFEST_PATH }}
- - name: Build onnxruntime with 'model-fetching' feature
- run: cargo build --manifest-path ${{ env.MANIFEST_PATH }} --features model-fetching
- - name: Test onnxruntime-sys
- run: cargo build --package onnxruntime-sys -- --test-threads=1 --nocapture
- - name: Test onnxruntime
- run: cargo test --manifest-path ${{ env.MANIFEST_PATH }} --features model-fetching -- --test-threads=1 --nocapture
-
- clippy:
- name: Clippy
- runs-on: ubuntu-latest
- steps:
- - uses: actions/checkout@v4
- - uses: ./.github/actions/rust-toolchain-setup
- - name: vendor onnxruntime source
- run: just vendor
- - run: clippy --all-features --manifest-path ${{ env.MANIFEST_PATH }} -- -D warnings
-
- package-sys:
- name: Package onnxruntime-sys
- runs-on: ubuntu-latest
- steps:
- - uses: actions/checkout@v4
- - uses: ./.github/actions/rust-toolchain-setup
- - name: vendor onnxruntime source
- run: just vendor
- - run: cargo package --allow-dirty --package onnxruntime-sys
diff --git a/.pipelines/OneBranch.Nuget-WindowsAI-Pipeline.Official.yml b/.pipelines/OneBranch.Nuget-WindowsAI-Pipeline.Official.yml
index b9de1b79e1d51..67f9d8b0ce392 100644
--- a/.pipelines/OneBranch.Nuget-WindowsAI-Pipeline.Official.yml
+++ b/.pipelines/OneBranch.Nuget-WindowsAI-Pipeline.Official.yml
@@ -53,10 +53,6 @@ extends:
BuildArch: x86
PythonPackageName: pythonx86
- - template: .pipelines/windowsai-steps.yml@self
- parameters:
- BuildArch: arm
-
- template: .pipelines/windowsai-steps.yml@self
parameters:
BuildArch: arm64
@@ -72,11 +68,6 @@ extends:
PythonPackageName: pythonx86
Runtime: static
- - template: .pipelines/windowsai-steps.yml@self
- parameters:
- BuildArch: arm
- Runtime: static
-
- template: .pipelines/windowsai-steps.yml@self
parameters:
BuildArch: arm64
@@ -94,11 +85,9 @@ extends:
dependsOn:
- Windows_Packaging_x64_dynamic
- Windows_Packaging_x86_dynamic
- - Windows_Packaging_arm_dynamic
- Windows_Packaging_arm64_dynamic
- Windows_Packaging_x64_static
- Windows_Packaging_x86_static
- - Windows_Packaging_arm_static
- Windows_Packaging_arm64_static
condition: succeeded()
steps:
@@ -120,12 +109,6 @@ extends:
artifactName: 'drop_Windows_Build_Windows_Packaging_arm64_dynamic'
targetPath: '$(Build.BinariesDirectory)/nuget-artifact-arm64'
- - task: DownloadPipelineArtifact@0
- displayName: 'Download Pipeline Artifact - NuGet DirectML arm'
- inputs:
- artifactName: 'drop_Windows_Build_Windows_Packaging_arm_dynamic'
- targetPath: '$(Build.BinariesDirectory)/nuget-artifact-arm'
-
- task: DownloadPipelineArtifact@0
displayName: 'Download Pipeline Artifact - NuGet DirectML x64 StaticRuntime'
inputs:
@@ -144,12 +127,6 @@ extends:
artifactName: 'drop_Windows_Build_Windows_Packaging_arm64_static'
targetPath: '$(Build.BinariesDirectory)/nuget-artifact-arm64-static-runtime'
- - task: DownloadPipelineArtifact@0
- displayName: 'Download Pipeline Artifact - NuGet DirectML arm StaticRuntime'
- inputs:
- artifactName: 'drop_Windows_Build_Windows_Packaging_arm_static'
- targetPath: '$(Build.BinariesDirectory)/nuget-artifact-arm-static-runtime'
-
- task: PowerShell@2
displayName: 'Bundle NuGet and other binaries'
inputs:
@@ -194,17 +171,7 @@ extends:
$arm64_static_runtime_nupkg_unzipped_directory = [System.IO.Path]::Combine($arm64_static_runtime_nupkg_unzipped_directory_root, 'binaries', [System.IO.Path]::GetFileNameWithoutExtension($arm64_static_runtime_nuget_package))
[System.IO.Compression.ZipFile]::ExtractToDirectory($arm64_static_runtime_nuget_package, $arm64_static_runtime_nupkg_unzipped_directory)
- $nupkgs = (Get-ChildItem ..\nuget-artifact-arm -Filter Microsoft.AI.MachineLearning*.nupkg -Recurse)
- $arm_nuget_package = $nupkgs[0].FullName
- $arm_nupkg_unzipped_directory_root = $nupkgs[0].Directory.FullName
- $arm_nupkg_unzipped_directory = [System.IO.Path]::Combine($arm_nupkg_unzipped_directory_root, 'binaries', [System.IO.Path]::GetFileNameWithoutExtension($arm_nuget_package))
- [System.IO.Compression.ZipFile]::ExtractToDirectory($arm_nuget_package, $arm_nupkg_unzipped_directory)
-
- $nupkgs = (Get-ChildItem ..\nuget-artifact-arm-static-runtime -Filter Microsoft.AI.MachineLearning*.nupkg -Recurse)
- $arm_static_runtime_nuget_package = $nupkgs[0].FullName
- $arm_static_runtime_nupkg_unzipped_directory_root = $nupkgs[0].Directory.FullName
- $arm_static_runtime_nupkg_unzipped_directory = [System.IO.Path]::Combine($arm_static_runtime_nupkg_unzipped_directory_root, 'binaries', [System.IO.Path]::GetFileNameWithoutExtension($arm_static_runtime_nuget_package))
- [System.IO.Compression.ZipFile]::ExtractToDirectory($arm_static_runtime_nuget_package, $arm_static_runtime_nupkg_unzipped_directory)
+
$x64_static_runtime_path_old = [System.IO.Path]::Combine($x64_static_runtime_nupkg_unzipped_directory, 'runtimes', 'win-x64', '_native')
$x64_static_runtime_path_new = [System.IO.Path]::Combine($x64_nupkg_unzipped_directory, 'runtimes', 'win-x64', '_native', 'static')
@@ -216,10 +183,7 @@ extends:
$arm64_runtime_path_new = [System.IO.Path]::Combine($x64_nupkg_unzipped_directory, 'runtimes', 'win-arm64', '_native')
$arm64_static_runtime_path_old = [System.IO.Path]::Combine($arm64_static_runtime_nupkg_unzipped_directory, 'runtimes', 'win-arm64', '_native')
$arm64_static_runtime_path_new = [System.IO.Path]::Combine($x64_nupkg_unzipped_directory, 'runtimes', 'win-arm64', '_native', 'static')
- $arm_runtime_path_old = [System.IO.Path]::Combine($arm_nupkg_unzipped_directory, 'runtimes', 'win-arm', '_native')
- $arm_runtime_path_new = [System.IO.Path]::Combine($x64_nupkg_unzipped_directory, 'runtimes', 'win-arm', '_native')
- $arm_static_runtime_path_old = [System.IO.Path]::Combine($arm_static_runtime_nupkg_unzipped_directory, 'runtimes', 'win-arm', '_native')
- $arm_static_runtime_path_new = [System.IO.Path]::Combine($x64_nupkg_unzipped_directory, 'runtimes', 'win-arm', '_native', 'static')
+
$uap_build_path_old = [System.IO.Path]::Combine($x64_static_runtime_nupkg_unzipped_directory, 'build', 'native')
$uap_build_path_new = [System.IO.Path]::Combine($x64_nupkg_unzipped_directory, 'build', 'uap10.0')
@@ -228,8 +192,6 @@ extends:
New-Item -Path $x86_static_runtime_path_new -ItemType Directory
New-Item -Path $arm64_runtime_path_new -ItemType Directory
New-Item -Path $arm64_static_runtime_path_new -ItemType Directory
- New-Item -Path $arm_runtime_path_new -ItemType Directory
- New-Item -Path $arm_static_runtime_path_new -ItemType Directory
Copy-Item ([System.IO.Path]::Combine($x86_runtime_path_old, 'onnxruntime.dll')) $x86_runtime_path_new
Copy-Item ([System.IO.Path]::Combine($x86_runtime_path_old, 'onnxruntime.lib')) $x86_runtime_path_new
@@ -241,11 +203,6 @@ extends:
Copy-Item ([System.IO.Path]::Combine($arm64_runtime_path_old, 'microsoft.ai.machinelearning.dll')) $arm64_runtime_path_new
Copy-Item ([System.IO.Path]::Combine($arm64_runtime_path_old, 'microsoft.ai.machinelearning.lib')) $arm64_runtime_path_new
- Copy-Item ([System.IO.Path]::Combine($arm_runtime_path_old, 'onnxruntime.dll')) $arm_runtime_path_new
- Copy-Item ([System.IO.Path]::Combine($arm_runtime_path_old, 'onnxruntime.lib')) $arm_runtime_path_new
- Copy-Item ([System.IO.Path]::Combine($arm_runtime_path_old, 'microsoft.ai.machinelearning.dll')) $arm_runtime_path_new
- Copy-Item ([System.IO.Path]::Combine($arm_runtime_path_old, 'microsoft.ai.machinelearning.lib')) $arm_runtime_path_new
-
Copy-Item ([System.IO.Path]::Combine($x64_static_runtime_path_old, 'onnxruntime.dll')) ([System.IO.Path]::Combine($x64_static_runtime_path_new, 'onnxruntime.dll'))
Copy-Item ([System.IO.Path]::Combine($x64_static_runtime_path_old, 'onnxruntime.lib')) ([System.IO.Path]::Combine($x64_static_runtime_path_new, 'onnxruntime.lib'))
Copy-Item ([System.IO.Path]::Combine($x64_static_runtime_path_old, 'microsoft.ai.machinelearning.dll')) ([System.IO.Path]::Combine($x64_static_runtime_path_new, 'microsoft.ai.machinelearning.dll'))
@@ -261,11 +218,6 @@ extends:
Copy-Item ([System.IO.Path]::Combine($arm64_static_runtime_path_old, 'microsoft.ai.machinelearning.dll')) ([System.IO.Path]::Combine($arm64_static_runtime_path_new, 'microsoft.ai.machinelearning.dll'))
Copy-Item ([System.IO.Path]::Combine($arm64_static_runtime_path_old, 'microsoft.ai.machinelearning.lib')) ([System.IO.Path]::Combine($arm64_static_runtime_path_new, 'microsoft.ai.machinelearning.lib'))
- Copy-Item ([System.IO.Path]::Combine($arm_static_runtime_path_old, 'onnxruntime.dll')) ([System.IO.Path]::Combine($arm_static_runtime_path_new, 'onnxruntime.dll'))
- Copy-Item ([System.IO.Path]::Combine($arm_static_runtime_path_old, 'onnxruntime.lib')) ([System.IO.Path]::Combine($arm_static_runtime_path_new, 'onnxruntime.lib'))
- Copy-Item ([System.IO.Path]::Combine($arm_static_runtime_path_old, 'microsoft.ai.machinelearning.dll')) ([System.IO.Path]::Combine($arm_static_runtime_path_new, 'microsoft.ai.machinelearning.dll'))
- Copy-Item ([System.IO.Path]::Combine($arm_static_runtime_path_old, 'microsoft.ai.machinelearning.lib')) ([System.IO.Path]::Combine($arm_static_runtime_path_new, 'microsoft.ai.machinelearning.lib'))
-
Copy-Item -Recurse $uap_build_path_old $uap_build_path_new
$merged_nuget_path = [System.IO.Path]::Combine($Env:BUILD_ARTIFACTSTAGINGDIRECTORY, 'merged')
@@ -304,22 +256,13 @@ extends:
$arm64_nupkg_unzipped_directory = [System.IO.Path]::Combine($arm64_nupkg_unzipped_directory_root, 'symbols', [System.IO.Path]::GetFileNameWithoutExtension($arm64_nuget_package))
[System.IO.Compression.ZipFile]::ExtractToDirectory($arm64_nuget_package, $arm64_nupkg_unzipped_directory)
- $nupkgs = (Get-ChildItem ..\nuget-artifact-arm -Filter Microsoft.AI.MachineLearning*.snupkg -Recurse)
- $arm_nuget_package = $nupkgs[0].FullName
- $arm_nupkg_unzipped_directory_root = $nupkgs[0].Directory.FullName
- $arm_nupkg_unzipped_directory = [System.IO.Path]::Combine($arm_nupkg_unzipped_directory_root, 'symbols', [System.IO.Path]::GetFileNameWithoutExtension($arm_nuget_package))
- [System.IO.Compression.ZipFile]::ExtractToDirectory($arm_nuget_package, $arm_nupkg_unzipped_directory)
-
$x86_runtime_path_old = [System.IO.Path]::Combine($x86_nupkg_unzipped_directory, 'runtimes', 'win-x86', '_native')
$x86_runtime_path_new = [System.IO.Path]::Combine($x64_nupkg_unzipped_directory, 'runtimes', 'win-x86', '_native')
$arm64_runtime_path_old = [System.IO.Path]::Combine($arm64_nupkg_unzipped_directory, 'runtimes', 'win-arm64', '_native')
$arm64_runtime_path_new = [System.IO.Path]::Combine($x64_nupkg_unzipped_directory, 'runtimes', 'win-arm64', '_native')
- $arm_runtime_path_old = [System.IO.Path]::Combine($arm_nupkg_unzipped_directory, 'runtimes', 'win-arm', '_native')
- $arm_runtime_path_new = [System.IO.Path]::Combine($x64_nupkg_unzipped_directory, 'runtimes', 'win-arm', '_native')
-
+
New-Item -Path $x86_runtime_path_new -ItemType Directory
New-Item -Path $arm64_runtime_path_new -ItemType Directory
- New-Item -Path $arm_runtime_path_new -ItemType Directory
Copy-Item ([System.IO.Path]::Combine($x86_runtime_path_old, 'onnxruntime.pdb')) $x86_runtime_path_new
Copy-Item ([System.IO.Path]::Combine($x86_runtime_path_old, 'microsoft.ai.machinelearning.pdb')) $x86_runtime_path_new
@@ -327,9 +270,6 @@ extends:
Copy-Item ([System.IO.Path]::Combine($arm64_runtime_path_old, 'onnxruntime.pdb')) $arm64_runtime_path_new
Copy-Item ([System.IO.Path]::Combine($arm64_runtime_path_old, 'microsoft.ai.machinelearning.pdb')) $arm64_runtime_path_new
- Copy-Item ([System.IO.Path]::Combine($arm_runtime_path_old, 'onnxruntime.pdb')) $arm_runtime_path_new
- Copy-Item ([System.IO.Path]::Combine($arm_runtime_path_old, 'microsoft.ai.machinelearning.pdb')) $arm_runtime_path_new
-
$merged_nuget_path = [System.IO.Path]::Combine($Env:BUILD_ARTIFACTSTAGINGDIRECTORY, 'merged')
if (!(Test-Path $merged_nuget_path)) {
New-Item -Path $merged_nuget_path -ItemType Directory
diff --git a/.vscode/settings.json b/.vscode/settings.json
index 2f2adc78f6de9..3e2b1f31dd6cf 100644
--- a/.vscode/settings.json
+++ b/.vscode/settings.json
@@ -11,7 +11,7 @@
// Auto sort imports
"editor.formatOnSave": true,
"editor.codeActionsOnSave": {
- "source.organizeImports": true
+ "source.organizeImports": "explicit"
},
"editor.defaultFormatter": "ms-python.black-formatter"
},
diff --git a/cmake/CMakeLists.txt b/cmake/CMakeLists.txt
index 34355fb0fd936..1567da90cacfc 100644
--- a/cmake/CMakeLists.txt
+++ b/cmake/CMakeLists.txt
@@ -131,6 +131,7 @@ option(onnxruntime_USE_ACL_1902 "Build with ACL version 1902 support" OFF)
option(onnxruntime_USE_ACL_1905 "Build with ACL version 1905 support" OFF)
option(onnxruntime_USE_ACL_1908 "Build with ACL version 1908 support" OFF)
option(onnxruntime_USE_ACL_2002 "Build with ACL version 2002 support" OFF)
+option(onnxruntime_USE_ACL_2308 "Build with ACL version 2308 support" OFF)
option(onnxruntime_USE_ARMNN "Build with ArmNN support" OFF)
option(onnxruntime_ARMNN_RELU_USE_CPU "Use the CPU implementation for the Relu operator for the ArmNN EP" ON)
option(onnxruntime_ARMNN_BN_USE_CPU "Use the CPU implementation for the Batch Normalization operator for the ArmNN EP" ON)
@@ -354,13 +355,7 @@ if (onnxruntime_USE_ROCM)
endif()
endif()
-if (APPLE)
- if (NOT CMAKE_OSX_ARCHITECTURES)
- message("Building ONNX Runtime for ${CMAKE_HOST_SYSTEM_PROCESSOR}")
- endif()
-elseif (NOT WIN32 AND NOT APPLE)
- message("Building ONNX Runtime for ${CMAKE_SYSTEM_PROCESSOR}")
-endif()
+
# Single output director for all binaries
set(RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin CACHE PATH "Single output directory for all binaries.")
@@ -493,6 +488,14 @@ endif()
include(adjust_global_compile_flags.cmake)
+if (APPLE)
+ if (NOT CMAKE_OSX_ARCHITECTURES)
+ message("Building ONNX Runtime for ${CMAKE_HOST_SYSTEM_PROCESSOR} CPU ARCH")
+ endif()
+elseif (NOT WIN32 AND NOT APPLE)
+ message("Building ONNX Runtime for ${onnxruntime_target_platform} CPU ARCH")
+endif()
+
# We need to link with libatomic on systems that do not have built-in atomics, or
# don't have built-in support for 8 byte atomics
# Derived from https://github.com/protocolbuffers/protobuf/blob/master/cmake/CMakeLists.txt
@@ -639,7 +642,16 @@ else()
check_cxx_compiler_flag(-Wunused-variable HAS_UNUSED_VARIABLE)
check_cxx_compiler_flag(-Wuseless-cast HAS_USELESS_CAST)
check_function_exists(reallocarray HAS_REALLOCARRAY)
-
+ if (NOT APPLE AND NOT CMAKE_SYSTEM_NAME STREQUAL "Emscripten" AND onnxruntime_target_platform STREQUAL "aarch64")
+ check_cxx_compiler_flag(-march=armv8.2-a+bf16 HAS_ARM64_BFLOAT16)
+ if(NOT HAS_ARM64_BFLOAT16)
+ message(FATAL_ERROR "The compiler doesn't support BFLOAT16!!!")
+ endif()
+ check_cxx_compiler_flag(-march=armv8.2-a+fp16 HAS_ARM64_FLOAT16)
+ if(NOT HAS_ARM64_FLOAT16)
+ message(FATAL_ERROR "The compiler doesn't support FLOAT16!!!")
+ endif()
+ endif()
if (HAS_TAUTOLOGICAL_POINTER_COMPARE)
#we may have extra null pointer checkings in debug build, it's not an issue
list(APPEND ORT_WARNING_FLAGS -Wno-tautological-pointer-compare)
@@ -1099,7 +1111,7 @@ function(onnxruntime_add_include_to_target dst_target)
endfunction()
# ACL
-if (onnxruntime_USE_ACL OR onnxruntime_USE_ACL_1902 OR onnxruntime_USE_ACL_1905 OR onnxruntime_USE_ACL_1908 OR onnxruntime_USE_ACL_2002)
+if (onnxruntime_USE_ACL OR onnxruntime_USE_ACL_1902 OR onnxruntime_USE_ACL_1905 OR onnxruntime_USE_ACL_1908 OR onnxruntime_USE_ACL_2002 OR onnxruntime_USE_ACL_2308)
set(onnxruntime_USE_ACL ON)
if (onnxruntime_USE_ACL_1902)
add_definitions(-DACL_1902=1)
@@ -1110,7 +1122,11 @@ if (onnxruntime_USE_ACL OR onnxruntime_USE_ACL_1902 OR onnxruntime_USE_ACL_1905
if (onnxruntime_USE_ACL_2002)
add_definitions(-DACL_2002=1)
else()
- add_definitions(-DACL_1905=1)
+ if (onnxruntime_USE_ACL_2308)
+ add_definitions(-DACL_2308=1)
+ else()
+ add_definitions(-DACL_1905=1)
+ endif()
endif()
endif()
endif()
diff --git a/cmake/adjust_global_compile_flags.cmake b/cmake/adjust_global_compile_flags.cmake
index e825bfeaea952..9f00c873715f4 100644
--- a/cmake/adjust_global_compile_flags.cmake
+++ b/cmake/adjust_global_compile_flags.cmake
@@ -300,6 +300,31 @@ if (MSVC)
endif()
else()
if (NOT APPLE)
+ #XXX: Sometimes the value of CMAKE_SYSTEM_PROCESSOR is set but it's wrong. For example, if you run an armv7 docker
+ #image on an aarch64 machine with an aarch64 Ubuntu host OS, in the docker instance cmake may still report
+ # CMAKE_SYSTEM_PROCESSOR as aarch64 by default. Given compiling this code may need more than 2GB memory, we do not
+ # support compiling for ARM32 natively(only support cross-compiling), we will ignore this issue for now.
+ if(NOT CMAKE_SYSTEM_PROCESSOR)
+ message(WARNING "CMAKE_SYSTEM_PROCESSOR is not set. Please set it in your toolchain cmake file.")
+ # Try to detect it
+ if("${CMAKE_C_COMPILER_ID}" STREQUAL "GNU" OR "${CMAKE_C_COMPILER_ID}" STREQUAL "Clang")
+ execute_process(
+ COMMAND "${CMAKE_C_COMPILER}" -dumpmachine
+ OUTPUT_VARIABLE GCC_DUMP_MACHINE_OUT OUTPUT_STRIP_TRAILING_WHITESPACE
+ ERROR_VARIABLE _err
+ RESULT_VARIABLE _res
+ )
+ if(NOT _res EQUAL 0)
+ message(SEND_ERROR "Failed to run 'gcc -dumpmachine':\n ${_res}")
+ endif()
+ string(REPLACE "-" ";" GCC_DUMP_MACHINE_OUT_LIST "${GCC_DUMP_MACHINE_OUT}")
+ list(LENGTH GCC_DUMP_MACHINE_OUT_LIST GCC_TRIPLET_LEN)
+ if(GCC_TRIPLET_LEN EQUAL 4)
+ list(GET GCC_DUMP_MACHINE_OUT_LIST 0 CMAKE_SYSTEM_PROCESSOR)
+ message("Setting CMAKE_SYSTEM_PROCESSOR to ${CMAKE_SYSTEM_PROCESSOR}")
+ endif()
+ endif()
+ endif()
set(onnxruntime_target_platform ${CMAKE_SYSTEM_PROCESSOR})
endif()
if (onnxruntime_BUILD_FOR_NATIVE_MACHINE)
diff --git a/cmake/onnxruntime_optimizer.cmake b/cmake/onnxruntime_optimizer.cmake
index 6f09583199ffd..f15d5b8dd6f80 100644
--- a/cmake/onnxruntime_optimizer.cmake
+++ b/cmake/onnxruntime_optimizer.cmake
@@ -130,3 +130,7 @@ if (NOT onnxruntime_BUILD_SHARED_LIB)
RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR}
FRAMEWORK DESTINATION ${CMAKE_INSTALL_BINDIR})
endif()
+
+if (onnxruntime_USE_ROCM)
+ add_dependencies(onnxruntime_optimizer generate_hipified_files)
+endif()
diff --git a/docs/OperatorKernels.md b/docs/OperatorKernels.md
index b026369e12c80..5e38789b65137 100644
--- a/docs/OperatorKernels.md
+++ b/docs/OperatorKernels.md
@@ -383,6 +383,7 @@ Do not modify directly.*
|Squeeze|*in* data:**T**
*in* axes:**tensor(int64)**
*out* squeezed:**T**
or
*in* data:**T**
*out* squeezed:**T**|13+|**T** = tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(string), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)|
|||[11, 12]|**T** = tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(string), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)|
|||[1, 10]|**T** = tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(string), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)|
+|StringConcat|*in* X:**T**
*in* Y:**T**
*out* Z:**T**|20+|**T** = tensor(string)|
|StringNormalizer|*in* X:**tensor(string)**
*out* Y:**tensor(string)**|10+|**X** = tensor(string)|
|Sub|*in* A:**T**
*in* B:**T**
*out* C:**T**|14+|**T** = tensor(double), tensor(float), tensor(int32), tensor(int64)|
|||13|**T** = tensor(double), tensor(float), tensor(int32), tensor(int64)|
diff --git a/include/onnxruntime/core/session/onnxruntime_c_api.h b/include/onnxruntime/core/session/onnxruntime_c_api.h
index 06fef6bf72cc9..8cd0d0051d1eb 100644
--- a/include/onnxruntime/core/session/onnxruntime_c_api.h
+++ b/include/onnxruntime/core/session/onnxruntime_c_api.h
@@ -4528,6 +4528,19 @@ struct OrtApi {
* \since Version 1.17.
*/
ORT_API2_STATUS(SetDeterministicCompute, _Inout_ OrtSessionOptions* options, bool value);
+
+ /**
+ * Run fn in parallel
+ *
+ * \param[in] context
+ * \param[in] fn Function accepting usr_data and an integer as iterator
+ * \param[in] total The number of times fn is to be invoked
+ * \param[in] num_batch Number of batches by which the "total" is to be divided in maximum. When zero, there is no limit
+ * \param[in] usr_data User data to be passed back to fn
+ *
+ * \since Version 1.17.
+ */
+ ORT_API2_STATUS(KernelContext_ParallelFor, _In_ const OrtKernelContext* context, _In_ void (*fn)(void*, size_t), _In_ size_t total, _In_ size_t num_batch, _In_ void* usr_data);
};
/*
diff --git a/include/onnxruntime/core/session/onnxruntime_cxx_api.h b/include/onnxruntime/core/session/onnxruntime_cxx_api.h
index 16d9451624533..3773a01cb65a8 100644
--- a/include/onnxruntime/core/session/onnxruntime_cxx_api.h
+++ b/include/onnxruntime/core/session/onnxruntime_cxx_api.h
@@ -2057,6 +2057,7 @@ struct KernelContext {
Logger GetLogger() const;
OrtAllocator* GetAllocator(const OrtMemoryInfo& memory_info) const;
OrtKernelContext* GetOrtKernelContext() const { return ctx_; }
+ void ParallelFor(void (*fn)(void*, size_t), size_t total, size_t num_batch, void* usr_data) const;
private:
OrtKernelContext* ctx_;
diff --git a/include/onnxruntime/core/session/onnxruntime_cxx_inline.h b/include/onnxruntime/core/session/onnxruntime_cxx_inline.h
index 63e55603736b6..db4619eeeae62 100644
--- a/include/onnxruntime/core/session/onnxruntime_cxx_inline.h
+++ b/include/onnxruntime/core/session/onnxruntime_cxx_inline.h
@@ -1658,6 +1658,10 @@ inline Logger KernelContext::GetLogger() const {
return Logger{out};
}
+inline void KernelContext::ParallelFor(void (*fn)(void*, size_t), size_t total, size_t num_batch, void* usr_data) const {
+ ThrowOnError(GetApi().KernelContext_ParallelFor(ctx_, fn, total, num_batch, usr_data));
+}
+
inline OpAttr::OpAttr(const char* name, const void* data, int len, OrtOpAttrType type) {
Ort::ThrowOnError(GetApi().CreateOpAttr(name, data, len, type, &p_));
}
diff --git a/js/node/package-lock.json b/js/node/package-lock.json
index c1cf8af4bb80e..542eebe746d59 100644
--- a/js/node/package-lock.json
+++ b/js/node/package-lock.json
@@ -336,9 +336,9 @@
"dev": true
},
"node_modules/follow-redirects": {
- "version": "1.15.2",
- "resolved": "https://registry.npmjs.org/follow-redirects/-/follow-redirects-1.15.2.tgz",
- "integrity": "sha512-VQLG33o04KaQ8uYi2tVNbdrWp1QWxNNea+nmIB4EVM28v0hmP17z7aG1+wAkNzVq4KeXTq3221ye5qTJP91JwA==",
+ "version": "1.15.4",
+ "resolved": "https://registry.npmjs.org/follow-redirects/-/follow-redirects-1.15.4.tgz",
+ "integrity": "sha512-Cr4D/5wlrb0z9dgERpUL3LrmPKVDsETIJhaCMeDfuFYcqa5bldGV6wBsAN6X/vxlXQtFBMrXdXxdL8CbDTGniw==",
"dev": true,
"funding": [
{
@@ -1242,9 +1242,9 @@
"dev": true
},
"follow-redirects": {
- "version": "1.15.2",
- "resolved": "https://registry.npmjs.org/follow-redirects/-/follow-redirects-1.15.2.tgz",
- "integrity": "sha512-VQLG33o04KaQ8uYi2tVNbdrWp1QWxNNea+nmIB4EVM28v0hmP17z7aG1+wAkNzVq4KeXTq3221ye5qTJP91JwA==",
+ "version": "1.15.4",
+ "resolved": "https://registry.npmjs.org/follow-redirects/-/follow-redirects-1.15.4.tgz",
+ "integrity": "sha512-Cr4D/5wlrb0z9dgERpUL3LrmPKVDsETIJhaCMeDfuFYcqa5bldGV6wBsAN6X/vxlXQtFBMrXdXxdL8CbDTGniw==",
"dev": true
},
"form-data": {
diff --git a/js/web/lib/build-def.d.ts b/js/web/lib/build-def.d.ts
index fb714bf5996f1..b3868871a4753 100644
--- a/js/web/lib/build-def.d.ts
+++ b/js/web/lib/build-def.d.ts
@@ -18,6 +18,10 @@ interface BuildDefinitions {
* defines whether to disable the whole WebGpu backend in the build.
*/
readonly DISABLE_WEBGPU: boolean;
+ /**
+ * defines whether to disable the whole WebNN backend in the build.
+ */
+ readonly DISABLE_WEBNN: boolean;
/**
* defines whether to disable the whole WebAssembly backend in the build.
*/
diff --git a/js/web/lib/index.ts b/js/web/lib/index.ts
index 499327741c82b..4f1a3943de69a 100644
--- a/js/web/lib/index.ts
+++ b/js/web/lib/index.ts
@@ -28,7 +28,9 @@ if (!BUILD_DEFS.DISABLE_WASM) {
registerBackend('wasm', wasmBackend, 10);
if (BUILD_DEFS.DISABLE_TRAINING) {
registerBackend('xnnpack', wasmBackend, 9);
- registerBackend('webnn', wasmBackend, 9);
+ if (!BUILD_DEFS.DISABLE_WEBNN) {
+ registerBackend('webnn', wasmBackend, 9);
+ }
}
}
diff --git a/js/web/lib/wasm/jsep/webgpu/op-resolve-rules.ts b/js/web/lib/wasm/jsep/webgpu/op-resolve-rules.ts
index 8e1ec782079be..90e02da986b8f 100644
--- a/js/web/lib/wasm/jsep/webgpu/op-resolve-rules.ts
+++ b/js/web/lib/wasm/jsep/webgpu/op-resolve-rules.ts
@@ -2,7 +2,7 @@
// Licensed under the MIT License.
import {argMax, argMin, parseArgMinMaxAttributes} from './ops/argminmax';
-import {attention, parseAttentionAttributes} from './ops/attention';
+import {attention} from './ops/attention';
import {batchNorm} from './ops/batch-norm';
import {biasAdd} from './ops/bias-add';
import {biasSplitGelu} from './ops/bias-split-gelu';
@@ -16,11 +16,11 @@ import {expand} from './ops/expand';
import {gather, parseGatherAttributes} from './ops/gather';
import {gatherElements, parseGatherElementsAttributes} from './ops/gather-elements';
import {gemm, parseGemmAttributes} from './ops/gemm';
-import {instanceNorm, parseInstanceNormAttributes} from './ops/instance-norm';
-import {layerNorm, parseLayerNormAttributes} from './ops/layer-norm';
+import {instanceNorm} from './ops/instance-norm';
+import {layerNorm} from './ops/layer-norm';
import {matMul} from './ops/matmul';
import {multiHeadAttention, parseMultiHeadAttentionAttributes} from './ops/multi-head-attentiion';
-import {pad, parsePadAttributes} from './ops/pad';
+import {pad} from './ops/pad';
import * as pool from './ops/pool';
import {range} from './ops/range';
import {reduceL1, reduceL2, reduceLogSum, reduceLogSumExp, reduceMax, reduceMean, reduceMin, reduceProd, reduceSum, reduceSumSquare} from './ops/reduce';
@@ -50,7 +50,7 @@ export const WEBGPU_OP_RESOLVE_RULES: Map = new
['Asinh', [unaryOps.asinh]],
['Atan', [unaryOps.atan]],
['Atanh', [unaryOps.atanh]],
- ['Attention', [attention, parseAttentionAttributes]],
+ ['Attention', [attention]],
// TODO: support new attributes for AveragePool-10
['AveragePool', [pool.averagePool, pool.parseAveragePoolAttributes]],
['BatchNormalization', [batchNorm]],
@@ -82,8 +82,8 @@ export const WEBGPU_OP_RESOLVE_RULES: Map = new
['GlobalMaxPool', [pool.globalMaxPool, pool.parseGlobalMaxPoolAttributes]],
['Greater', [binaryOps.greater]],
['GreaterOrEqual', [binaryOps.greaterOrEqual]],
- ['InstanceNormalization', [instanceNorm, parseInstanceNormAttributes]],
- ['LayerNormalization', [layerNorm, parseLayerNormAttributes]],
+ ['InstanceNormalization', [instanceNorm]],
+ ['LayerNormalization', [layerNorm]],
['LeakyRelu', [unaryOps.leakyRelu, unaryOps.parseAlphaAttributes]],
['Less', [binaryOps.less]],
['LessOrEqual', [binaryOps.lessOrEqual]],
@@ -95,7 +95,7 @@ export const WEBGPU_OP_RESOLVE_RULES: Map = new
['MultiHeadAttention', [multiHeadAttention, parseMultiHeadAttentionAttributes]],
['Neg', [unaryOps.neg]],
['Not', [unaryOps.not]],
- ['Pad', [pad, parsePadAttributes]],
+ ['Pad', [pad]],
['Pow', [binaryOps.pow]],
['Range', [range]],
['Reciprocal', [unaryOps.reciprocal]],
diff --git a/js/web/lib/wasm/jsep/webgpu/ops/attention.ts b/js/web/lib/wasm/jsep/webgpu/ops/attention.ts
index e1f2a47301bfb..ef8038dff487e 100644
--- a/js/web/lib/wasm/jsep/webgpu/ops/attention.ts
+++ b/js/web/lib/wasm/jsep/webgpu/ops/attention.ts
@@ -1,11 +1,11 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
+import {tensorDataTypeEnumToString} from '../../../wasm-common';
import {TensorView} from '../../tensor-view';
-import {createAttributeWithCacheKey} from '../attribute-with-cache-key';
-import {ComputeContext, GpuDataType} from '../types';
+import {ComputeContext, GpuDataType, ProgramUniform} from '../types';
-import {castToF32, fillVector, getMaxComponents, inputVariable, outputVariable, ShaderHelper, sumVector, tensorTypeToWsglStorageType} from './common';
+import {castToF32, fillVector, getMaxComponents, inputVariable, outputVariable, ShaderHelper, sumVector, tensorTypeToWsglStorageType, tensorTypeToWsglValueType, UniformDataElementType, UniformsArrayType} from './common';
export const enum AttentionQkvFormat {
unknown, // enum value not set, or depends on qkv projection implementation details
@@ -231,20 +231,8 @@ const validateAttentionInputs = (inputs: readonly TensorView[], attributes: Atte
};
};
-export const parseAttentionAttributes = (attributes: AttentionAttrs): AttentionAttrs =>
- createAttributeWithCacheKey({...attributes});
-
export const computeInPlaceSoftmax = (context: ComputeContext, input: TensorView, n: number, d: number) => {
const components = getMaxComponents(d);
- const inputHelper = outputVariable('x', input.dataType, input.dims, components);
-
- let threadMaxValue = 'threadMaxVector';
- if (components === 2) {
- threadMaxValue = 'max(threadMaxVector.x, threadMaxVector.y)';
- } else if (components === 4) {
- threadMaxValue = 'max(max(threadMaxVector.x, threadMaxVector.y), max(threadMaxVector.z, threadMaxVector.w))';
- }
- const dataType = tensorTypeToWsglStorageType(input.dataType);
let WG = 64;
const dComp = d / components;
if (dComp < WG) {
@@ -253,25 +241,41 @@ export const computeInPlaceSoftmax = (context: ComputeContext, input: TensorView
WG = Math.ceil(dComp / 8);
}
const elementsPerWG = Math.ceil(d / components / WG);
+ const tensorDataType = tensorDataTypeEnumToString(input.dataType) as ProgramUniform['type'];
+ const programUniforms: ProgramUniform[] =
+ [{type: tensorDataType, data: 1 / d}, {type: 'uint32', data: dComp}, {type: 'uint32', data: elementsPerWG}];
+ const dataType = tensorTypeToWsglStorageType(input.dataType, components);
+
+ const getShaderSource = (shaderHelper: ShaderHelper) => {
+ const inputHelper = outputVariable('x', input.dataType, input.dims, components);
+ let threadMaxValue = 'thread_max_vector';
+ if (components === 2) {
+ threadMaxValue = 'max(thread_max_vector.x, thread_max_vector.y)';
+ } else if (components === 4) {
+ threadMaxValue =
+ 'max(max(thread_max_vector.x, thread_max_vector.y), max(thread_max_vector.z, thread_max_vector.w))';
+ }
+ const elemValueType = tensorTypeToWsglValueType(input.dataType);
+ const uniforms: UniformsArrayType = [
+ {name: 'd_inv', type: elemValueType as UniformDataElementType}, {name: 'd_comp', type: 'u32'},
+ {name: 'elements_per_wg', type: 'u32'}
+ ];
- const getShaderSource = (shaderHelper: ShaderHelper) => `
- const dInv: ${dataType} = 1 / ${d};
- const dComp = ${d / components};
+ return `
var wgMax: array;
var wgSum: array;
-
- ${shaderHelper.declareVariables(inputHelper)}
- @compute @workgroup_size(${WG}, 1, 1)
- fn main(@builtin(workgroup_id) workgroup_id : vec3,
- @builtin(local_invocation_index) local_index : u32) {
- let localOffset = local_index * ${elementsPerWG};
- let offset: u32 = workgroup_id.x * dComp + localOffset;
-
- var threadMaxVector = ${fillVector('f32', components, '-3.402823e+38f')};
- for (var i: u32 = 0; i < ${elementsPerWG} && i + localOffset < dComp; i++) {
- threadMaxVector = max(${castToF32(dataType, components, 'x[offset + i]')}, threadMaxVector);
+ ${shaderHelper.registerUniforms(uniforms).declareVariables(inputHelper)}
+ ${shaderHelper.mainStart([
+ WG, 1, 1
+ ])}
+ let localOffset = local_idx * uniforms.elements_per_wg;
+ let offset: u32 = workgroup_id.x * uniforms.d_comp + localOffset;
+
+ var thread_max_vector = ${fillVector('f32', components, '-3.402823e+38f')};
+ for (var i: u32 = 0; i < uniforms.elements_per_wg && i + localOffset < uniforms.d_comp; i++) {
+ thread_max_vector = max(${castToF32(elemValueType, components, 'x[offset + i]')}, thread_max_vector);
}
- wgMax[local_index] = ${threadMaxValue};
+ wgMax[local_idx] = ${threadMaxValue};
workgroupBarrier();
var maxValue = -3.402823e+38f;
@@ -280,10 +284,10 @@ export const computeInPlaceSoftmax = (context: ComputeContext, input: TensorView
}
var sumVector = ${fillVector('f32', components, '0')};
- for (var i: u32 = 0; i < ${elementsPerWG} && i + localOffset < dComp; i++) {
- sumVector += exp(${castToF32(dataType, components, 'x[offset + i]')} - maxValue);
+ for (var i: u32 = 0; i < uniforms.elements_per_wg && i + localOffset < uniforms.d_comp; i++) {
+ sumVector += exp(${castToF32(elemValueType, components, 'x[offset + i]')} - maxValue);
}
- wgSum[local_index] = ${sumVector('sumVector', components)};
+ wgSum[local_idx] = ${sumVector('sumVector', components)};
workgroupBarrier();
var sum: f32 = 0;
@@ -292,26 +296,24 @@ export const computeInPlaceSoftmax = (context: ComputeContext, input: TensorView
}
if (sum == 0) {
- for (var i: u32 = 0; i < ${elementsPerWG} && i + localOffset < dComp; i++) {
- x[offset + i] = ${fillVector(dataType, components, 'dInv')};
+ for (var i: u32 = 0; i < uniforms.elements_per_wg && i + localOffset < uniforms.d_comp; i++) {
+ x[offset + i] = ${fillVector('f32', components, 'uniforms.d_inv')};
}
} else {
- for (var i: u32 = 0; i < ${elementsPerWG} && i + localOffset < dComp; i++) {
- let f32input = ${castToF32(dataType, components, 'x[offset + i]')};
+ for (var i: u32 = 0; i < uniforms.elements_per_wg && i + localOffset < uniforms.d_comp; i++) {
+ let f32input = ${castToF32(elemValueType, components, 'x[offset + i]')};
x[offset + i] = ${inputHelper.type.value}(exp(f32input - maxValue) / sum);
}
}
}`;
+ };
context.compute(
{
name: 'AttentionProbsSoftmax',
- shaderCache: {hint: `${d}`},
+ shaderCache: {hint: `${WG};${dataType};${components}`},
getShaderSource,
- getRunData: () => ({
- outputs: [],
- dispatchGroup: {x: n},
- }),
+ getRunData: () => ({outputs: [], dispatchGroup: {x: n}, programUniforms}),
},
{inputs: [input], outputs: []});
};
@@ -326,47 +328,43 @@ const computeAttentionProbs =
// TODO: handle mask
const alpha = attributes.scale === 0 ? 1.0 / Math.sqrt(parameters.headSize) : attributes.scale;
-
- const dataType = tensorTypeToWsglStorageType(q.dataType);
-
const components = getMaxComponents(parameters.headSize);
- const qInput = inputVariable('q', q.dataType, q.dims, components);
- const kInput = inputVariable('key', key.dataType, key.dims, components);
- const output = outputVariable('output', q.dataType, probsShape);
-
const vectorizedHeadSize = parameters.headSize / components;
- const M = parameters.sequenceLength;
- const N = parameters.totalSequenceLength;
- const K = vectorizedHeadSize;
-
const TILE_SIZE = 12;
-
const dispatch = {
x: Math.ceil(parameters.totalSequenceLength / TILE_SIZE),
y: Math.ceil(parameters.sequenceLength / TILE_SIZE),
z: parameters.batchSize * parameters.numHeads
};
+ const tensorDataType = tensorDataTypeEnumToString(q.dataType) as ProgramUniform['type'];
+ const programUniforms: ProgramUniform[] = [
+ {type: 'uint32', data: parameters.sequenceLength}, {type: 'uint32', data: vectorizedHeadSize},
+ {type: 'uint32', data: parameters.totalSequenceLength}, {type: 'uint32', data: parameters.kvSequenceLength},
+ {type: tensorDataType, data: alpha}
+ ];
const inputs = [q, key];
- const getShaderSource = (shaderHelper: ShaderHelper) => `
- const M: u32 = ${M}u;
- const N: u32 = ${N}u;
- const K: u32 = ${K}u;
- const alpha: ${dataType} = ${alpha};
+
+ const getShaderSource = (shaderHelper: ShaderHelper) => {
+ const qInput = inputVariable('q', q.dataType, q.dims, components);
+ const kInput = inputVariable('key', key.dataType, key.dims, components);
+ const output = outputVariable('output', q.dataType, probsShape);
+ const dataType = tensorTypeToWsglStorageType(q.dataType);
+
+ const uniforms: UniformsArrayType = [
+ {name: 'M', type: 'u32'}, {name: 'K', type: 'u32'}, {name: 'N', type: 'u32'},
+ {name: 'kv_sequence_length', type: 'u32'}, {name: 'alpha', type: dataType as UniformDataElementType}
+ ];
+ return `
const beta: ${dataType} = 1.0;
const TILE_SIZE = ${TILE_SIZE}u;
var tileQ: array<${qInput.type.storage}, ${TILE_SIZE * TILE_SIZE}>;
var tileK: array<${qInput.type.storage}, ${TILE_SIZE * TILE_SIZE}>;
-
- ${shaderHelper.declareVariables(qInput, kInput, output)}
-
- @compute @workgroup_size(${TILE_SIZE}, ${TILE_SIZE}, 1)
- fn main(@builtin(workgroup_id) workgroup_id : vec3,
- @builtin(local_invocation_id) local_id : vec3, @builtin(local_invocation_index) local_index : u32) {
- let global_idx = (workgroup_id.z * ${dispatch.x * dispatch.y}u +
- workgroup_id.y * ${dispatch.x}u + workgroup_id.x) * ${TILE_SIZE * TILE_SIZE}u + local_index;
-
+ ${shaderHelper.registerUniforms(uniforms).declareVariables(qInput, kInput, output)}
+ ${shaderHelper.mainStart([
+ TILE_SIZE, TILE_SIZE, 1
+ ])}
// x holds the N and y holds the M
let headIdx = workgroup_id.z;
let m = workgroup_id.y * TILE_SIZE;
@@ -374,40 +372,42 @@ const computeAttentionProbs =
let lm = m + local_id.y;
let ln = n + local_id.x;
- let qOffset = ${parameters.sequenceLength * vectorizedHeadSize} * headIdx + m * K;
- let kOffset = ${parameters.kvSequenceLength * vectorizedHeadSize} * headIdx + n * K;
+ let qOffset = uniforms.M * uniforms.K * headIdx + m * uniforms.K;
+ let kOffset = uniforms.kv_sequence_length * uniforms.K * headIdx + n * uniforms.K;
var value = ${fillVector(dataType, components)};
- for (var w: u32 = 0u; w < K; w += TILE_SIZE) {
- if (m + local_id.y < M && w + local_id.x < K) {
- tileQ[TILE_SIZE * local_id.y + local_id.x] = q[qOffset + local_id.y * K + w + local_id.x];
+ for (var w: u32 = 0u; w < uniforms.K; w += TILE_SIZE) {
+ if (m + local_id.y < uniforms.M && w + local_id.x < uniforms.K) {
+ tileQ[TILE_SIZE * local_id.y + local_id.x] = q[qOffset + local_id.y * uniforms.K + w + local_id.x];
}
- if (n + local_id.y < N && w + local_id.x < K) {
- tileK[TILE_SIZE * local_id.y + local_id.x] = key[kOffset + local_id.y * K + w + local_id.x];
+ if (n + local_id.y < uniforms.N && w + local_id.x < uniforms.K) {
+ tileK[TILE_SIZE * local_id.y + local_id.x] = key[kOffset + local_id.y * uniforms.K + w + local_id.x];
}
workgroupBarrier();
- for (var k: u32 = 0u; k ({
outputs: [{dims: probsShape, dataType: q.dataType, gpuDataType: GpuDataType.default}],
dispatchGroup: dispatch,
+ programUniforms
}),
getShaderSource,
},
@@ -423,78 +423,76 @@ const computeAttentionProbs =
const computeVxAttentionScore =
(context: ComputeContext, probs: TensorView, v: TensorView, params: AttentionParameters) => {
const outputShape = [params.batchSize, params.sequenceLength, params.vHiddenSize];
-
- const probsHelper = inputVariable('probs', probs.dataType, probs.dims);
- const vHelper = inputVariable('v', v.dataType, v.dims);
- const output = outputVariable('output', probs.dataType, outputShape);
-
- const dataType = tensorTypeToWsglStorageType(probs.dataType);
-
const TILE_SIZE = 12;
const dispatch = {
x: Math.ceil(params.vHeadSize / TILE_SIZE),
y: Math.ceil(params.sequenceLength / TILE_SIZE),
z: params.batchSize * params.numHeads
};
+ const programUniforms: ProgramUniform[] = [
+ {type: 'uint32', data: params.sequenceLength}, {type: 'uint32', data: params.totalSequenceLength},
+ {type: 'uint32', data: params.vHeadSize}, {type: 'uint32', data: params.numHeads},
+ {type: 'uint32', data: params.vHiddenSize}
+ ];
- const getShaderSource = (shaderHelper: ShaderHelper) => `
- const M: u32 = ${params.sequenceLength}u;
- const N: u32 = ${params.vHeadSize}u;
- const K: u32 = ${params.totalSequenceLength}u;
- const numHeads: u32 = ${params.numHeads}u;
+ const getShaderSource = (shaderHelper: ShaderHelper) => {
+ const probsHelper = inputVariable('probs', probs.dataType, probs.dims);
+ const vHelper = inputVariable('v', v.dataType, v.dims);
+ const output = outputVariable('output', probs.dataType, outputShape);
+ const uniforms: UniformsArrayType = [
+ {name: 'M', type: 'u32'}, {name: 'K', type: 'u32'}, {name: 'N', type: 'u32'},
+ {name: 'num_heads', type: 'u32'}, {name: 'v_hidden_size', type: 'u32'}
+ ];
+ return `
const TILE_SIZE = ${TILE_SIZE}u;
-
- var tileQ: array<${probsHelper.type.storage}, ${TILE_SIZE * TILE_SIZE}>;
- var tileK: array<${probsHelper.type.storage}, ${TILE_SIZE * TILE_SIZE}>;
-
- ${shaderHelper.declareVariables(probsHelper, vHelper, output)}
-
- @compute @workgroup_size(${TILE_SIZE}, ${TILE_SIZE}, 1)
- fn main(@builtin(workgroup_id) workgroup_id : vec3,
- @builtin(local_invocation_id) local_id : vec3, @builtin(local_invocation_index) local_index : u32) {
- let global_idx = (workgroup_id.z * ${dispatch.x * dispatch.y}u +
- workgroup_id.y * ${dispatch.x}u + workgroup_id.x) * ${TILE_SIZE * TILE_SIZE}u + local_index;
-
+ var tileQ: array<${probsHelper.type.value}, ${TILE_SIZE * TILE_SIZE}>;
+ var tileK: array<${probsHelper.type.value}, ${TILE_SIZE * TILE_SIZE}>;
+ ${shaderHelper.registerUniforms(uniforms).declareVariables(probsHelper, vHelper, output)}
+ ${shaderHelper.mainStart([
+ TILE_SIZE, TILE_SIZE, 1
+ ])}
let headIdx = workgroup_id.z;
let m = workgroup_id.y * TILE_SIZE + local_id.y;
let n = workgroup_id.x * TILE_SIZE + local_id.x;
- let offsetA = headIdx * (M * K) + m * K;
- let offsetB = headIdx * (N * K) + n;
+ let offsetA = headIdx * (uniforms.M * uniforms.K) + m * uniforms.K;
+ let offsetB = headIdx * (uniforms.N * uniforms.K) + n;
- var value = ${dataType}(0);
- for (var w: u32 = 0u; w < K; w += TILE_SIZE) {
- if (m < M && w + local_id.x < K) {
+ var value = ${probsHelper.type.storage}(0);
+ for (var w: u32 = 0u; w < uniforms.K; w += TILE_SIZE) {
+ if (m < uniforms.M && w + local_id.x < uniforms.K) {
tileQ[TILE_SIZE * local_id.y + local_id.x] = probs[offsetA + w + local_id.x];
}
- if (n < N && w + local_id.y < K) {
- tileK[TILE_SIZE * local_id.y + local_id.x] = v[offsetB + (w + local_id.y) * N];
+ if (n < uniforms.N && w + local_id.y < uniforms.K) {
+ tileK[TILE_SIZE * local_id.y + local_id.x] = v[offsetB + (w + local_id.y) * uniforms.N];
}
workgroupBarrier();
- for (var k: u32 = 0u; k ({
outputs: [{dims: outputShape, dataType: probs.dataType, gpuDataType: GpuDataType.default}],
dispatchGroup: dispatch,
+ programUniforms
}),
getShaderSource,
},
@@ -517,71 +515,71 @@ const prepare = (context: ComputeContext, parameters: AttentionParameters) => {
parameters.sequenceLength,
parameters.headSize,
];
-
- const dataType = tensorTypeToWsglStorageType(context.inputs[0].dataType);
-
const M = parameters.sequenceLength;
const K = parameters.inputHiddenSize;
const N = parameters.headSize;
-
const TILE_SIZE = 12;
const dispatch = {
x: Math.ceil(parameters.headSize / TILE_SIZE),
y: Math.ceil(parameters.sequenceLength / TILE_SIZE),
z: parameters.batchSize * parameters.numHeads
};
+ const inputs = [context.inputs[0], context.inputs[1], context.inputs[2]];
+ const programUniforms: ProgramUniform[] = [
+ {type: 'uint32', data: M}, {type: 'uint32', data: K}, {type: 'uint32', data: N},
+ {type: 'uint32', data: parameters.numHeads}, {type: 'uint32', data: parameters.headSize},
+ {type: 'uint32', data: parameters.hiddenSize},
+ {type: 'uint32', data: parameters.hiddenSize + parameters.hiddenSize + parameters.vHiddenSize}
+ ];
- const getShaderSource = () => `
- const M: u32 = ${M}u;
- const K: u32 = ${K}u;
- const N: u32 = ${N}u;
- const numHeads: u32 = ${parameters.numHeads};
- const ldb = ${parameters.hiddenSize + parameters.hiddenSize + parameters.vHiddenSize}u;
+ const getShaderSource = (shaderHelper: ShaderHelper) => {
+ const outputQ = outputVariable('output_q', inputs[0].dataType, outputShape);
+ const outputK = outputVariable('output_k', inputs[0].dataType, outputShape);
+ const outputV = outputVariable('output_v', inputs[0].dataType, outputShape);
+ const input = inputVariable('input', inputs[0].dataType, inputs[0].dims);
+ const weight = inputVariable('weight', inputs[1].dataType, inputs[1].dims);
+ const bias = inputVariable('bias', inputs[2].dataType, inputs[2].dims);
+ const dataType = input.type.storage;
+
+ const uniforms: UniformsArrayType = [
+ {name: 'M', type: 'u32'}, {name: 'K', type: 'u32'}, {name: 'N', type: 'u32'}, {name: 'num_heads', type: 'u32'},
+ {name: 'head_size', type: 'u32'}, {name: 'hidden_size', type: 'u32'}, {name: 'ldb', type: 'u32'}
+ ];
+ return `
const TILE_SIZE = ${TILE_SIZE}u;
-
var tileInput: array<${dataType}, ${TILE_SIZE * TILE_SIZE}>;
var tileWeightQ: array<${dataType}, ${TILE_SIZE * TILE_SIZE}>;
var tileWeightK: array<${dataType}, ${TILE_SIZE * TILE_SIZE}>;
var tileWeightV: array<${dataType}, ${TILE_SIZE * TILE_SIZE}>;
-
- @group(0) @binding(0) var input: array<${dataType}>;
- @group(0) @binding(1) var weight: array<${dataType}>;
- @group(0) @binding(2) var bias: array<${dataType}>;
- @group(0) @binding(3) var outputQ: array<${dataType}>;
- @group(0) @binding(4) var outputK: array<${dataType}>;
- @group(0) @binding(5) var outputV: array<${dataType}>;
-
- @compute @workgroup_size(${TILE_SIZE}, ${TILE_SIZE}, 1)
- fn main(@builtin(workgroup_id) workgroup_id : vec3,
- @builtin(local_invocation_id) local_id : vec3, @builtin(local_invocation_index) local_index : u32) {
- let global_idx = (workgroup_id.z * ${dispatch.x * dispatch.y}u +
- workgroup_id.y * ${dispatch.x}u + workgroup_id.x) * ${TILE_SIZE * TILE_SIZE}u + local_index;
-
- let batchIndex = workgroup_id.z / ${parameters.numHeads};
- let headNumber = workgroup_id.z % ${parameters.numHeads};
+ ${shaderHelper.registerUniforms(uniforms).declareVariables(input, weight, bias, outputQ, outputK, outputV)}
+ ${shaderHelper.mainStart([
+ TILE_SIZE, TILE_SIZE, 1
+ ])}
+ let batchIndex = workgroup_id.z / uniforms.num_heads;
+ let headNumber = workgroup_id.z % uniforms.num_heads;
let m = workgroup_id.y * TILE_SIZE + local_id.y;
let n = workgroup_id.x * TILE_SIZE + local_id.x;
- let inputOffset = batchIndex * (M * K) + m * K;
- let biasOffsetQ = headNumber * ${parameters.headSize};
- let biasOffsetK = ${parameters.hiddenSize} + biasOffsetQ;
- let biasOffsetV = ${parameters.hiddenSize} + biasOffsetK;
+ let inputOffset = batchIndex * (uniforms.M * uniforms.K) + m * uniforms.K;
+ let biasOffsetQ = headNumber * uniforms.head_size;
+ let biasOffsetK = uniforms.hidden_size + biasOffsetQ;
+ let biasOffsetV = uniforms.hidden_size + biasOffsetK;
var valueQ = ${dataType}(0);
var valueK = ${dataType}(0);
var valueV = ${dataType}(0);
- for (var w: u32 = 0u; w < K; w += TILE_SIZE) {
- if (m < M && w + local_id.x < K) {
+ for (var w: u32 = 0u; w < uniforms.K; w += TILE_SIZE) {
+ if (m < uniforms.M && w + local_id.x < uniforms.K) {
tileInput[TILE_SIZE * local_id.y + local_id.x] = input[inputOffset + w + local_id.x];
}
- if (n < N && w + local_id.y < K) {
- let offset = n + (w + local_id.y) * ldb;
+ if (n < uniforms.N && w + local_id.y < uniforms.K) {
+ let offset = n + (w + local_id.y) * uniforms.ldb;
tileWeightQ[TILE_SIZE * local_id.y + local_id.x] = weight[biasOffsetQ + offset];
tileWeightK[TILE_SIZE * local_id.y + local_id.x] = weight[biasOffsetK + offset];
tileWeightV[TILE_SIZE * local_id.y + local_id.x] = weight[biasOffsetV + offset];
}
workgroupBarrier();
- for (var k: u32 = 0u; k {
workgroupBarrier();
}
- let headOffset = (m * N + n) % ${parameters.headSize};
+ let headOffset = (m * uniforms.N + n) % uniforms.head_size;
valueQ += bias[headOffset + biasOffsetQ];
valueK += bias[headOffset + biasOffsetK];
valueV += bias[headOffset + biasOffsetV];
- let offset = workgroup_id.z * M * N;
- if (m < M && n < N) {
- let outputIdx = offset + m * N + n;
- outputQ[outputIdx] = valueQ;
- outputK[outputIdx] = valueK;
- outputV[outputIdx] = valueV;
+ let offset = workgroup_id.z * uniforms.M * uniforms.N;
+ if (m < uniforms.M && n < uniforms.N) {
+ let outputIdx = offset + m * uniforms.N + n;
+ output_q[outputIdx] = valueQ;
+ output_k[outputIdx] = valueK;
+ output_v[outputIdx] = valueV;
}
}`;
-
- const inputs = [context.inputs[0], context.inputs[1], context.inputs[2]];
+ };
return context.compute(
{
name: 'AttentionPrepare',
- shaderCache: {hint: JSON.stringify(parameters)},
+ shaderCache: {inputDependencies: ['type', 'type', 'type']},
getRunData: () => ({
outputs: [
{dims: outputShape, dataType: context.inputs[0].dataType, gpuDataType: GpuDataType.default},
@@ -619,6 +616,7 @@ const prepare = (context: ComputeContext, parameters: AttentionParameters) => {
{dims: outputShape, dataType: context.inputs[0].dataType, gpuDataType: GpuDataType.default},
],
dispatchGroup: dispatch,
+ programUniforms
}),
getShaderSource,
},
diff --git a/js/web/lib/wasm/jsep/webgpu/ops/common.ts b/js/web/lib/wasm/jsep/webgpu/ops/common.ts
index 3ce114c5d3884..bc3265be955f0 100644
--- a/js/web/lib/wasm/jsep/webgpu/ops/common.ts
+++ b/js/web/lib/wasm/jsep/webgpu/ops/common.ts
@@ -780,8 +780,10 @@ class ShaderHelperImpl implements ShaderHelper {
const is1DimensionDispatch = this.normalizedDispatchGroup[1] === 1 && this.normalizedDispatchGroup[2] === 1;
const paramList = is1DimensionDispatch ? `@builtin(global_invocation_id) global_id : vec3,
+ @builtin(workgroup_id) workgroup_id : vec3,
@builtin(local_invocation_id) local_id : vec3` :
- `@builtin(local_invocation_index) local_idx : u32,
+ `@builtin(local_invocation_id) local_id : vec3,
+ @builtin(local_invocation_index) local_idx : u32,
@builtin(workgroup_id) workgroup_id : vec3,
@builtin(num_workgroups) num_workgroups : vec3`;
const globalIdxDefinition = is1DimensionDispatch ?
diff --git a/js/web/lib/wasm/jsep/webgpu/ops/conv-grouped.ts b/js/web/lib/wasm/jsep/webgpu/ops/conv-grouped.ts
index 14482272bad38..21b4953d3f90c 100644
--- a/js/web/lib/wasm/jsep/webgpu/ops/conv-grouped.ts
+++ b/js/web/lib/wasm/jsep/webgpu/ops/conv-grouped.ts
@@ -3,9 +3,9 @@
import {TensorView} from '../../tensor-view';
import {ShapeUtil} from '../../util';
-import {ProgramInfo} from '../types';
+import {ProgramInfo, ProgramUniform} from '../types';
-import {inputVariable, outputVariable, ShaderHelper} from './common';
+import {createTensorShapeVariables, getMaxComponents, inputVariable, outputVariable, ShaderHelper} from './common';
import {calculateOutputShape, ConvAttributes} from './conv';
import {getActivationSnippet} from './fuse-utils';
@@ -95,3 +95,98 @@ export const createGroupedConvProgramInfo =
getShaderSource,
};
};
+
+export const createGroupedConvVectorizeProgramInfo =
+ (inputs: readonly TensorView[], attributes: ConvAttributes, outputShape: readonly number[]): ProgramInfo => {
+ const hasBias = inputs.length > 2;
+ const components = getMaxComponents(outputShape[3]);
+ const outputNumber = getMaxComponents(outputShape[2]);
+ const outputSize = ShapeUtil.size(outputShape) / components / outputNumber;
+ const xShape = [inputs[0].dims[0], inputs[0].dims[1], inputs[0].dims[2], inputs[0].dims[3] / components];
+ const wShape = [inputs[1].dims[0], inputs[1].dims[1], inputs[1].dims[2], inputs[1].dims[3] / components];
+ const outputShapeInShader = [outputShape[0], outputShape[1], outputShape[2], outputShape[3] / components];
+
+ const programUniforms: ProgramUniform[] = [
+ {type: 'uint32', data: outputSize}, {type: 'int32', data: attributes.strides},
+ {type: 'int32', data: attributes.pads}, ...createTensorShapeVariables(xShape),
+ ...createTensorShapeVariables(wShape), ...createTensorShapeVariables(outputShapeInShader)
+ ];
+ const xNumber = (outputNumber - 1) * attributes.strides[1] + wShape[1];
+ const getShaderSource = (shaderHelper: ShaderHelper) => {
+ const output = outputVariable('output', inputs[0].dataType, outputShapeInShader.length, components);
+ const {activationFunction, applyActivation} = getActivationSnippet(attributes, output.type.value);
+ const x = inputVariable('x', inputs[0].dataType, xShape.length, components);
+ const w = inputVariable('w', inputs[1].dataType, wShape.length, components);
+ const inputVars = [x, w];
+ if (hasBias) {
+ inputVars.push(inputVariable('b', inputs[2].dataType, inputs[2].dims, components));
+ }
+ const processBias = hasBias ? 'value += b[output_channel];' : '';
+
+ return `
+ ${
+ shaderHelper.registerUniform('output_size', 'u32')
+ .registerUniform('strides', 'i32', 2)
+ .registerUniform('pads', 'i32', 2)
+ .declareVariables(...inputVars, output)}
+ ${activationFunction}
+ ${shaderHelper.mainStart()}
+ ${shaderHelper.guardAgainstOutOfBoundsWorkgroupSizes('uniforms.output_size')}
+ let width0 = uniforms.output_shape[3];
+ let output_channel = global_idx % width0;
+ var index1 = global_idx / width0;
+ let width1 = uniforms.output_shape[2] / ${outputNumber}u;
+ let col = (index1 % width1) * ${outputNumber}u;
+ index1 = index1 / width1;
+ let row = index1 % uniforms.output_shape[1];
+ let batch = index1 / uniforms.output_shape[1];
+
+ let x_corner = vec2(i32(row), i32(col)) * uniforms.strides - uniforms.pads;
+
+ var x_vals: array<${x.type.value}, ${xNumber}>;
+ var values: array<${output.type.value}, ${outputNumber}>;
+ let input_channel = output_channel;
+ // Use constant instead of uniform can give better performance for w's height/width.
+ for (var w_height: u32 = 0u; w_height < ${wShape[0]}; w_height++) {
+ let x_height = x_corner.x + i32(w_height);
+ if (x_height >= 0 || u32(x_height) < uniforms.x_shape[1]) {
+ for (var i = 0; i < ${xNumber}; i++) {
+ let x_width = x_corner.y + i;
+ if (x_width >= 0 && u32(x_width) < uniforms.x_shape[2]) {
+ x_vals[i] = ${x.get('batch', 'u32(x_height)', 'u32(x_width)', 'input_channel')};
+ } else {
+ x_vals[i] = ${x.type.value}(0);
+ }
+ }
+ for (var w_width: u32 = 0u; w_width < ${wShape[1]}; w_width++) {
+ let w_val = ${w.get('w_height', 'w_width', '0', 'output_channel')};
+ for (var i = 0u; i < ${outputNumber}u; i++) {
+ values[i] = fma(x_vals[i * ${attributes.strides[1]}u + w_width], w_val, values[i]);
+ }
+ }
+ }
+ }
+
+ for (var i = 0u; i < ${outputNumber}u; i++) {
+ var value = values[i];
+ ${processBias}
+ ${applyActivation}
+ ${output.set('batch', 'row', 'col + i', 'output_channel', 'value')};
+ }
+ }`;
+ };
+
+ return {
+ name: 'GroupedConv-Vectorize',
+ shaderCache: {
+ hint: `${attributes.activationCacheKey};${components};${outputNumber};${xNumber};${wShape[0]};${wShape[1]}`,
+ inputDependencies: hasBias ? ['rank', 'rank', 'type'] : ['rank', 'rank']
+ },
+ getRunData: () => ({
+ outputs: [{dims: outputShape, dataType: inputs[0].dataType}],
+ dispatchGroup: {x: Math.ceil(outputSize / 64 /* workgroup size */)},
+ programUniforms
+ }),
+ getShaderSource,
+ };
+ };
diff --git a/js/web/lib/wasm/jsep/webgpu/ops/conv.ts b/js/web/lib/wasm/jsep/webgpu/ops/conv.ts
index 33a5db7ff6b25..7af2c5db49f40 100644
--- a/js/web/lib/wasm/jsep/webgpu/ops/conv.ts
+++ b/js/web/lib/wasm/jsep/webgpu/ops/conv.ts
@@ -8,7 +8,7 @@ import {ComputeContext} from '../types';
import {createConv2DMatMulProgramInfo} from './3rd-party/conv2d_mm_webgpu';
import {createMatmulProgramInfo} from './3rd-party/matmul_packed_webgpu';
-import {createGroupedConvProgramInfo} from './conv-grouped';
+import {createGroupedConvProgramInfo, createGroupedConvVectorizeProgramInfo} from './conv-grouped';
import {InternalActivationAttributes, parseInternalActivationAttributes} from './fuse-utils';
import {createNaiveMatmulProgramInfo} from './matmul';
import {createTransposeProgramInfo} from './transpose';
@@ -136,12 +136,36 @@ const conv2d = (context: ComputeContext, inputs: readonly TensorView[], attribut
// check attributes
// const hasPreluActivationWeights = false; /* TODO: add support for prelu activation weights */
+ const isChannelsLast = attributes.format === 'NHWC';
if (attributes.group !== 1) {
- context.compute(createGroupedConvProgramInfo(inputs, adjustedAttributes));
+ // Temporarily disable createGroupedConvVectorizeProgramInfo path due to bots failures with below two cases:
+ // [webgpu]Conv - conv - vectorize group - B
+ // [webgpu]Conv - conv - vectorize group - D
+ const disableGroupedConvVectorize = true;
+ if (!disableGroupedConvVectorize && isChannelsLast && inputs[1].dims[0] === attributes.group &&
+ inputs[1].dims[1] === 1 && attributes.dilations[0] === 1 && attributes.dilations[1] === 1) {
+ const outputShape = calculateOutputShape(
+ inputs[0].dims, inputs[1].dims, attributes.dilations, adjustedAttributes.pads, attributes.strides,
+ isChannelsLast);
+ const transposedWeight = (context.kernelCustomData.wT as TensorView | undefined) ??
+ context.compute(
+ createTransposeProgramInfo(inputs[1], weightTransposeAttribute),
+ {inputs: [1], outputs: [attributes.wIsConst ? -2 : -1]})[0];
+ if (attributes.wIsConst && !context.kernelCustomData.wT) {
+ context.kernelCustomData.wT = transposedWeight;
+ }
+ const convInputs = [inputs[0], transposedWeight];
+ if (inputs.length === 3) {
+ convInputs.push(inputs[2]);
+ }
+ context.compute(
+ createGroupedConvVectorizeProgramInfo(convInputs, adjustedAttributes, outputShape), {inputs: convInputs});
+ } else {
+ context.compute(createGroupedConvProgramInfo(inputs, adjustedAttributes));
+ }
return;
}
- const isChannelsLast = attributes.format === 'NHWC';
const hasBias = inputs.length === 3;
const inputHeight = inputs[0].dims[isChannelsLast ? 1 : 2];
const inputWidth = inputs[0].dims[isChannelsLast ? 2 : 3];
diff --git a/js/web/lib/wasm/jsep/webgpu/ops/gemm.ts b/js/web/lib/wasm/jsep/webgpu/ops/gemm.ts
index 1c5d28e4b8e3f..30754c84413b7 100644
--- a/js/web/lib/wasm/jsep/webgpu/ops/gemm.ts
+++ b/js/web/lib/wasm/jsep/webgpu/ops/gemm.ts
@@ -3,10 +3,10 @@
import {TensorView} from '../../tensor-view';
import {GemmUtil, ShapeUtil} from '../../util';
-import {AttributeWithCacheKey, createAttributeWithCacheKey} from '../attribute-with-cache-key';
-import {ComputeContext, ProgramInfo} from '../types';
+import {AttributeWithCacheKey} from '../attribute-with-cache-key';
+import {ComputeContext, ProgramInfo, ProgramInputTensorInfoDependency, ProgramUniform} from '../types';
-import {ShaderHelper, tensorTypeToWsglStorageType} from './common';
+import {createTensorShapeVariables, IndicesHelper, inputVariable, outputVariable, ShaderHelper, UniformsArrayType} from './common';
const validateInputs = (inputs: readonly TensorView[]): void => {
if (!inputs) {
@@ -34,25 +34,6 @@ export interface GemmAttributes extends AttributeWithCacheKey {
beta: number;
}
-const offsetC = (m: number, n: number, dims: readonly number[]): string => {
- if (dims.length === 0) {
- return '0u';
- }
-
- const broadcastM = (dims.length === 1 && m !== 1) || (dims.length === 2 && dims[0] !== m);
- const broadcastN = dims[dims.length - 1] !== n;
-
- let offset = '0u';
- if (!broadcastM) {
- offset += `+ m * ${dims[dims.length - 1]}u`;
- }
- if (!broadcastN) {
- offset += '+n';
- }
-
- return offset;
-};
-
const createGemmProgramInfo = (inputs: readonly TensorView[], attributes: GemmAttributes): ProgramInfo => {
const aShape = inputs[0].dims.slice();
const bShape = inputs[1].dims.slice();
@@ -63,68 +44,92 @@ const createGemmProgramInfo = (inputs: readonly TensorView[], attributes: GemmAt
throw new Error('Can\'t use gemm on the given tensors');
}
const outputSize = ShapeUtil.size(outputShape);
- let line = '';
- if (attributes.transA && attributes.transB) {
- line = 'value += a[k * M + m] * b[n * K + k];';
- } else if (attributes.transA && !attributes.transB) {
- line = 'value += a[k * M + m] * b[k * N + n];';
- } else if (!attributes.transA && attributes.transB) {
- line = 'value += a[m * K + k] * b[n * K + k];';
- } else if (!attributes.transA && !attributes.transB) {
- line = 'value += a[m * K + k] * b[k * N + n];';
- }
-
- const dataType = tensorTypeToWsglStorageType(inputs[0].dataType);
- const calculateAlpha = attributes.alpha === 1 ? '' : 'value *= alpha;';
- const calculateC = inputs.length === 3 ? `value += beta * c[${offsetC(M, N, inputs[2].dims)}];` : '';
- const inputStorageBuffersDeclarations = [
- `@group(0) @binding(0) var a : array<${dataType}>;`,
- `@group(0) @binding(1) var b : array<${dataType}>;`
+ const programUniforms: ProgramUniform[] = [
+ {type: 'uint32', data: outputSize}, {type: 'uint32', data: M}, {type: 'uint32', data: N}, {type: 'uint32', data: K},
+ {type: 'float32', data: attributes.alpha}, {type: 'float32', data: attributes.beta}
];
+ const inputDependencies: ProgramInputTensorInfoDependency[] = ['type', 'type'];
if (inputs.length === 3) {
- inputStorageBuffersDeclarations.push(`@group(0) @binding(2) var c : array<${dataType}>;`);
+ programUniforms.push(...createTensorShapeVariables(inputs[2].dims));
+ inputDependencies.push('rank');
}
- const getShaderSource = (shaderHelper: ShaderHelper) => `
- const M: u32 = ${M}u;
- const N: u32 = ${N}u;
- const K: u32 = ${K}u;
- const alpha = ${dataType}(${attributes.alpha});
- const beta = ${dataType}(${attributes.beta});
+ programUniforms.push(...createTensorShapeVariables(outputShape));
+
+ const getShaderSource = (shaderHelper: ShaderHelper) => {
+ let line = '';
+ if (attributes.transA && attributes.transB) {
+ line = 'value += a[k * uniforms.M + m] * b[n * uniforms.K + k];';
+ } else if (attributes.transA && !attributes.transB) {
+ line = 'value += a[k * uniforms.M + m] * b[k * uniforms.N + n];';
+ } else if (!attributes.transA && attributes.transB) {
+ line = 'value += a[m * uniforms.K + k] * b[n * uniforms.K + k];';
+ } else if (!attributes.transA && !attributes.transB) {
+ line = 'value += a[m * uniforms.K + k] * b[k * uniforms.N + n];';
+ }
- ${inputStorageBuffersDeclarations.join('\n')}
- @group(0) @binding(${inputs.length}) var output : array<${dataType}>;
+ const calculateAlpha = attributes.alpha === 1 ? '' : 'value *= uniforms.alpha;';
+ const a = inputVariable('a', inputs[0].dataType, inputs[0].dims);
+ const b = inputVariable('b', inputs[1].dataType, inputs[1].dims);
+ const dataType = a.type.value;
+ let c: IndicesHelper|null = null;
+ const variables = [a, b];
+ if (inputs.length === 3) {
+ c = inputVariable('c', inputs[2].dataType, inputs[2].dims.length);
+ variables.push(c);
+ }
+ const output = outputVariable('output', inputs[0].dataType, outputShape.length);
+ variables.push(output);
+ const uniforms: UniformsArrayType = [
+ {name: 'output_size', type: 'u32'}, {name: 'M', type: 'u32'}, {name: 'N', type: 'u32'}, {name: 'K', type: 'u32'},
+ {name: 'alpha', type: 'f32'}, {name: 'beta', type: 'f32'}
+ ];
+ return `
+ ${shaderHelper.registerUniforms(uniforms).declareVariables(...variables)}
${shaderHelper.mainStart()}
- ${shaderHelper.guardAgainstOutOfBoundsWorkgroupSizes(outputSize)}
+ ${shaderHelper.guardAgainstOutOfBoundsWorkgroupSizes('uniforms.output_size')}
- let m = global_idx / N;
- let n = global_idx % N;
+ let m = global_idx / uniforms.N;
+ let n = global_idx % uniforms.N;
var value = ${dataType}(0);
- for (var k: u32 = 0u; k<${K}u; k++) {
+ for (var k: u32 = 0u; k < uniforms.K; k++) {
${line}
}
${calculateAlpha}
- ${calculateC}
+ ${(() => {
+ if (c != null) {
+ return `let cOffset = ${c.broadcastedIndicesToOffset('vec2(m, n)', output)}; value += uniforms.beta * ${
+ c.getByOffset('cOffset')};`;
+ }
+ return '';
+ })()}
output[global_idx] = value;
-
}`;
+ };
+
return {
name: 'Gemm',
- shaderCache: {hint: attributes.cacheKey},
+ shaderCache: {hint: `${attributes.cacheKey}`, inputDependencies},
getRunData: () => ({
outputs: [{dims: outputShape, dataType: inputs[0].dataType}],
- dispatchGroup: {x: Math.ceil(outputSize / 64 /* workgroup size */)}
+ dispatchGroup: {x: Math.ceil(outputSize / 64 /* workgroup size */)},
+ programUniforms
}),
getShaderSource,
};
};
+export const parseGemmAttributes = (attributes: Record): GemmAttributes => {
+ const transA = attributes.transA as boolean;
+ const transB = attributes.transB as boolean;
+ const alpha = attributes.alpha as number;
+ const beta = attributes.beta as number;
+ return {transA, transB, alpha, beta, cacheKey: `${attributes.transA};${attributes.transB};${attributes.alpha === 1}`};
+};
+
export const gemm = (context: ComputeContext, attributes: GemmAttributes): void => {
validateInputs(context.inputs);
context.compute(createGemmProgramInfo(context.inputs, attributes));
};
-
-export const parseGemmAttributes = (attributes: Record): GemmAttributes =>
- createAttributeWithCacheKey(attributes as Omit);
diff --git a/js/web/lib/wasm/jsep/webgpu/ops/instance-norm.ts b/js/web/lib/wasm/jsep/webgpu/ops/instance-norm.ts
index 3a84844544c96..056dd54d54591 100644
--- a/js/web/lib/wasm/jsep/webgpu/ops/instance-norm.ts
+++ b/js/web/lib/wasm/jsep/webgpu/ops/instance-norm.ts
@@ -4,58 +4,56 @@
import {DataType} from '../../../wasm-common';
import {TensorView} from '../../tensor-view';
import {ShapeUtil} from '../../util';
-import {AttributeWithCacheKey, createAttributeWithCacheKey} from '../attribute-with-cache-key';
-import {ComputeContext, ProgramInfo} from '../types';
+import {ComputeContext, ProgramInfo, ProgramInputTensorInfoDependency, ProgramUniform} from '../types';
-import {fillVector, getMaxComponents, inputVariable, outputVariable, ShaderHelper, sumVector, tensorTypeToWsglStorageType} from './common';
+import {createTensorShapeVariables, fillVector, getMaxComponents, inputVariable, outputVariable, ShaderHelper, sumVector, tensorTypeToWsglStorageType, UniformsArrayType} from './common';
-export interface InstanceNormAttributes extends AttributeWithCacheKey {
+export interface InstanceNormAttributes {
epsilon: number;
format: 'NHWC'|'NCHW';
}
-const metadata = {
- name: 'InstanceNormalization'
-};
-
const createInstanceNormProgramInfo =
(inputs: readonly TensorView[], attributes: InstanceNormAttributes): ProgramInfo => {
const xShape = inputs[0].dims;
-
const outputShape = xShape;
const axis = 2;
const normCount = ShapeUtil.sizeToDimension(xShape, axis);
const normSize = ShapeUtil.sizeFromDimension(xShape, axis);
const components = getMaxComponents(normSize);
const normPackedSize = normSize / components;
- const C = xShape[1];
- const x = inputVariable('x', inputs[0].dataType, [xShape[0], xShape[1], normPackedSize], components);
- const scale = inputVariable('scale', inputs[1].dataType, inputs[1].dims);
- const bias = inputVariable('bias', inputs[2].dataType, inputs[2].dims);
- const output = outputVariable('output', inputs[0].dataType, [xShape[0], xShape[1], normPackedSize], components);
- const variables = [x, scale, bias, output];
- const dataType = x.type.value;
- const f32Type = components === 1 ? 'f32' : `vec${components}`;
- const workgroupSize = 64;
- const getShaderSource = (shaderHelper: ShaderHelper) => `
-
- const C: u32 = ${C};
- const normSize: u32 = ${normSize};
- const epsilon: f32 = ${attributes.epsilon};
+ const inputShape = [xShape[0], xShape[1], normPackedSize];
+ const inputDependencies: ProgramInputTensorInfoDependency[] = ['rank', 'type', 'type'];
+ const programUniforms: ProgramUniform[] =
+ [{type: 'uint32', data: normSize}, {type: 'uint32', data: normPackedSize}];
+ programUniforms.push(...createTensorShapeVariables(inputShape), ...createTensorShapeVariables(inputShape));
+
+ const getShaderSource = (shaderHelper: ShaderHelper) => {
+ const x = inputVariable('x', inputs[0].dataType, inputShape.length, components);
+ const scale = inputVariable('scale', inputs[1].dataType, inputs[1].dims);
+ const bias = inputVariable('bias', inputs[2].dataType, inputs[2].dims);
+ const output = outputVariable('output', inputs[0].dataType, inputShape.length, components);
+ const variables = [x, scale, bias, output];
+ const dataType = x.type.value;
+ const f32Type = components === 1 ? 'f32' : `vec${components}`;
+ const workgroupSize = 64;
+
+ const uniforms: UniformsArrayType = [{name: 'normSize', type: 'u32'}, {name: 'normPackedSize', type: 'u32'}];
+ return `
var meanShared : f32;
var squaredNormShared : f32;
var workgroupShared : array<${f32Type}, ${workgroupSize}>;
const workgroupSize = ${workgroupSize}u;
- ${shaderHelper.declareVariables(...variables)}
+ ${shaderHelper.registerUniforms(uniforms).declareVariables(...variables)}
${shaderHelper.mainStart(workgroupSize)}
let norm = global_idx / workgroupSize;
- let batch = norm / C;
- let channel = norm % C;
+ let batch = norm / uniforms.x_shape[1];
+ let channel = norm % uniforms.x_shape[1];
let localIndex = local_id.x;
// initialize workgroup memory
var initial = ${f32Type}(0);
- for (var h = localIndex; h < ${normPackedSize}; h += workgroupSize) {
+ for (var h = localIndex; h < uniforms.normPackedSize; h += workgroupSize) {
initial = initial + ${f32Type}(${x.get('batch', 'channel', 'h')});
}
workgroupShared[localIndex] = initial;
@@ -69,13 +67,13 @@ const createInstanceNormProgramInfo =
workgroupBarrier();
}
if (localIndex == 0) {
- meanShared = ${sumVector('workgroupShared[0]', components)} / f32(normSize);
+ meanShared = ${sumVector('workgroupShared[0]', components)} / f32(uniforms.normSize);
}
workgroupBarrier();
// reinitialize workgroup memory.
initial = ${f32Type}(0);
- for (var h = localIndex; h < ${normPackedSize}; h += workgroupSize) {
+ for (var h = localIndex; h < uniforms.normPackedSize; h += workgroupSize) {
let deviation = ${f32Type}(${x.get('batch', 'channel', 'h')}) - ${f32Type}(meanShared);
initial = initial + deviation * deviation;
}
@@ -94,23 +92,26 @@ const createInstanceNormProgramInfo =
}
workgroupBarrier();
- let invStdDev = 1 / sqrt(squaredNormShared / f32(normSize) + epsilon);
+ let invStdDev = 1 / sqrt(squaredNormShared / f32(uniforms.normSize) + f32(${attributes.epsilon}));
let channelScale = invStdDev * f32(${scale.getByOffset('channel')});
let channelShift = f32(${bias.getByOffset('channel')}) - meanShared * channelScale;
- for (var h = localIndex; h < ${normPackedSize}; h += workgroupSize) {
+ for (var h = localIndex; h < uniforms.normPackedSize; h += workgroupSize) {
let value = ${x.get('batch', 'channel', 'h')} * ${dataType}(${f32Type}(channelScale)) + ${dataType}(${
- f32Type}(channelShift));
+ f32Type}(channelShift));
${output.set('batch', 'channel', 'h', 'value')};
}
}`;
+ };
return {
- ...metadata,
- shaderCache: {hint: attributes.cacheKey},
+ ...{name: 'InstanceNormalization'},
+ // TODO: use epsilon as uniform. Currently epsilon as uniform fails test_instancenorm_epsilon.
+ shaderCache: {hint: `${attributes.epsilon};${components}`, inputDependencies},
getRunData: () => ({
outputs: [
{dims: outputShape, dataType: inputs[0].dataType},
],
- dispatchGroup: {x: normCount}
+ dispatchGroup: {x: normCount},
+ programUniforms
}),
getShaderSource,
};
@@ -120,10 +121,6 @@ const computeMean =
(context: ComputeContext, input: TensorView, scale: TensorView, bias: TensorView, n: number, h: number, c: number,
epsilon: number) => {
const components = getMaxComponents(c);
- const inputHelper = inputVariable('input', input.dataType, input.dims, components);
- const scaleHelper = inputVariable('scale', scale.dataType, scale.dims, components);
- const biasHelper = inputVariable('bias', bias.dataType, bias.dims, components);
-
const WG = 64;
// we will store channel scale and channel shift in [2, components] matrix
// or in vec2 when components == 1
@@ -133,65 +130,79 @@ const computeMean =
const unitsOfWork = n * c / components;
const wgSize = Math.ceil(h / WG);
- const getMeanShaderSource = (shaderHelper: ShaderHelper) => `
- const H: u32 = ${h};
- const C: u32 = ${c / components};
- const imageSize: u32 = ${h * c / components};
+ const meanInputDependencies: ProgramInputTensorInfoDependency[] = ['type'];
+ const meanProgramUniforms: ProgramUniform[] = [
+ {type: 'uint32', data: wgSize}, {type: 'uint32', data: h}, {type: 'uint32', data: Math.floor(c / components)},
+ {type: 'uint32', data: Math.floor(h * c / components)}
+ ];
+ const getMeanShaderSource = (shaderHelper: ShaderHelper) => {
+ const inputHelper = inputVariable('input', input.dataType, input.dims, components);
+ return `
${shaderHelper.declareVariables(inputHelper)}
@group(0) @binding(1) var output : array<${outputType}>;
+ struct Uniforms {wg_size:u32, H:u32, C:u32, image_size:u32};
+ @group(0) @binding(2) var uniforms: Uniforms;
${shaderHelper.mainStart(WG)}
- let currentImageNumber = global_idx / ${WG} / C;
- let currentChannelNumber = (global_idx / ${WG}) % C;
+ let currentImageNumber = global_idx / ${WG} / uniforms.C;
+ let currentChannelNumber = (global_idx / ${WG}) % uniforms.C;
let wgId = global_idx % ${WG};
- let wgOffset = wgId * ${wgSize};
- if (wgOffset >= H) {
+ let wgOffset = wgId * uniforms.wg_size;
+ if (wgOffset >= uniforms.H) {
return;
}
- let wgMax = min(wgOffset + ${wgSize}, H);
+ let wgMax = min(wgOffset + uniforms.wg_size, uniforms.H);
- let offset = currentImageNumber * imageSize + currentChannelNumber;
+ let offset = currentImageNumber * uniforms.image_size + currentChannelNumber;
var sum = ${fillVector('f32', components)};
var squaredSum = ${fillVector('f32', components)};
for (var i: u32 = wgOffset; i < wgMax; i++) {
- let value = ${sumCastType}(input[offset + i * C]);
+ let value = ${sumCastType}(input[offset + i * uniforms.C]);
sum += value;
squaredSum += value * value;
}
output[global_idx] = ${setOutputValue('sum', 'squaredSum')};
}`;
+ };
const meanValues = context.compute(
{
name: 'InstanceNormComputeMean',
- shaderCache: {hint: JSON.stringify({components, n, h, c})},
+ shaderCache: {hint: `${components}`, inputDependencies: meanInputDependencies},
getRunData: () => ({
outputs: [
{dims: [n, c, WG, 2], dataType: DataType.float},
],
dispatchGroup: {x: n * c / components},
+ programUniforms: meanProgramUniforms
}),
getShaderSource: getMeanShaderSource,
},
{inputs: [input], outputs: [-1]})[0];
- const getShaderSource = (shaderHelper: ShaderHelper) => `
- const H: u32 = ${h};
- const C: u32 = ${c / components};
- const imageSize: u32 = ${WG * c / components};
- const epsilon: f32 = ${epsilon};
+ const programUniforms: ProgramUniform[] = [
+ {type: 'uint32', data: unitsOfWork}, {type: 'uint32', data: h},
+ {type: 'uint32', data: Math.floor(c / components)}, {type: 'uint32', data: Math.floor(WG * c / components)}
+ ];
+ const inputDependencies: ProgramInputTensorInfoDependency[] = ['type', 'type', 'type'];
+ const getShaderSource = (shaderHelper: ShaderHelper) => {
+ const scaleHelper = inputVariable('scale', scale.dataType, scale.dims, components);
+ const biasHelper = inputVariable('bias', bias.dataType, bias.dims, components);
+ return `
@group(0) @binding(0) var input : array<${outputType}>;
@group(0) @binding(1) var scale : array<${scaleHelper.type.storage}>;
@group(0) @binding(2) var bias : array<${biasHelper.type.storage}>;
@group(0) @binding(3) var output : array<${outputType}>;
+ struct Uniforms {units_of_work : u32, H: u32, C : u32, image_size : u32};
+ @group(0) @binding(4) var uniforms: Uniforms;
${shaderHelper.mainStart()}
- ${shaderHelper.guardAgainstOutOfBoundsWorkgroupSizes(unitsOfWork)}
- let currentImageNumber = global_idx / C;
- let currentChannelNumber = global_idx % C;
+ ${shaderHelper.guardAgainstOutOfBoundsWorkgroupSizes('uniforms.units_of_work')}
+ let currentImageNumber = global_idx / uniforms.C;
+ let currentChannelNumber = global_idx % uniforms.C;
- let offset = currentImageNumber * imageSize;
+ let offset = currentImageNumber * uniforms.image_size;
var sum = ${fillVector('f32', components)};
var squaredSum = ${fillVector('f32', components)};
for (var i: u32 = 0; i < ${WG}; i++) {
@@ -199,24 +210,26 @@ const computeMean =
sum += value[0];
squaredSum += value[1];
}
- sum = sum / f32(H);
- squaredSum = squaredSum / f32(H);
- let invStdDev = 1 / sqrt(squaredSum - sum * sum + epsilon);
+ sum = sum / f32(uniforms.H);
+ squaredSum = squaredSum / f32(uniforms.H);
+ let invStdDev = 1 / sqrt(squaredSum - sum * sum + f32(${epsilon}));
let channelScale = invStdDev * ${sumCastType}(scale[currentChannelNumber]);
let channelShift = ${sumCastType}(bias[currentChannelNumber]) - sum * channelScale;
output[global_idx] = ${setOutputValue('channelScale', 'channelShift')};
}`;
-
+ };
return context.compute(
{
name: 'InstanceNormComputeChannelScaleShift',
- shaderCache: {hint: JSON.stringify({components, n, h, c, epsilon})},
+ // TODO: use epsilon as uniform. Currently epsilon as uniform fails test_instancenorm_epsilon.
+ shaderCache: {hint: `${components};${epsilon}`, inputDependencies},
getRunData: () => ({
outputs: [
{dims: [n, c, 2], dataType: DataType.float},
],
dispatchGroup: {x: Math.ceil(unitsOfWork / 64 /* workgroup size */)},
+ programUniforms
}),
getShaderSource,
},
@@ -230,50 +243,51 @@ const createInstanceNormNHWCProgramInfo =
const N = xShape[0];
const C = xShape[xShape.length - 1];
const H = ShapeUtil.sizeFromDimension(xShape, 1) / C;
-
const components = getMaxComponents(C);
const outputSize = ShapeUtil.size(outputShape) / components;
- const inputHelper = inputVariable('input', inputs[0].dataType, inputs[0].dims, components);
- const outputHelper = outputVariable('output', inputs[0].dataType, outputShape, components);
-
- const dataType = tensorTypeToWsglStorageType(inputs[0].dataType);
- const scaleType = components === 1 ? 'vec2f' : `mat2x${components}f`;
- const scaleCastType = components === 1 ? dataType : `vec${components}<${dataType}>`;
+ const programUniforms: ProgramUniform[] =
+ [{type: 'uint32', data: H}, {type: 'uint32', data: Math.floor(C / components)}];
+ const inputDependencies: ProgramInputTensorInfoDependency[] = ['type', 'type'];
// first compute mean
const channelScaleShift = computeMean(context, inputs[0], inputs[1], inputs[2], N, H, C, attributes.epsilon);
+ const getShaderSource = (shaderHelper: ShaderHelper) => {
+ const dataType = tensorTypeToWsglStorageType(inputs[0].dataType);
+ const scaleType = components === 1 ? 'vec2f' : `mat2x${components}f`;
+ const scaleCastType = components === 1 ? dataType : `vec${components}<${dataType}>`;
- const getShaderSource = (shaderHelper: ShaderHelper) => `
- const H: u32 = ${H};
- const C: u32 = ${C / components};
+ const inputHelper = inputVariable('input', inputs[0].dataType, inputs[0].dims, components);
+ const outputHelper = outputVariable('output', inputs[0].dataType, outputShape, components);
+ return `
@group(0) @binding(0) var input : array<${inputHelper.type.storage}>;
@group(0) @binding(1) var scaleInput : array<${scaleType}>;
@group(0) @binding(2) var output : array<${outputHelper.type.storage}>;
+ struct Uniforms {H: u32, C : u32};
+ @group(0) @binding(3) var uniforms: Uniforms;
${shaderHelper.mainStart()}
- let currentImageNumber = global_idx / (C * H);
- let currentChannelNumber = global_idx % C;
+ let currentImageNumber = global_idx / (uniforms.C * uniforms.H);
+ let currentChannelNumber = global_idx % uniforms.C;
- let scaleOffset = currentImageNumber * C + currentChannelNumber;
+ let scaleOffset = currentImageNumber * uniforms.C + currentChannelNumber;
let scale = scaleInput[scaleOffset];
output[global_idx] = fma(input[global_idx], ${scaleCastType}(scale[0]), ${scaleCastType}(scale[1]));
}`;
+ };
context.compute(
{
- name: 'InstanceNormalization',
- shaderCache: {hint: `${attributes.cacheKey}`},
+ name: 'InstanceNormalizationNHWC',
+ shaderCache: {hint: `${components}`, inputDependencies},
getRunData: () => ({
outputs: [{dims: outputShape, dataType: inputs[0].dataType}],
- dispatchGroup: {x: Math.ceil(outputSize / 64 /* workgroup size */)}
+ dispatchGroup: {x: Math.ceil(outputSize / 64 /* workgroup size */)},
+ programUniforms
}),
getShaderSource,
},
{inputs: [inputs[0], channelScaleShift]});
};
-export const parseInstanceNormAttributes = (attributes: InstanceNormAttributes): InstanceNormAttributes =>
- createAttributeWithCacheKey({epsilon: attributes.epsilon, format: attributes.format});
-
export const instanceNorm = (context: ComputeContext, attributes: InstanceNormAttributes): void => {
if (attributes.format === 'NHWC') {
createInstanceNormNHWCProgramInfo(context, context.inputs, attributes);
diff --git a/js/web/lib/wasm/jsep/webgpu/ops/layer-norm.ts b/js/web/lib/wasm/jsep/webgpu/ops/layer-norm.ts
index 8a9eeecf2c68d..bc446079faf8f 100644
--- a/js/web/lib/wasm/jsep/webgpu/ops/layer-norm.ts
+++ b/js/web/lib/wasm/jsep/webgpu/ops/layer-norm.ts
@@ -4,12 +4,11 @@
import {DataType} from '../../../wasm-common';
import {TensorView} from '../../tensor-view';
import {ShapeUtil} from '../../util';
-import {AttributeWithCacheKey, createAttributeWithCacheKey} from '../attribute-with-cache-key';
-import {ComputeContext, ProgramInfo} from '../types';
+import {ComputeContext, ProgramInfo, ProgramInputTensorInfoDependency, ProgramUniform} from '../types';
-import {castToF32, fillVector, getMaxComponents, inputVariable, outputVariable, ShaderHelper, sumVector, tensorTypeToWsglStorageType,} from './common';
+import {castToF32, fillVector, getMaxComponents, inputVariable, outputVariable, ShaderHelper, sumVector, tensorTypeToWsglStorageType, UniformsArrayType,} from './common';
-export interface LayerNormAttributes extends AttributeWithCacheKey {
+interface LayerNormAttributes {
axis: number;
epsilon: number;
}
@@ -39,7 +38,7 @@ const createLayerNormProgramInfo =
Got scale size of ${scaleSize} and bias size of ${biasSize}`);
}
- const meanInvStdDevDim = [];
+ const meanInvStdDevDim: number[] = [];
for (let i = 0; i < xShape.length; ++i) {
if (i < axis) {
meanInvStdDevDim.push(xShape[i]);
@@ -47,50 +46,57 @@ const createLayerNormProgramInfo =
meanInvStdDevDim.push(1);
}
}
-
const components = getMaxComponents(normSize);
- const dataType = tensorTypeToWsglStorageType(inputs[0].dataType);
- const variables = [
- inputVariable('x', inputs[0].dataType, inputs[0].dims, components),
- inputVariable('scale', scale.dataType, scale.dims, components),
+ const inputDependencies: ProgramInputTensorInfoDependency[] = ['type', 'type'];
+ const programUniforms: ProgramUniform[] = [
+ {type: 'uint32', data: normCount}, {type: 'float32', data: normSize},
+ {type: 'uint32', data: Math.floor(normSize / components)}, {type: 'float32', data: attributes.epsilon}
];
if (bias) {
- variables.push(inputVariable('bias', bias.dataType, bias.dims, components));
+ inputDependencies.push('type');
}
- variables.push(outputVariable('output', inputs[0].dataType, outputShape, components));
-
const hasMeanDataOutput = outputCount > 1;
const hasInvStdOutput = outputCount > 2;
- if (hasMeanDataOutput) {
- variables.push(outputVariable('meanDataOutput', DataType.float, meanInvStdDevDim));
- }
- if (hasInvStdOutput) {
- variables.push(outputVariable('invStdOutput', DataType.float, meanInvStdDevDim));
- }
-
- const getShaderSource = (shaderHelper: ShaderHelper) => `
- const normSize: f32 = ${normSize};
- const normSizeVectorized: u32 = ${normSize / components};
- const epsilon: f32 = ${attributes.epsilon};
+ const getShaderSource = (shaderHelper: ShaderHelper) => {
+ const dataType = tensorTypeToWsglStorageType(inputs[0].dataType);
+ const variables = [
+ inputVariable('x', inputs[0].dataType, inputs[0].dims, components),
+ inputVariable('scale', scale.dataType, scale.dims, components),
+ ];
+ if (bias) {
+ variables.push(inputVariable('bias', bias.dataType, bias.dims, components));
+ }
+ variables.push(outputVariable('output', inputs[0].dataType, outputShape, components));
+ if (hasMeanDataOutput) {
+ variables.push(outputVariable('mean_data_output', DataType.float, meanInvStdDevDim));
+ }
+ if (hasInvStdOutput) {
+ variables.push(outputVariable('inv_std_output', DataType.float, meanInvStdDevDim));
+ }
- ${shaderHelper.declareVariables(...variables)}
+ const uniforms: UniformsArrayType = [
+ {name: 'norm_count', type: 'u32'}, {name: 'norm_size', type: 'f32'},
+ {name: 'norm_size_vectorized', type: 'u32'}, {name: 'epsilon', type: 'f32'}
+ ];
+ return `
+ ${shaderHelper.registerUniforms(uniforms).declareVariables(...variables)}
${shaderHelper.mainStart()}
- ${shaderHelper.guardAgainstOutOfBoundsWorkgroupSizes(normCount)}
- let offset = global_idx * normSizeVectorized;
+ ${shaderHelper.guardAgainstOutOfBoundsWorkgroupSizes('uniforms.norm_count')}
+ let offset = global_idx * uniforms.norm_size_vectorized;
var meanVector = ${fillVector('f32', components)};
var meanSquareVector = ${fillVector('f32', components)};
- for (var h: u32 = 0u; h < normSizeVectorized; h++) {
+ for (var h: u32 = 0u; h < uniforms.norm_size_vectorized; h++) {
let value = ${castToF32(dataType, components, 'x[h + offset]')};
meanVector += value;
meanSquareVector += value * value;
}
- let mean = ${sumVector('meanVector', components)} / normSize;
- let meanSquare = sqrt(${sumVector('meanSquareVector', components)}
- / normSize - mean * mean + epsilon);
+ let mean = ${sumVector('meanVector', components)} / uniforms.norm_size;
+ let meanSquare = sqrt(${sumVector('meanSquareVector', components)}
+ / uniforms.norm_size - mean * mean + uniforms.epsilon);
- for (var j: u32 = 0; j < normSizeVectorized; j++) {
+ for (var j: u32 = 0; j < uniforms.norm_size_vectorized; j++) {
let f32input = ${castToF32(dataType, components, 'x[j + offset]')};
let f32scale = ${castToF32(dataType, components, 'scale[j]')};
output[j + offset] = ${variables[0].type.value}((f32input - mean) / meanSquare * f32scale
@@ -98,9 +104,10 @@ const createLayerNormProgramInfo =
);
}
- ${hasMeanDataOutput ? 'meanDataOutput[global_idx] = mean' : ''};
- ${hasInvStdOutput ? 'invStdOutput[global_idx] = 1 / meanSquare' : ''};
+ ${hasMeanDataOutput ? 'mean_data_output[global_idx] = mean' : ''};
+ ${hasInvStdOutput ? 'inv_std_output[global_idx] = 1 / meanSquare' : ''};
}`;
+ };
const outputs = [{dims: outputShape, dataType: inputs[0].dataType}];
if (hasMeanDataOutput) {
outputs.push({dims: meanInvStdDevDim, dataType: DataType.float});
@@ -111,15 +118,13 @@ const createLayerNormProgramInfo =
return {
name: 'LayerNormalization',
- shaderCache: {hint: `${attributes.cacheKey}|${outputCount}|${inputs.length}`},
- getRunData: () => ({outputs, dispatchGroup: {x: Math.ceil(normCount / 64 /* workgroup size */)}}),
+ shaderCache: {hint: `${components};${outputCount}`, inputDependencies},
+ getRunData: () =>
+ ({outputs, dispatchGroup: {x: Math.ceil(normCount / 64 /* workgroup size */)}, programUniforms}),
getShaderSource,
};
};
-export const parseLayerNormAttributes = (attributes: LayerNormAttributes): LayerNormAttributes =>
- createAttributeWithCacheKey({axis: attributes.axis, epsilon: attributes.epsilon});
-
export const layerNorm = (context: ComputeContext, attributes: LayerNormAttributes): void => {
validateInputs(context.inputs);
context.compute(createLayerNormProgramInfo(context.inputs, attributes, context.outputCount));
diff --git a/js/web/lib/wasm/jsep/webgpu/ops/multi-head-attentiion.ts b/js/web/lib/wasm/jsep/webgpu/ops/multi-head-attentiion.ts
index b7726a36bcaad..6d22e3780efd9 100644
--- a/js/web/lib/wasm/jsep/webgpu/ops/multi-head-attentiion.ts
+++ b/js/web/lib/wasm/jsep/webgpu/ops/multi-head-attentiion.ts
@@ -4,10 +4,10 @@
import {TensorView} from '../../tensor-view';
import {ShapeUtil} from '../../util';
import {createAttributeWithCacheKey} from '../attribute-with-cache-key';
-import {ComputeContext, GpuDataType} from '../types';
+import {ComputeContext, GpuDataType, ProgramUniform} from '../types';
import {applyAttention, AttentionAttrs, AttentionMaskType, AttentionParameters, AttentionQkvFormat} from './attention';
-import {ShaderHelper, tensorTypeToWsglStorageType} from './common';
+import {inputVariable, outputVariable, ShaderHelper, UniformsArrayType} from './common';
import {createTransposeProgramInfo, TransposeAttributes} from './transpose';
const validateInputs = (inputs: readonly TensorView[], attributes: AttentionAttrs): AttentionParameters => {
@@ -228,7 +228,6 @@ const validateInputs = (inputs: readonly TensorView[], attributes: AttentionAttr
};
};
-
export const parseMultiHeadAttentionAttributes = (attributes: AttentionAttrs): AttentionAttrs =>
createAttributeWithCacheKey({...attributes});
@@ -239,30 +238,35 @@ const addBiasTranspose =
hiddenSize: number, biasOffset: number) => {
const outputShape = [batchSize, sequenceLength, hiddenSize];
const outputSize = ShapeUtil.size(outputShape);
-
- const dataType = tensorTypeToWsglStorageType(qkv.dataType);
- const getShaderSource = (shaderHelper: ShaderHelper) => `
- const biasOffset = ${biasOffset}u;
- const hiddenSize = ${hiddenSize}u;
-
- @group(0) @binding(0) var qkv: array<${dataType}>;
- @group(0) @binding(1) var bias: array<${dataType}>;
- @group(0) @binding(2) var qkv_with_bias: array<${dataType}>;
-
+ const programUniforms: ProgramUniform[] =
+ [{type: 'uint32', data: outputSize}, {type: 'uint32', data: biasOffset}, {type: 'uint32', data: hiddenSize}];
+
+ const getShaderSource = (shaderHelper: ShaderHelper) => {
+ const output = outputVariable('qkv_with_bias', qkv.dataType, outputShape);
+ const qkvInput = inputVariable('qkv', qkv.dataType, outputShape);
+ const biasInput = inputVariable('bias', bias.dataType, outputShape);
+
+ const uniforms: UniformsArrayType = [
+ {name: 'output_size', type: 'u32'}, {name: 'bias_offset', type: 'u32'}, {name: 'hidden_size', type: 'u32'}
+ ];
+ return `
+ ${shaderHelper.registerUniforms(uniforms).declareVariables(qkvInput, biasInput, output)}
${shaderHelper.mainStart()}
- ${shaderHelper.guardAgainstOutOfBoundsWorkgroupSizes(outputSize)}
- let biasOffsetIdx = (global_idx % hiddenSize) + biasOffset;
+ ${shaderHelper.guardAgainstOutOfBoundsWorkgroupSizes('uniforms.output_size')}
+ let bias_offset_idx = (global_idx % uniforms.hidden_size) + uniforms.bias_offset;
- qkv_with_bias[global_idx] = qkv[global_idx] + bias[biasOffsetIdx];
+ qkv_with_bias[global_idx] = qkv[global_idx] + bias[bias_offset_idx];
}`;
+ };
return context.compute(
{
name: 'MultiHeadAttentionAddBias',
- shaderCache: {hint: JSON.stringify({batchSize, sequenceLength, hiddenSize, biasOffset})},
+ shaderCache: {inputDependencies: ['type', 'type']},
getRunData: () => ({
outputs: [{dims: outputShape, dataType: qkv.dataType, gpuDataType: GpuDataType.default}],
dispatchGroup: {x: Math.ceil(outputSize / 64 /* workgroup size */)},
+ programUniforms
}),
getShaderSource,
},
diff --git a/js/web/lib/wasm/jsep/webgpu/ops/pad.ts b/js/web/lib/wasm/jsep/webgpu/ops/pad.ts
index 18859e253aa02..eca3fa7d944bb 100644
--- a/js/web/lib/wasm/jsep/webgpu/ops/pad.ts
+++ b/js/web/lib/wasm/jsep/webgpu/ops/pad.ts
@@ -1,15 +1,14 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
-import {DataType} from '../../../wasm-common';
+import {DataType, tensorDataTypeEnumToString} from '../../../wasm-common';
import {TensorView} from '../../tensor-view';
import {ShapeUtil} from '../../util';
-import {AttributeWithCacheKey, createAttributeWithCacheKey} from '../attribute-with-cache-key';
-import {ComputeContext, ProgramInfo} from '../types';
+import {ComputeContext, ProgramInfo, ProgramInputTensorInfoDependency, ProgramUniform} from '../types';
-import {IndicesHelper, inputVariable, outputVariable, ShaderHelper} from './common';
+import {createTensorShapeVariables, getElementAt, IndicesHelper, inputVariable, outputVariable, ShaderHelper, UniformDataElementType, UniformsArrayType} from './common';
-export interface PadAttributes extends AttributeWithCacheKey {
+interface PadAttributes {
// 0-constant, 1-reflect, 2-edge, 3-wrap
readonly mode: number;
readonly value: number;
@@ -35,27 +34,23 @@ const validateInputs = (inputs: readonly TensorView[]): void => {
}
};
-const getPadConstant =
- (output: IndicesHelper, inputDims: readonly number[], inputStrides: readonly number[], pads: number[],
- dataType: string, constantValue: number): string => {
- const inputRank = inputDims.length;
-
- let block = '';
- for (let i = inputRank - 1; i >= 0; --i) {
- block += `
- k = i32(${output.indicesGet('indices', i)}) - ${pads[i]};
+const getPadConstant = (output: IndicesHelper, inputRank: number, padsLength: number): string => {
+ let block = '';
+ for (let i = inputRank - 1; i >= 0; --i) {
+ block += `
+ k = i32(${output.indicesGet('indices', i)}) - ${getElementAt('uniforms.pads', i, padsLength)};
if (k < 0) {
break;
}
- if (k >= ${inputDims[i]}) {
+ if (k >= i32(${getElementAt('uniforms.x_shape', i, inputRank)})) {
break;
}
- offset += k * ${inputStrides[i]};
+ offset += k * i32(${getElementAt('uniforms.x_strides', i, inputRank)});
`;
- }
+ }
- return `
- value = ${dataType}(${constantValue});
+ return `
+ value = ${output.type.value}(uniforms.constant_value);
for (var i = 0; i < 1; i++) {
var offset = 0;
var k = 0;
@@ -63,143 +58,143 @@ const getPadConstant =
value = x[offset];
}
`;
- };
-
-const getPadReflect =
- (output: IndicesHelper, inputDims: readonly number[], inputStrides: readonly number[], pads: number[]): string => {
- const inputRank = inputDims.length;
+};
- let block = '';
- for (let i = inputRank - 1; i >= 0; --i) {
- block += `
- k = i32(${output.indicesGet('indices', i)}) - ${pads[i]};
+const getPadReflect = (output: IndicesHelper, inputRank: number, padsLength: number): string => {
+ let block = '';
+ for (let i = inputRank - 1; i >= 0; --i) {
+ block += `
+ k = i32(${output.indicesGet('indices', i)}) - ${getElementAt('uniforms.pads', i, padsLength)};
if (k < 0) {
k = -k;
}
{
- let _2n_1 = ${2 * (inputDims[i] - 1)};
+ let _2n_1 = 2 * (i32(${getElementAt('uniforms.x_shape', i, inputRank)}) - 1);
k = k % _2n_1;
- if(k >= ${inputDims[i]}) {
+ if(k >= i32(${getElementAt('uniforms.x_shape', i, inputRank)})) {
k = _2n_1 - k;
}
}
- offset += k * ${inputStrides[i]};
+ offset += k * i32(${getElementAt('uniforms.x_strides', i, inputRank)});
`;
- }
+ }
- return `
+ return `
var offset = 0;
var k = 0;
${block}
value = x[offset];
`;
- };
-
-const getPadEdge =
- (output: IndicesHelper, inputDims: readonly number[], inputStrides: readonly number[], pads: number[]): string => {
- const inputRank = inputDims.length;
+};
- let block = '';
- for (let i = inputRank - 1; i >= 0; --i) {
- block += `
- k = i32(${output.indicesGet('indices', i)}) - ${pads[i]};
+const getPadEdge = (output: IndicesHelper, inputRank: number, padsLength: number): string => {
+ let block = '';
+ for (let i = inputRank - 1; i >= 0; --i) {
+ block += `
+ k = i32(${output.indicesGet('indices', i)}) - ${getElementAt('uniforms.pads', i, padsLength)};
if (k < 0) {
k = 0;
}
- if (k >= ${inputDims[i]}) {
- k = ${inputDims[i] - 1};
+ if (k >= i32(${getElementAt('uniforms.x_shape', i, inputRank)})) {
+ k = i32(${getElementAt('uniforms.x_shape', i, inputRank)}) - 1;
}
- offset += k * ${inputStrides[i]};
+ offset += k * i32(${getElementAt('uniforms.x_strides', i, inputRank)});
`;
- }
+ }
- return `
+ return `
var offset = 0;
var k = 0;
${block}
value = x[offset];
`;
- };
-
-const getPadWrap =
- (output: IndicesHelper, inputDims: readonly number[], inputStrides: readonly number[], pads: number[]): string => {
- const inputRank = inputDims.length;
+};
- let block = '';
- for (let i = inputRank - 1; i >= 0; --i) {
- block += `
- k = i32(${output.indicesGet('indices', i)}) - ${pads[i]};
+const getPadWrap = (output: IndicesHelper, inputRank: number, padsLength: number): string => {
+ let block = '';
+ for (let i = inputRank - 1; i >= 0; --i) {
+ block += `
+ k = i32(${output.indicesGet('indices', i)}) - ${getElementAt('uniforms.pads', i, padsLength)};
if (k < 0) {
- k += ${inputDims[i]};
+ k += i32(${getElementAt('uniforms.x_shape', i, inputRank)});
}
- if (k >= ${inputDims[i]}) {
- k -= ${inputDims[i]};
+ if (k >= i32(${getElementAt('uniforms.x_shape', i, inputRank)})) {
+ k -= i32(${getElementAt('uniforms.x_shape', i, inputRank)});
}
- offset += k * ${inputStrides[i]};
+ offset += k * i32(${getElementAt('uniforms.x_strides', i, inputRank)});
`;
- }
+ }
- return `
+ return `
var offset = 0;
var k = 0;
${block}
value = x[offset];
`;
- };
-
-const getPadSnippet =
- (output: IndicesHelper, inputDims: readonly number[], inputStrides: readonly number[], attributes: PadAttributes,
- dataType: string): string => {
- switch (attributes.mode) {
- case 0:
- return getPadConstant(output, inputDims, inputStrides, attributes.pads, dataType, attributes.value);
- case 1:
- return getPadReflect(output, inputDims, inputStrides, attributes.pads);
- case 2:
- return getPadEdge(output, inputDims, inputStrides, attributes.pads);
- case 3:
- return getPadWrap(output, inputDims, inputStrides, attributes.pads);
- default:
- throw new Error('Invalid mode');
- }
- };
-
-const generatePadCode =
- (shaderHelper: ShaderHelper, inputs: readonly TensorView[], attributes: PadAttributes, dataType: string):
- string => {
- const inputDims = inputs[0].dims;
- const outputDims = ShapeUtil.padShape(inputDims.slice(), attributes.pads);
- const outputSize = ShapeUtil.size(outputDims);
- const inputStrides = ShapeUtil.computeStrides(inputDims);
-
- const output = outputVariable('output', inputs[0].dataType, outputDims);
- const input = inputVariable('x', inputs[0].dataType, inputDims);
-
- const padSnippet = getPadSnippet(output, inputDims, inputStrides, attributes, dataType);
- const padCode = `
- ${shaderHelper.declareVariables(input, output)}
- ${shaderHelper.mainStart()}
- ${shaderHelper.guardAgainstOutOfBoundsWorkgroupSizes(outputSize)}
-
- let indices = ${output.offsetToIndices('global_idx')};
-
- var value = ${dataType}(0);
- ${padSnippet}
- output[global_idx] = value;
- }`;
- return padCode;
- };
+};
+
+const getPadSnippet = (output: IndicesHelper, inputRank: number, attributes: PadAttributes): string => {
+ switch (attributes.mode) {
+ case 0:
+ return getPadConstant(output, inputRank, attributes.pads.length);
+ case 1:
+ return getPadReflect(output, inputRank, attributes.pads.length);
+ case 2:
+ return getPadEdge(output, inputRank, attributes.pads.length);
+ case 3:
+ return getPadWrap(output, inputRank, attributes.pads.length);
+ default:
+ throw new Error('Invalid mode');
+ }
+};
const createPadProgramInfo = (inputs: readonly TensorView[], attributes: PadAttributes): ProgramInfo => {
const outputShape = ShapeUtil.padShape(inputs[0].dims.slice(), attributes.pads);
+ const inputDims = inputs[0].dims;
+ const outputSize = ShapeUtil.size(outputShape);
+ const programUniforms: ProgramUniform[] =
+ [{type: 'uint32', data: outputSize}, {type: 'uint32', data: attributes.pads}];
+ if (attributes.mode === 0) {
+ const tensorDataType = tensorDataTypeEnumToString(inputs[0].dataType) as ProgramUniform['type'];
+ programUniforms.push({type: tensorDataType, data: attributes.value});
+ }
+
+ programUniforms.push(...createTensorShapeVariables(inputs[0].dims), ...createTensorShapeVariables(outputShape));
+ const inputDependencies: ProgramInputTensorInfoDependency[] = ['rank'];
+
+ const getShaderSource = (shaderHelper: ShaderHelper) => {
+ const output = outputVariable('output', inputs[0].dataType, outputShape.length);
+ const input = inputVariable('x', inputs[0].dataType, inputDims.length);
+ const dataType = input.type.value;
+ const padSnippet = getPadSnippet(output, inputDims.length, attributes);
+ const uniforms: UniformsArrayType =
+ [{name: 'output_size', type: 'u32'}, {name: 'pads', type: 'i32', length: attributes.pads.length}];
+ if (attributes.mode === 0) {
+ uniforms.push({name: 'constant_value', type: dataType as UniformDataElementType});
+ }
+
+ return `
+ ${shaderHelper.registerUniforms(uniforms).declareVariables(input, output)}
+ ${shaderHelper.mainStart()}
+ ${shaderHelper.guardAgainstOutOfBoundsWorkgroupSizes('uniforms.output_size')}
+
+ let indices = ${output.offsetToIndices('global_idx')};
+
+ var value = ${dataType}(0);
+ ${padSnippet}
+ output[global_idx] = value;
+ }`;
+ };
+
return {
name: 'Pad',
- shaderCache: {hint: attributes.cacheKey},
+ shaderCache: {hint: `${attributes.mode}`, inputDependencies},
getRunData: () => ({
outputs: [{dims: outputShape, dataType: inputs[0].dataType}],
- dispatchGroup: {x: Math.ceil(ShapeUtil.size(outputShape) / 64 /* workgroup size */)}
+ dispatchGroup: {x: Math.ceil(ShapeUtil.size(outputShape) / 64 /* workgroup size */)},
+ programUniforms
}),
- getShaderSource: shaderHelper => generatePadCode(shaderHelper, inputs, attributes, 'f32'),
+ getShaderSource,
};
};
@@ -223,7 +218,7 @@ const createPadAttributesFromInputs = (inputs: readonly TensorView[], attributes
const pads: number[] = [];
updatePads.forEach(v => pads.push(v));
- return createAttributeWithCacheKey({mode: attributes.mode, value, pads});
+ return {mode: attributes.mode, value, pads};
} else {
return attributes;
}
@@ -234,10 +229,3 @@ export const pad = (context: ComputeContext, attributes: PadAttributes): void =>
const updatedAttributes = createPadAttributesFromInputs(context.inputs, attributes);
context.compute(createPadProgramInfo(context.inputs, updatedAttributes), {inputs: [0]});
};
-
-export const parsePadAttributes = (attributes: Record): PadAttributes => {
- const mode = attributes.mode as number;
- const value = attributes.value as number;
- const pads = attributes.pads as number[];
- return createAttributeWithCacheKey({mode, value, pads});
-};
diff --git a/js/web/lib/wasm/jsep/webgpu/ops/range.ts b/js/web/lib/wasm/jsep/webgpu/ops/range.ts
index 9cf66111bf707..ed04b0f94bc57 100644
--- a/js/web/lib/wasm/jsep/webgpu/ops/range.ts
+++ b/js/web/lib/wasm/jsep/webgpu/ops/range.ts
@@ -3,10 +3,10 @@
import {env} from 'onnxruntime-common';
-import {DataType} from '../../../wasm-common';
-import {ComputeContext, ProgramInfo} from '../types';
+import {DataType, tensorDataTypeEnumToString} from '../../../wasm-common';
+import {ComputeContext, ProgramInfo, ProgramUniform} from '../types';
-import {outputVariable, ShaderHelper} from './common';
+import {createTensorShapeVariables, outputVariable, ShaderHelper, UniformDataElementType, UniformsArrayType} from './common';
const validateInputsContent = (start: number, limit: number, delta: number): void => {
const sameStartLimit = start === limit;
@@ -22,23 +22,36 @@ const createRangeProgramInfo = (start: number, limit: number, delta: number, dat
const numElements = Math.abs(Math.ceil((limit - start) / delta));
const outputShape: number[] = [numElements];
const outputSize = numElements;
+ const tensorDataType = tensorDataTypeEnumToString(dataType) as ProgramUniform['type'];
+ const programUniforms: ProgramUniform[] = [
+ {type: 'uint32', data: outputSize}, {type: tensorDataType, data: start}, {type: tensorDataType, data: delta},
+ ...createTensorShapeVariables(outputShape)
+ ];
- const output = outputVariable('output', dataType, outputShape);
- const wgslType = output.type.storage;
-
- const getShaderSource = (shaderHelper: ShaderHelper) => `
- ${shaderHelper.declareVariables(output)}
+ const getShaderSource = (shaderHelper: ShaderHelper) => {
+ const output = outputVariable('output', dataType, outputShape.length);
+ const wgslType = output.type.value;
+ const uniforms: UniformsArrayType = [
+ {name: 'outputSize', type: 'u32'}, {name: 'start', type: wgslType as UniformDataElementType},
+ {name: 'delta', type: wgslType as UniformDataElementType}
+ ];
+ return `
+ ${shaderHelper.registerUniforms(uniforms).declareVariables(output)}
${shaderHelper.mainStart()}
- ${shaderHelper.guardAgainstOutOfBoundsWorkgroupSizes(outputSize)}
- output[global_idx] = ${wgslType}(${start}) + ${wgslType}(global_idx) * ${wgslType}(${delta});
+ ${shaderHelper.guardAgainstOutOfBoundsWorkgroupSizes('uniforms.outputSize')}
+ output[global_idx] = uniforms.start + ${wgslType}(global_idx) * uniforms.delta;
}`;
+ };
+
return {
name: 'Range',
- shaderCache: {hint: [start, limit, delta].map(x => x.toString()).join('_')},
+ shaderCache: {hint: `${dataType}`},
getShaderSource,
- getRunData: () => (
- {outputs: [{dims: outputShape, dataType}],
- dispatchGroup: {x: Math.ceil(outputSize / 64 /* workgroup size */)}})
+ getRunData: () => ({
+ outputs: [{dims: outputShape, dataType}],
+ dispatchGroup: {x: Math.ceil(outputSize / 64 /* workgroup size */)},
+ programUniforms
+ })
};
};
diff --git a/js/web/script/build.ts b/js/web/script/build.ts
index 5151f27582c1f..a52ac4454a5c1 100644
--- a/js/web/script/build.ts
+++ b/js/web/script/build.ts
@@ -44,6 +44,7 @@ const SOURCE_ROOT_FOLDER = path.join(__dirname, '../..'); // /js/
const DEFAULT_DEFINE = {
'BUILD_DEFS.DISABLE_WEBGL': 'false',
'BUILD_DEFS.DISABLE_WEBGPU': 'false',
+ 'BUILD_DEFS.DISABLE_WEBNN': 'false',
'BUILD_DEFS.DISABLE_WASM': 'false',
'BUILD_DEFS.DISABLE_WASM_PROXY': 'false',
'BUILD_DEFS.DISABLE_WASM_THREAD': 'false',
@@ -359,6 +360,7 @@ async function main() {
...DEFAULT_DEFINE,
'BUILD_DEFS.DISABLE_WEBGPU': 'true',
'BUILD_DEFS.DISABLE_WEBGL': 'true',
+ 'BUILD_DEFS.DISABLE_WEBNN': 'true',
'BUILD_DEFS.DISABLE_WASM_PROXY': 'true',
'BUILD_DEFS.DISABLE_WASM_THREAD': 'true',
},
@@ -367,10 +369,7 @@ async function main() {
if (BUNDLE_MODE === 'dev') {
// ort.all.js
- await addBuildTask(buildOrt({
- outputBundleName: 'ort.all',
- format: 'iife',
- }));
+ await addBuildTask(buildOrt({outputBundleName: 'ort.all', format: 'iife', define: {...DEFAULT_DEFINE}}));
}
if (BUNDLE_MODE === 'perf') {
@@ -394,7 +393,7 @@ async function main() {
// ort.webgpu[.min].js
await addAllWebBuildTasks({
outputBundleName: 'ort.webgpu',
- define: {...DEFAULT_DEFINE, 'BUILD_DEFS.DISABLE_WEBGL': 'true'},
+ define: {...DEFAULT_DEFINE, 'BUILD_DEFS.DISABLE_WEBGL': 'true', 'BUILD_DEFS.DISABLE_WEBNN': 'true'},
});
// ort.wasm[.min].js
await addAllWebBuildTasks({
@@ -404,7 +403,12 @@ async function main() {
// ort.webgl[.min].js
await addAllWebBuildTasks({
outputBundleName: 'ort.webgl',
- define: {...DEFAULT_DEFINE, 'BUILD_DEFS.DISABLE_WEBGPU': 'true', 'BUILD_DEFS.DISABLE_WASM': 'true'},
+ define: {
+ ...DEFAULT_DEFINE,
+ 'BUILD_DEFS.DISABLE_WEBGPU': 'true',
+ 'BUILD_DEFS.DISABLE_WASM': 'true',
+ 'BUILD_DEFS.DISABLE_WEBNN': 'true',
+ },
});
// ort.wasm-core[.min].js
await addAllWebBuildTasks({
@@ -413,6 +417,7 @@ async function main() {
...DEFAULT_DEFINE,
'BUILD_DEFS.DISABLE_WEBGPU': 'true',
'BUILD_DEFS.DISABLE_WEBGL': 'true',
+ 'BUILD_DEFS.DISABLE_WEBNN': 'true',
'BUILD_DEFS.DISABLE_WASM_PROXY': 'true',
'BUILD_DEFS.DISABLE_WASM_THREAD': 'true',
},
@@ -425,6 +430,7 @@ async function main() {
'BUILD_DEFS.DISABLE_TRAINING': 'false',
'BUILD_DEFS.DISABLE_WEBGPU': 'true',
'BUILD_DEFS.DISABLE_WEBGL': 'true',
+ 'BUILD_DEFS.DISABLE_WEBNN': 'true',
},
});
}
diff --git a/js/web/script/test-runner-cli-args.ts b/js/web/script/test-runner-cli-args.ts
index ee955ec8d4f17..fc74adfed1fee 100644
--- a/js/web/script/test-runner-cli-args.ts
+++ b/js/web/script/test-runner-cli-args.ts
@@ -79,6 +79,7 @@ Options:
--webgl-texture-cache-mode Set the WebGL texture cache mode (initializerOnly/full)
--webgl-texture-pack-mode Set the WebGL texture pack mode (true/false)
--webgpu-profiling-mode Set the WebGPU profiling mode (off/default)
+ --webnn-device-type Set the WebNN device type (cpu/gpu)
*** Browser Options ***
@@ -174,6 +175,7 @@ export interface TestRunnerCliArgs {
cudaFlags?: Record;
wasmOptions?: InferenceSession.WebAssemblyExecutionProviderOption;
webglOptions?: InferenceSession.WebGLExecutionProviderOption;
+ webnnOptions?: InferenceSession.WebNNExecutionProviderOption;
globalEnvFlags?: Test.Options['globalEnvFlags'];
noSandbox?: boolean;
chromiumFlags: string[];
@@ -335,6 +337,14 @@ function parseWebgpuFlags(args: minimist.ParsedArgs): Partial {
return {profilingMode, validateInputContent};
}
+function parseWebNNOptions(args: minimist.ParsedArgs): InferenceSession.WebNNExecutionProviderOption {
+ const deviceType = args['webnn-device-type'];
+ if (deviceType !== undefined && deviceType !== 'cpu' && deviceType !== 'gpu') {
+ throw new Error('Flag "webnn-device-type" is invalid');
+ }
+ return {name: 'webnn', deviceType};
+}
+
function parseGlobalEnvFlags(args: minimist.ParsedArgs): NonNullable {
const wasm = parseWasmFlags(args);
const webgl = parseWebglFlags(args);
@@ -449,6 +459,7 @@ export function parseTestRunnerCliArgs(cmdlineArgs: string[]): TestRunnerCliArgs
const wasmOptions = parseWasmOptions(args);
const webglOptions = parseWebglOptions(args);
+ const webnnOptions = parseWebNNOptions(args);
// Option: --no-sandbox
const noSandbox = !!args['no-sandbox'];
@@ -487,6 +498,7 @@ export function parseTestRunnerCliArgs(cmdlineArgs: string[]): TestRunnerCliArgs
fileCache,
cpuOptions,
webglOptions,
+ webnnOptions,
wasmOptions,
globalEnvFlags,
noSandbox,
diff --git a/js/web/script/test-runner-cli.ts b/js/web/script/test-runner-cli.ts
index 74a03290332a8..d56792c6e3595 100644
--- a/js/web/script/test-runner-cli.ts
+++ b/js/web/script/test-runner-cli.ts
@@ -165,6 +165,7 @@ async function main() {
debug: args.debug,
cpuOptions: args.cpuOptions,
webglOptions: args.webglOptions,
+ webnnOptions: args.webnnOptions,
wasmOptions: args.wasmOptions,
globalEnvFlags: args.globalEnvFlags
}
@@ -499,7 +500,7 @@ async function main() {
args.bundleMode === 'perf' ? 'perf' :
args.debug ? 'debug' :
'test',
- webgpu, webnn);
+ webgpu);
const karmaArgs = ['karma', 'start', `--browsers ${browser}`];
const chromiumFlags = ['--enable-features=SharedArrayBuffer', ...args.chromiumFlags];
if (args.debug) {
@@ -614,11 +615,10 @@ async function main() {
fs.writeJSONSync(path.join(TEST_ROOT, './testdata-config.json'), config);
}
- function getBrowserNameFromEnv(
- env: TestRunnerCliArgs['env'], mode: 'debug'|'perf'|'test', webgpu: boolean, webnn: boolean) {
+ function getBrowserNameFromEnv(env: TestRunnerCliArgs['env'], mode: 'debug'|'perf'|'test', webgpu: boolean) {
switch (env) {
case 'chrome':
- return selectChromeBrowser(mode, webgpu, webnn);
+ return selectChromeBrowser(mode, webgpu);
case 'edge':
return 'EdgeTest';
case 'firefox':
@@ -634,10 +634,8 @@ async function main() {
}
}
- function selectChromeBrowser(mode: 'debug'|'perf'|'test', webgpu: boolean, webnn: boolean) {
- if (webnn) {
- return 'ChromeCanaryTest';
- } else if (webgpu) {
+ function selectChromeBrowser(mode: 'debug'|'perf'|'test', webgpu: boolean) {
+ if (webgpu) {
return 'ChromeTest';
} else {
switch (mode) {
diff --git a/js/web/test/data/ops/conv.jsonc b/js/web/test/data/ops/conv.jsonc
index 2e8eaaba191d0..cc10df5864233 100644
--- a/js/web/test/data/ops/conv.jsonc
+++ b/js/web/test/data/ops/conv.jsonc
@@ -298,7 +298,157 @@
}
]
},
-
+ {
+ "name": "conv - vectorize group - A",
+ "operator": "Conv",
+ "inputShapeDefinitions": "rankOnly",
+ "opset": { "domain": "", "version": 17 },
+ "attributes": [
+ { "name": "kernel_shape", "data": [1, 1], "type": "ints" },
+ { "name": "group", "data": 2, "type": "int" }
+ ],
+ "cases": [
+ {
+ "name": "T[0]",
+ "inputs": [
+ {
+ "data": [0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0],
+ "dims": [1, 2, 3, 3],
+ "type": "float32"
+ },
+ {
+ "data": [1.0, 2.0],
+ "dims": [2, 1, 1, 1],
+ "type": "float32"
+ }
+ ],
+ "outputs": [
+ {
+ "data": [0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 18.0, 20.0, 22.0, 24.0, 26.0, 28.0, 30.0, 32.0, 34.0],
+ "dims": [1, 2, 3, 3],
+ "type": "float32"
+ }
+ ]
+ }
+ ]
+ },
+ {
+ "name": "conv - vectorize group - B",
+ "operator": "Conv",
+ "inputShapeDefinitions": "rankOnly",
+ "opset": { "domain": "", "version": 17 },
+ "attributes": [
+ { "name": "kernel_shape", "data": [2, 2], "type": "ints" },
+ { "name": "group", "data": 3, "type": "int" }
+ ],
+ "cases": [
+ {
+ "name": "T[0]",
+ "inputs": [
+ {
+ "data": [
+ 0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0, 18.0,
+ 19.0, 20.0, 21.0, 22.0, 23.0, 0, 0, 0
+ ],
+ "dims": [1, 3, 3, 3],
+ "type": "float32"
+ },
+ {
+ "data": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0],
+ "dims": [3, 1, 2, 2],
+ "type": "float32"
+ },
+ {
+ "data": [0.1, 0.2, 0.3],
+ "dims": [3],
+ "type": "float32"
+ }
+ ],
+ "outputs": [
+ {
+ "data": [27.1, 37.1, 57.1, 67.1, 293.2, 319.2, 371.2, 397.2, 847.3, 889.3, 409.3, 428.3],
+ "dims": [1, 3, 2, 2],
+ "type": "float32"
+ }
+ ]
+ }
+ ]
+ },
+ {
+ "name": "conv - vectorize group - C",
+ "operator": "Conv",
+ "inputShapeDefinitions": "rankOnly",
+ "opset": { "domain": "", "version": 17 },
+ "attributes": [
+ { "name": "kernel_shape", "data": [2, 2], "type": "ints" },
+ { "name": "group", "data": 3, "type": "int" }
+ ],
+ "cases": [
+ {
+ "name": "T[0]",
+ "inputs": [
+ {
+ "data": [
+ 0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0, 18.0,
+ 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, 31.0, 32.0, 33.0, 34.0, 35.0
+ ],
+ "dims": [1, 3, 3, 4],
+ "type": "float32"
+ },
+ {
+ "data": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0],
+ "dims": [3, 1, 2, 2],
+ "type": "float32"
+ }
+ ],
+ "outputs": [
+ {
+ "data": [34, 44, 54, 74, 84, 94, 386, 412, 438, 490, 516, 542, 1122, 1164, 1206, 1290, 1332, 1374],
+ "dims": [1, 3, 2, 3],
+ "type": "float32"
+ }
+ ]
+ }
+ ]
+ },
+ {
+ "name": "conv - vectorize group - D",
+ "operator": "Conv",
+ "inputShapeDefinitions": "rankOnly",
+ "opset": { "domain": "", "version": 17 },
+ "attributes": [
+ { "name": "kernel_shape", "data": [2, 2], "type": "ints" },
+ { "name": "group", "data": 3, "type": "int" },
+ { "name": "strides", "data": [2, 2], "type": "ints" }
+ ],
+ "cases": [
+ {
+ "name": "T[0] strides = [2, 2]",
+ "inputs": [
+ {
+ "data": [
+ 0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0, 18.0,
+ 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, 31.0, 32.0, 33.0, 34.0, 35.0
+ ],
+ "dims": [1, 3, 3, 4],
+ "type": "float32"
+ },
+ {
+ "data": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0],
+ "dims": [3, 1, 2, 2],
+ "type": "float32"
+ }
+ ],
+ "outputs": [
+ {
+ "data": [34, 54, 386, 438, 1122, 1206],
+ "dims": [1, 3, 1, 2],
+ "type": "float32"
+ }
+ ]
+ }
+ ]
+ },
{
"name": "conv - pointwise",
"operator": "Conv",
diff --git a/js/web/test/data/ops/instance-norm.jsonc b/js/web/test/data/ops/instance-norm.jsonc
index 6a4e6912405ee..e89ac2da3795f 100644
--- a/js/web/test/data/ops/instance-norm.jsonc
+++ b/js/web/test/data/ops/instance-norm.jsonc
@@ -38,6 +38,79 @@
}
]
},
+ {
+ "name": "Simple test with NHWC, components 1",
+ "operator": "InstanceNormalization",
+ "inputShapeDefinitions": "rankOnly",
+ "opset": { "domain": "", "version": 17 },
+ "cases": [
+ {
+ "name": "Simple test",
+ "inputs": [
+ {
+ "data": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 9, 8, 7, 6, 5],
+ "dims": [1, 5, 3, 1],
+ "type": "float32"
+ },
+ {
+ "data": [1, 2, 3, 4, 5],
+ "dims": [5],
+ "type": "float32"
+ },
+ {
+ "data": [4, 5, 6, 7, 8],
+ "dims": [5],
+ "type": "float32"
+ }
+ ],
+ "outputs": [
+ {
+ "data": [
+ 2.775264263153076, 4, 5.224735260009766, 2.5505285263061523, 5, 7.449470520019531, 2.325794219970703, 6,
+ 9.674205780029297, 11.898944854736328, 7, 2.1010589599609375, 14.123676300048828, 8, 1.876321792602539
+ ],
+ "dims": [1, 5, 3, 1],
+ "type": "float32"
+ }
+ ]
+ }
+ ]
+ },
+ {
+ "name": "Simple test with NHWC, components 2",
+ "operator": "InstanceNormalization",
+ "inputShapeDefinitions": "rankOnly",
+ "opset": { "domain": "", "version": 17 },
+ "cases": [
+ {
+ "name": "Simple test",
+ "inputs": [
+ {
+ "data": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 9, 8],
+ "dims": [2, 6, 1, 1],
+ "type": "float32"
+ },
+ {
+ "data": [1, 2, 3, 4, 5, 6],
+ "dims": [6],
+ "type": "float32"
+ },
+ {
+ "data": [4, 5, 6, 7, 8, 9],
+ "dims": [6],
+ "type": "float32"
+ }
+ ],
+ "outputs": [
+ {
+ "data": [4, 5, 6, 7, 8, 9, 4, 5, 6, 7, 8, 9],
+ "dims": [2, 6, 1, 1],
+ "type": "float32"
+ }
+ ]
+ }
+ ]
+ },
{
"name": "Simple test with NCHW",
"operator": "InstanceNormalization",
@@ -75,5 +148,81 @@
]
}
]
+ },
+ {
+ "name": "Simple test with NCHW, components 1",
+ "operator": "InstanceNormalization",
+ "opset": { "domain": "", "version": 17 },
+ "cases": [
+ {
+ "name": "Simple test",
+ "inputs": [
+ {
+ "data": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 9, 8, 7, 6, 5],
+ "dims": [1, 5, 3, 1],
+ "type": "float32"
+ },
+ {
+ "data": [1, 2, 3, 4, 5],
+ "dims": [5],
+ "type": "float32"
+ },
+ {
+ "data": [4, 5, 6, 7, 8],
+ "dims": [5],
+ "type": "float32"
+ }
+ ],
+ "outputs": [
+ {
+ "data": [
+ 2.775264263153076, 4, 5.224735260009766, 2.5505285263061523, 5, 7.449470520019531, 2.325794219970703, 6,
+ 9.674205780029297, 11.898944854736328, 7, 2.1010589599609375, 14.123676300048828, 8, 1.876321792602539
+ ],
+ "dims": [1, 5, 3, 1],
+ "type": "float32"
+ }
+ ]
+ }
+ ]
+ },
+ {
+ "name": "Simple test with NCHW, components 2",
+ "operator": "InstanceNormalization",
+ "opset": { "domain": "", "version": 17 },
+ "cases": [
+ {
+ "name": "Simple test",
+ "inputs": [
+ {
+ "data": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 9, 8, 7, 6, 5, 4, 3, 2],
+ "dims": [1, 3, 6, 1],
+ "type": "float32"
+ },
+ {
+ "data": [1, 2, 3],
+ "dims": [3],
+ "type": "float32"
+ },
+ {
+ "data": [4, 5, 6],
+ "dims": [3],
+ "type": "float32"
+ }
+ ],
+ "outputs": [
+ {
+ "data": [
+ 2.5361523628234863, 3.1216912269592285, 3.70723032951355, 4.292769432067871, 4.878308296203613,
+ 5.4638471603393555, 1.8666191101074219, 3.9555397033691406, 6.044460296630859, 8.133380889892578,
+ 6.044460296630859, 3.9555397033691406, 10.3915433883667, 8.634925842285156, 6.878308296203613,
+ 5.121691703796387, 3.365074634552002, 1.6084575653076172
+ ],
+ "dims": [1, 3, 6, 1],
+ "type": "float32"
+ }
+ ]
+ }
+ ]
}
]
diff --git a/js/web/test/suite-test-list.jsonc b/js/web/test/suite-test-list.jsonc
index 594ce9feed31e..79f42e36bf390 100644
--- a/js/web/test/suite-test-list.jsonc
+++ b/js/web/test/suite-test-list.jsonc
@@ -1501,85 +1501,424 @@
"webnn": {
"onnx": ["resnet50", "squeezenet", "tiny_yolov2", "emotion_ferplus"],
"node": [
- // Check in node tests that have native Wasm implementations.
- // (i.e.) not tests that rely on the fallback cpu implementations.
- // Use the 'cpu' level of node tests to test those implementations.
+ "test_abs",
+ "test_acos_example",
+ "test_acos",
+ "test_acosh_example",
+ "test_acosh",
+ // // "test_adagrad_multiple",
+ // // "test_adagrad",
+ // // "test_adam_multiple",
+ // // "test_adam",
"test_add_bcast",
+ // "test_add_uint8",
"test_add",
- "test_sub_bcast",
- "test_sub_example",
- "test_sub",
- "test_mul_bcast",
- "test_mul_example",
- "test_mul",
- "test_div_bcast",
- "test_div_example",
- "test_div",
- "test_xor_bcast3v1d",
- "test_xor_bcast3v2d",
- "test_xor_bcast4v2d",
- "test_xor_bcast4v3d",
- "test_xor_bcast4v4d",
- "test_xor2d",
- "test_xor3d",
- "test_xor4d",
- "test_or_bcast3v1d",
- "test_or_bcast3v2d",
- "test_or_bcast4v2d",
- "test_or_bcast4v3d",
- "test_or_bcast4v4d",
- "test_and_bcast3v1d",
- "test_and_bcast3v2d",
- "test_and_bcast4v2d",
- "test_and_bcast4v3d",
- "test_and_bcast4v4d",
- "test_and2d",
- "test_and3d",
- "test_and4d",
- "test_prelu_broadcast",
- "test_prelu_example",
+ // "test_and_bcast3v1d",
+ // "test_and_bcast3v2d",
+ // "test_and_bcast4v2d",
+ // "test_and_bcast4v3d",
+ // "test_and_bcast4v4d",
+ // "test_and2d",
+ // "test_and3d",
+ // "test_and4d",
+ // "test_argmax_default_axis_example_select_last_index",
+ // "test_argmax_default_axis_example",
+ // "test_argmax_default_axis_random_select_last_index",
+ // "test_argmax_default_axis_random",
+ // "test_argmax_keepdims_example_select_last_index",
+ // "test_argmax_keepdims_example",
+ // "test_argmax_keepdims_random_select_last_index",
+ // "test_argmax_keepdims_random",
+ // "test_argmax_negative_axis_keepdims_example_select_last_index",
+ // "test_argmax_negative_axis_keepdims_example",
+ // "test_argmax_negative_axis_keepdims_random_select_last_index",
+ // "test_argmax_negative_axis_keepdims_random",
+ // "test_argmax_no_keepdims_example_select_last_index",
+ // "test_argmax_no_keepdims_example",
+ // "test_argmax_no_keepdims_random_select_last_index",
+ // "test_argmax_no_keepdims_random",
+ // "test_argmin_default_axis_example_select_last_index",
+ // "test_argmin_default_axis_example",
+ // "test_argmin_default_axis_random_select_last_index",
+ // "test_argmin_default_axis_random",
+ // "test_argmin_keepdims_example_select_last_index",
+ // "test_argmin_keepdims_example",
+ // "test_argmin_keepdims_random_select_last_index",
+ // "test_argmin_keepdims_random",
+ // "test_argmin_negative_axis_keepdims_example_select_last_index",
+ // "test_argmin_negative_axis_keepdims_example",
+ // "test_argmin_negative_axis_keepdims_random_select_last_index",
+ // "test_argmin_negative_axis_keepdims_random",
+ // "test_argmin_no_keepdims_example_select_last_index",
+ // "test_argmin_no_keepdims_example",
+ // "test_argmin_no_keepdims_random_select_last_index",
+ // "test_argmin_no_keepdims_random",
+ // "test_asin_example",
+ // "test_asin",
+ // "test_asinh_example",
+ // "test_asinh",
+ // "test_atan_example",
+ // "test_atan",
+ // "test_atanh_example",
+ // "test_atanh",
+ // "test_averagepool_1d_default",
+ // "test_averagepool_2d_ceil",
+ "test_averagepool_2d_default",
+ "test_averagepool_2d_pads_count_include_pad",
+ "test_averagepool_2d_pads",
+ "test_averagepool_2d_precomputed_pads_count_include_pad",
+ "test_averagepool_2d_precomputed_pads",
+ "test_averagepool_2d_precomputed_same_upper",
+ "test_averagepool_2d_precomputed_strides",
+ "test_averagepool_2d_same_lower",
+ "test_averagepool_2d_same_upper",
+ "test_averagepool_2d_strides",
+ // "test_averagepool_3d_default",
"test_basic_conv_with_padding",
"test_basic_conv_without_padding",
- "test_batchnorm_epsilon",
- "test_batchnorm_example",
- "opset{10,11,12}/test_cast_STRING_to_FLOAT",
- "test_clip_splitbounds",
- "test_clip_outbounds",
- "test_clip_inbounds",
- "test_clip_example",
- "test_clip_default_min",
- "test_clip_default_max",
+ // "test_basic_convinteger",
+ // "test_batchnorm_epsilon_training_mode",
+ // "test_batchnorm_epsilon",
+ // "test_batchnorm_example_training_mode",
+ // "test_batchnorm_example",
+ // // "test_bernoulli_double_expanded",
+ // // "test_bernoulli_double",
+ // // "test_bernoulli_expanded",
+ // // "test_bernoulli_seed_expanded",
+ // // "test_bernoulli_seed",
+ // // "test_bernoulli",
+ // // "test_bitshift_left_uint16",
+ // // "test_bitshift_left_uint32",
+ // // "test_bitshift_left_uint64",
+ // // "test_bitshift_left_uint8",
+ // // "test_bitshift_right_uint16",
+ // // "test_bitshift_right_uint32",
+ // // "test_bitshift_right_uint64",
+ // // "test_bitshift_right_uint8",
+ // // "test_blackmanwindow_expanded",
+ // // "test_blackmanwindow_symmetric_expanded",
+ // // "test_blackmanwindow_symmetric",
+ // // "test_blackmanwindow",
+ // // "test_cast_BFLOAT16_to_FLOAT",
+ // // "test_cast_DOUBLE_to_FLOAT",
+ // // "test_cast_DOUBLE_to_FLOAT16",
+ // // "test_cast_FLOAT_to_BFLOAT16",
+ // // "test_cast_FLOAT_to_DOUBLE",
+ // // "test_cast_FLOAT_to_FLOAT16",
+ // // "test_cast_FLOAT_to_STRING",
+ // // "test_cast_FLOAT16_to_DOUBLE",
+ // // "test_cast_FLOAT16_to_FLOAT",
+ // // "test_cast_STRING_to_FLOAT",
+ // // "test_castlike_BFLOAT16_to_FLOAT_expanded",
+ // // "test_castlike_BFLOAT16_to_FLOAT",
+ // // "test_castlike_DOUBLE_to_FLOAT_expanded",
+ // // "test_castlike_DOUBLE_to_FLOAT",
+ // // "test_castlike_DOUBLE_to_FLOAT16_expanded",
+ // // "test_castlike_DOUBLE_to_FLOAT16",
+ // // "test_castlike_FLOAT_to_BFLOAT16_expanded",
+ // // "test_castlike_FLOAT_to_BFLOAT16",
+ // // "test_castlike_FLOAT_to_DOUBLE_expanded",
+ // // "test_castlike_FLOAT_to_DOUBLE",
+ // // "test_castlike_FLOAT_to_FLOAT16_expanded",
+ // // "test_castlike_FLOAT_to_FLOAT16",
+ // // "test_castlike_FLOAT_to_STRING_expanded",
+ // // "test_castlike_FLOAT_to_STRING",
+ // // "test_castlike_FLOAT16_to_DOUBLE_expanded",
+ // // "test_castlike_FLOAT16_to_DOUBLE",
+ // // "test_castlike_FLOAT16_to_FLOAT_expanded",
+ // // "test_castlike_FLOAT16_to_FLOAT",
+ // // "test_castlike_STRING_to_FLOAT_expanded",
+ // // "test_castlike_STRING_to_FLOAT",
+ "test_ceil_example",
+ "test_ceil",
+ // "test_celu_expanded",
+ // "test_celu",
"test_clip_default_inbounds",
+ "test_clip_default_int8_inbounds",
+ "test_clip_default_int8_max",
+ "test_clip_default_int8_min",
+ "test_clip_default_max",
+ "test_clip_default_min",
+ "test_clip_example",
+ "test_clip_inbounds",
+ "test_clip_outbounds",
+ "test_clip_splitbounds",
"test_clip",
+ // // "test_compress_0",
+ // // "test_compress_1",
+ // // "test_compress_default_axis",
+ // // "test_compress_negative_axis",
+ "test_concat_1d_axis_0",
+ "test_concat_1d_axis_negative_1",
+ "test_concat_2d_axis_0",
+ "test_concat_2d_axis_1",
+ "test_concat_2d_axis_negative_1",
+ "test_concat_2d_axis_negative_2",
+ "test_concat_3d_axis_0",
+ "test_concat_3d_axis_1",
+ "test_concat_3d_axis_2",
+ "test_concat_3d_axis_negative_1",
+ "test_concat_3d_axis_negative_2",
+ "test_concat_3d_axis_negative_3",
+ "test_conv_with_autopad_same",
"test_conv_with_strides_and_asymmetric_padding",
"test_conv_with_strides_no_padding",
"test_conv_with_strides_padding",
- "test_gemm_nobroadcast",
+ // // "test_convinteger_with_padding",
+ // // "test_convinteger_without_padding",
+ // "test_convtranspose_1d",
+ // // "test_convtranspose_3d",
+ // "test_convtranspose_autopad_same",
+ "test_convtranspose_dilations",
+ "test_convtranspose_kernel_shape",
+ "opset{9,17}/test_convtranspose_output_shape",
+ "test_convtranspose_pad",
+ "test_convtranspose_pads",
+ "test_convtranspose_with_kernel",
+ "test_convtranspose",
+ "test_cos_example",
+ "test_cos",
+ // "test_cosh_example",
+ // "test_cosh",
+ // "test_cumsum_1d_exclusive",
+ // "test_cumsum_1d_reverse_exclusive",
+ // "test_cumsum_1d_reverse",
+ // "test_cumsum_1d",
+ // "test_cumsum_2d_axis_0",
+ // "test_cumsum_2d_axis_1",
+ // "test_cumsum_2d_negative_axis",
+ // "test_depthtospace_crd_mode_example",
+ // "test_depthtospace_crd_mode",
+ // "test_depthtospace_dcr_mode",
+ // "test_depthtospace_example",
+ // "test_depthtospace",
+ // // "test_dequantizelinear_axis",
+ // // "test_dequantizelinear",
+ // // "test_det_2d",
+ // // "test_det_nd",
+ // // "test_dft_axis",
+ // // "test_dft_inverse",
+ // // "test_dft",
+ "test_div_bcast",
+ "test_div_example",
+ // "test_div_uint8",
+ "test_div",
+ // // "test_dropout_default_mask_ratio",
+ // // "test_dropout_default_mask",
+ // // "test_dropout_default_old",
+ // // "test_dropout_default_ratio",
+ // // "test_dropout_default",
+ // // "test_dropout_random_old",
+ // // "test_dropout_random",
+ // // "test_dynamic_slice_default_axes",
+ // // "test_dynamic_slice_end_out_of_bounds",
+ // // "test_dynamic_slice_neg",
+ // // "test_dynamic_slice_start_out_of_bounds",
+ // // "test_dynamic_slice",
+ // // "test_dynamicquantizelinear_expanded",
+ // // "test_dynamicquantizelinear_max_adjusted_expanded",
+ // // "test_dynamicquantizelinear_max_adjusted",
+ // // "test_dynamicquantizelinear_min_adjusted_expanded",
+ // // "test_dynamicquantizelinear_min_adjusted",
+ // // "test_dynamicquantizelinear",
+ // "test_edge_pad",
+ // "test_einsum_batch_diagonal",
+ // "test_einsum_batch_matmul",
+ // "test_einsum_inner_prod",
+ // "test_einsum_sum",
+ // "test_einsum_transpose",
+ "test_elu_default",
+ "test_elu_example",
+ "test_elu",
+ // "test_equal_bcast",
+ // "test_equal",
+ // "test_erf",
+ "test_exp_example",
+ "test_exp",
+ // "test_expand_dim_changed",
+ // "test_expand_dim_unchanged",
+ // "test_eyelike_populate_off_main_diagonal",
+ // "test_eyelike_with_dtype",
+ // "test_eyelike_without_dtype",
+ "test_flatten_axis0",
+ "test_flatten_axis1",
+ "test_flatten_axis2",
+ "test_flatten_axis3",
+ "test_flatten_default_axis",
+ "test_flatten_negative_axis1",
+ "test_flatten_negative_axis2",
+ "test_flatten_negative_axis3",
+ "test_flatten_negative_axis4",
+ "test_floor_example",
+ "test_floor",
+ // "test_gather_0",
+ // "test_gather_1",
+ // "test_gather_2d_indices",
+ // "test_gather_negative_indices",
+ // "test_gather_elements_0",
+ // "test_gather_elements_1",
+ // "test_gather_elements_negative_indices",
+ // "test_gather_negative_indices",
+ // "test_gathernd_example_float32",
+ // "test_gathernd_example_int32_batch_dim1",
+ // "test_gathernd_example_int32",
+ "test_gemm_all_attributes",
+ "test_gemm_alpha",
+ "test_gemm_beta",
"test_gemm_broadcast",
- "test_matmul_2d",
- "test_matmul_3d",
- "test_matmul_4d",
- "test_softmax_axis_0",
- "test_softmax_axis_1",
- "test_softmax_axis_2",
- "test_softmax_default_axis",
- "test_softmax_example",
- "test_softmax_large_number",
- "test_sum_example",
- "test_sum_one_input",
- "test_sum_two_inputs",
- "test_averagepool_1d_default",
- "test_averagepool_2d_default",
- "test_averagepool_2d_pads",
- "test_averagepool_2d_precomputed_pads",
- "test_averagepool_2d_precomputed_same_upper",
- "test_averagepool_2d_precomputed_strides",
- "test_averagepool_2d_same_upper",
- "test_averagepool_2d_same_lower",
- "test_averagepool_2d_strides",
- "test_averagepool_3d_default",
- "test_maxpool_1d_default",
+ "test_gemm_default_matrix_bias",
+ "test_gemm_default_no_bias",
+ // "test_gemm_default_scalar_bias",
+ "test_gemm_default_single_elem_vector_bias",
+ "test_gemm_default_vector_bias",
+ "test_gemm_default_zero_bias",
+ "test_gemm_nobroadcast",
+ "test_gemm_transposeA",
+ "test_gemm_transposeB",
+ "test_globalaveragepool_precomputed",
+ "test_globalaveragepool",
+ // "test_globalmaxpool_precomputed",
+ // "test_globalmaxpool",
+ // "test_greater_bcast",
+ // "test_greater_equal_bcast_expanded",
+ // "test_greater_equal_bcast",
+ // "test_greater_equal_expanded",
+ // "test_greater_equal",
+ // "test_greater",
+ // // "test_gridsample_aligncorners_true",
+ // // "test_gridsample_bicubic",
+ // // "test_gridsample_bilinear",
+ // // "test_gridsample_border_padding",
+ // // "test_gridsample_nearest",
+ // // "test_gridsample_reflection_padding",
+ // // "test_gridsample_zeros_padding",
+ // // "test_gridsample",
+ // // "test_gru_batchwise",
+ // // "test_gru_defaults",
+ // // "test_gru_seq_length",
+ // // "test_gru_with_initial_bias",
+ // // "test_hammingwindow_expanded",
+ // // "test_hammingwindow_symmetric_expanded",
+ // // "test_hammingwindow_symmetric",
+ // // "test_hammingwindow",
+ // // "test_hannwindow_expanded",
+ // // "test_hannwindow_symmetric_expanded",
+ // // "test_hannwindow_symmetric",
+ // // "test_hannwindow",
+ // // "test_hardmax_axis_0",
+ // // "test_hardmax_axis_1",
+ // // "test_hardmax_axis_2",
+ // // "test_hardmax_default_axis",
+ // // "test_hardmax_example",
+ // // "test_hardmax_negative_axis",
+ // // "test_hardmax_one_hot",
+ // // "test_hardsigmoid_default",
+ // // "test_hardsigmoid_example",
+ // // "test_hardsigmoid",
+ // "test_hardswish_expanded",
+ "test_hardswish",
+ // "test_if",
+ // TODO: Uncomment 'test_if_seq' and 'test_if_opt' once the test infra
+ // supports Sequence and Optional types
+ // "test_if_seq",
+ // "test_if_opt",
+ "test_instancenorm_epsilon",
+ // "test_instancenorm_example",
+ // "test_isinf_negative",
+ // "test_isinf_positive",
+ // "test_isinf",
+ // "test_isnan",
+ // "test_layer_normalization_2d_axis_negative_1_expanded",
+ // "test_layer_normalization_2d_axis_negative_1",
+ // "test_layer_normalization_2d_axis_negative_2_expanded",
+ // "test_layer_normalization_2d_axis_negative_2",
+ // "test_layer_normalization_2d_axis0_expanded",
+ // "test_layer_normalization_2d_axis0",
+ // "test_layer_normalization_2d_axis1_expanded",
+ // "test_layer_normalization_2d_axis1",
+ // // "test_layer_normalization_3d_axis_negative_1_epsilon_expanded",
+ // "test_layer_normalization_3d_axis_negative_1_epsilon",
+ // // "test_layer_normalization_3d_axis_negative_2_epsilon_expanded",
+ // "test_layer_normalization_3d_axis_negative_2_epsilon",
+ // // "test_layer_normalization_3d_axis_negative_3_epsilon_expanded",
+ // "test_layer_normalization_3d_axis_negative_3_epsilon",
+ // // "test_layer_normalization_3d_axis0_epsilon_expanded",
+ // "test_layer_normalization_3d_axis0_epsilon",
+ // "test_layer_normalization_3d_axis1_epsilon_expanded",
+ // "test_layer_normalization_3d_axis1_epsilon",
+ // // "test_layer_normalization_3d_axis2_epsilon_expanded",
+ // "test_layer_normalization_3d_axis2_epsilon",
+ // "test_layer_normalization_4d_axis_negative_1_expanded",
+ // "test_layer_normalization_4d_axis_negative_1",
+ // // "test_layer_normalization_4d_axis_negative_2_expanded",
+ // "test_layer_normalization_4d_axis_negative_2",
+ // "test_layer_normalization_4d_axis_negative_3_expanded",
+ // "test_layer_normalization_4d_axis_negative_3",
+ // "test_layer_normalization_4d_axis_negative_4_expanded",
+ // "test_layer_normalization_4d_axis_negative_4",
+ // "test_layer_normalization_4d_axis0_expanded",
+ // "test_layer_normalization_4d_axis0",
+ // "test_layer_normalization_4d_axis1_expanded",
+ // "test_layer_normalization_4d_axis1",
+ // "test_layer_normalization_4d_axis2_expanded",
+ // "test_layer_normalization_4d_axis2",
+ // "test_layer_normalization_4d_axis3_expanded",
+ // "test_layer_normalization_4d_axis3",
+ // "test_layer_normalization_default_axis_expanded",
+ // "test_layer_normalization_default_axis",
+ "test_leakyrelu_default",
+ "test_leakyrelu_example",
+ "test_leakyrelu",
+ // "test_less_bcast",
+ // "test_less_equal_bcast_expanded",
+ // "test_less_equal_bcast",
+ // "test_less_equal_expanded",
+ // "test_less_equal",
+ // "test_less",
+ "test_log_example",
+ "test_log",
+ // // "test_logsoftmax_axis_0_expanded",
+ // // "test_logsoftmax_axis_0",
+ // // "test_logsoftmax_axis_1_expanded",
+ // // "test_logsoftmax_axis_1",
+ // // "test_logsoftmax_axis_2_expanded",
+ // // "test_logsoftmax_axis_2",
+ // // "test_logsoftmax_default_axis_expanded",
+ // // "test_logsoftmax_default_axis",
+ // // "test_logsoftmax_example_1_expanded",
+ // // "test_logsoftmax_example_1",
+ // // "test_logsoftmax_large_number_expanded",
+ // // "test_logsoftmax_large_number",
+ // // "test_logsoftmax_negative_axis_expanded",
+ // // "test_logsoftmax_negative_axis",
+ // "test_lrn_default",
+ // "test_lrn",
+ // // "test_lstm_batchwise",
+ // // "test_lstm_defaults",
+ // // "test_lstm_with_initial_bias",
+ // // "test_lstm_with_peepholes",
+ // "test_matmul_2d",
+ // "test_matmul_3d",
+ // "test_matmul_4d",
+ // // "test_matmulinteger",
+ // "test_max_example",
+ // "test_max_float16",
+ // "test_max_float32",
+ // "test_max_float64",
+ // "test_max_int16",
+ // "test_max_int32",
+ // "test_max_int64",
+ // "test_max_int8",
+ // "test_max_one_input",
+ // "test_max_two_inputs",
+ // "test_max_uint16",
+ // "test_max_uint32",
+ // "test_max_uint64",
+ // "test_max_uint8",
+ // "test_maxpool_1d_default",
+ // "test_maxpool_2d_ceil",
"test_maxpool_2d_default",
+ // "test_maxpool_2d_dilations",
"test_maxpool_2d_pads",
"test_maxpool_2d_precomputed_pads",
"test_maxpool_2d_precomputed_same_upper",
@@ -1587,13 +1926,622 @@
"test_maxpool_2d_same_lower",
"test_maxpool_2d_same_upper",
"test_maxpool_2d_strides",
- "test_maxpool_3d_default",
- "test_globalaveragepool_precomputed",
- "test_globalaveragepool",
- "test_globalmaxpool_precomputed",
- "test_globalmaxpool",
- "test_instancenorm_epsilon",
- "test_instancenorm_example"
+ // "test_maxpool_2d_uint8",
+ // "test_maxpool_3d_default",
+ // "test_maxpool_with_argmax_2d_precomputed_pads",
+ // "test_maxpool_with_argmax_2d_precomputed_strides",
+ // // "test_maxunpool_export_with_output_shape",
+ // // "test_maxunpool_export_without_output_shape",
+ // // "test_mean_example",
+ // // "test_mean_one_input",
+ // // "test_mean_two_inputs",
+ // // "test_melweightmatrix",
+ // "test_min_example",
+ // "test_min_float16",
+ // "test_min_float32",
+ // "test_min_float64",
+ // "test_min_int16",
+ // "test_min_int32",
+ // "test_min_int64",
+ // "test_min_int8",
+ // "test_min_one_input",
+ // "test_min_two_inputs",
+ // "test_min_uint16",
+ // "test_min_uint32",
+ // "test_min_uint64",
+ // "test_min_uint8",
+ // "test_mod_bcast",
+ // "test_mod_broadcast",
+ // "test_mod_float_mixed_sign_example",
+ // "test_mod_fmod_mixed_sign_example",
+ // "test_mod_int64_fmod",
+ // "test_mod_int64_mixed_sign_example",
+ // "test_mod_mixed_sign_float16",
+ // "test_mod_mixed_sign_float32",
+ // "test_mod_mixed_sign_float64",
+ // "test_mod_mixed_sign_int16",
+ // "test_mod_mixed_sign_int32",
+ // "test_mod_mixed_sign_int64",
+ // "test_mod_mixed_sign_int8",
+ // "test_mod_uint16",
+ // "test_mod_uint32",
+ // "test_mod_uint64",
+ // "test_mod_uint8",
+ // // "test_momentum_multiple",
+ // // "test_momentum",
+ "test_mul_bcast",
+ "test_mul_example",
+ // "test_mul_uint8",
+ "test_mul",
+ // "test_mvn_expanded",
+ // "test_mvn",
+ "test_neg_example",
+ "test_neg",
+ // // "test_negative_log_likelihood_loss_iinput_shape_is_NCd1_weight_ignore_index_expanded",
+ // // "test_negative_log_likelihood_loss_iinput_shape_is_NCd1_weight_ignore_index",
+ // // "test_negative_log_likelihood_loss_input_shape_is_NC_expanded",
+ // // "test_negative_log_likelihood_loss_input_shape_is_NC",
+ // // "test_negative_log_likelihood_loss_input_shape_is_NCd1_expanded",
+ // // "test_negative_log_likelihood_loss_input_shape_is_NCd1_ignore_index_expanded",
+ // // "test_negative_log_likelihood_loss_input_shape_is_NCd1_ignore_index",
+ // // "test_negative_log_likelihood_loss_input_shape_is_NCd1_mean_weight_negative_ignore_index_expanded",
+ // // "test_negative_log_likelihood_loss_input_shape_is_NCd1_mean_weight_negative_ignore_index",
+ // // "test_negative_log_likelihood_loss_input_shape_is_NCd1_weight_expanded",
+ // // "test_negative_log_likelihood_loss_input_shape_is_NCd1_weight",
+ // // "test_negative_log_likelihood_loss_input_shape_is_NCd1",
+ // // "test_negative_log_likelihood_loss_input_shape_is_NCd1d2_expanded",
+ // // "test_negative_log_likelihood_loss_input_shape_is_NCd1d2_no_weight_reduction_mean_ignore_index_expanded",
+ // // "test_negative_log_likelihood_loss_input_shape_is_NCd1d2_no_weight_reduction_mean_ignore_index",
+ // // "test_negative_log_likelihood_loss_input_shape_is_NCd1d2_reduction_mean_expanded",
+ // // "test_negative_log_likelihood_loss_input_shape_is_NCd1d2_reduction_mean",
+ // // "test_negative_log_likelihood_loss_input_shape_is_NCd1d2_reduction_sum_expanded",
+ // // "test_negative_log_likelihood_loss_input_shape_is_NCd1d2_reduction_sum",
+ // // "test_negative_log_likelihood_loss_input_shape_is_NCd1d2_with_weight_expanded",
+ // // "test_negative_log_likelihood_loss_input_shape_is_NCd1d2_with_weight_reduction_mean_expanded",
+ // // "test_negative_log_likelihood_loss_input_shape_is_NCd1d2_with_weight_reduction_mean",
+ // // "test_negative_log_likelihood_loss_input_shape_is_NCd1d2_with_weight_reduction_sum_expanded",
+ // // "test_negative_log_likelihood_loss_input_shape_is_NCd1d2_with_weight_reduction_sum_ignore_index_expanded",
+ // // "test_negative_log_likelihood_loss_input_shape_is_NCd1d2_with_weight_reduction_sum_ignore_index",
+ // // "test_negative_log_likelihood_loss_input_shape_is_NCd1d2_with_weight_reduction_sum",
+ // // "test_negative_log_likelihood_loss_input_shape_is_NCd1d2_with_weight",
+ // // "test_negative_log_likelihood_loss_input_shape_is_NCd1d2",
+ // // "test_negative_log_likelihood_loss_input_shape_is_NCd1d2d3_none_no_weight_negative_ignore_index_expanded",
+ // // "test_negative_log_likelihood_loss_input_shape_is_NCd1d2d3_none_no_weight_negative_ignore_index",
+ // // "test_negative_log_likelihood_loss_input_shape_is_NCd1d2d3_sum_weight_high_ignore_index_expanded",
+ // // "test_negative_log_likelihood_loss_input_shape_is_NCd1d2d3_sum_weight_high_ignore_index",
+ // // "test_negative_log_likelihood_loss_input_shape_is_NCd1d2d3d4d5_mean_weight_expanded",
+ // // "test_negative_log_likelihood_loss_input_shape_is_NCd1d2d3d4d5_mean_weight",
+ // // "test_negative_log_likelihood_loss_input_shape_is_NCd1d2d3d4d5_none_no_weight_expanded",
+ // // "test_negative_log_likelihood_loss_input_shape_is_NCd1d2d3d4d5_none_no_weight",
+ // // "test_nesterov_momentum",
+ // // "test_nllloss_NC_expanded",
+ // // "test_nllloss_NC",
+ // // "test_nllloss_NCd1_expanded",
+ // // "test_nllloss_NCd1_ii_expanded",
+ // // "test_nllloss_NCd1_ii",
+ // // "test_nllloss_NCd1_mean_weight_negative_ii_expanded",
+ // // "test_nllloss_NCd1_mean_weight_negative_ii",
+ // // "test_nllloss_NCd1_weight_expanded",
+ // // "test_nllloss_NCd1_weight_ii_expanded",
+ // // "test_nllloss_NCd1_weight_ii",
+ // // "test_nllloss_NCd1_weight",
+ // // "test_nllloss_NCd1",
+ // // "test_nllloss_NCd1d2_expanded",
+ // // "test_nllloss_NCd1d2_no_weight_reduction_mean_ii_expanded",
+ // // "test_nllloss_NCd1d2_no_weight_reduction_mean_ii",
+ // // "test_nllloss_NCd1d2_reduction_mean_expanded",
+ // // "test_nllloss_NCd1d2_reduction_mean",
+ // // "test_nllloss_NCd1d2_reduction_sum_expanded",
+ // // "test_nllloss_NCd1d2_reduction_sum",
+ // // "test_nllloss_NCd1d2_with_weight_expanded",
+ // // "test_nllloss_NCd1d2_with_weight_reduction_mean_expanded",
+ // // "test_nllloss_NCd1d2_with_weight_reduction_mean",
+ // // "test_nllloss_NCd1d2_with_weight_reduction_sum_expanded",
+ // // "test_nllloss_NCd1d2_with_weight_reduction_sum_ii_expanded",
+ // // "test_nllloss_NCd1d2_with_weight_reduction_sum_ii",
+ // // "test_nllloss_NCd1d2_with_weight_reduction_sum",
+ // // "test_nllloss_NCd1d2_with_weight",
+ // // "test_nllloss_NCd1d2",
+ // // "test_nllloss_NCd1d2d3_none_no_weight_negative_ii_expanded",
+ // // "test_nllloss_NCd1d2d3_none_no_weight_negative_ii",
+ // // "test_nllloss_NCd1d2d3_sum_weight_high_ii_expanded",
+ // // "test_nllloss_NCd1d2d3_sum_weight_high_ii",
+ // // "test_nllloss_NCd1d2d3d4d5_mean_weight_expanded",
+ // // "test_nllloss_NCd1d2d3d4d5_mean_weight",
+ // // "test_nllloss_NCd1d2d3d4d5_none_no_weight_expanded",
+ // // "test_nllloss_NCd1d2d3d4d5_none_no_weight",
+ // "test_nonmaxsuppression_center_point_box_format",
+ // "test_nonmaxsuppression_flipped_coordinates",
+ // "test_nonmaxsuppression_identical_boxes",
+ // "test_nonmaxsuppression_limit_output_size",
+ // "test_nonmaxsuppression_single_box",
+ // "test_nonmaxsuppression_suppress_by_IOU_and_scores",
+ // "test_nonmaxsuppression_suppress_by_IOU",
+ // "test_nonmaxsuppression_two_batches",
+ // "test_nonmaxsuppression_two_classes",
+ // "test_nonzero_example",
+ // "test_not_2d",
+ // "test_not_3d",
+ // "test_not_4d",
+ // // "test_onehot_negative_indices",
+ // // "test_onehot_with_axis",
+ // // "test_onehot_with_negative_axis",
+ // // "test_onehot_without_axis",
+ // // "test_optional_get_element_sequence",
+ // // "test_optional_get_element",
+ // // "test_optional_has_element_empty",
+ // // "test_optional_has_element",
+ // "test_or_bcast3v1d",
+ // "test_or_bcast3v2d",
+ // "test_or_bcast4v2d",
+ // "test_or_bcast4v3d",
+ // "test_or_bcast4v4d",
+ // "test_or2d",
+ // "test_or3d",
+ // "test_or4d",
+ "test_pow_bcast_array",
+ "test_pow_bcast_scalar",
+ "test_pow_example",
+ // "test_pow_types_float",
+ // "test_pow_types_float32_int32",
+ // "test_pow_types_float32_int64",
+ // "test_pow_types_float32_uint32",
+ // "test_pow_types_float32_uint64",
+ // "test_pow_types_int",
+ // "test_pow_types_int32_float32",
+ // "test_pow_types_int32_int32",
+ // "test_pow_types_int64_float32",
+ // "test_pow_types_int64_int64",
+ "test_pow",
+ // "test_prelu_broadcast",
+ // "test_prelu_example",
+ // // "test_qlinearconv",
+ // // "test_qlinearmatmul_2D",
+ // // "test_qlinearmatmul_3D",
+ // // "test_quantizelinear_axis",
+ // // "test_quantizelinear",
+ // "test_range_float_type_positive_delta_expanded",
+ // "test_range_float_type_positive_delta",
+ // "test_range_int32_type_negative_delta_expanded",
+ // "test_range_int32_type_negative_delta",
+ // "test_reciprocal_example",
+ // "test_reciprocal",
+ // "test_reduce_l1_default_axes_keepdims_example",
+ // "test_reduce_l1_default_axes_keepdims_random",
+ // "test_reduce_l1_do_not_keepdims_example",
+ // "test_reduce_l1_do_not_keepdims_random",
+ // "test_reduce_l1_keep_dims_example",
+ // "test_reduce_l1_keep_dims_random",
+ // "test_reduce_l1_negative_axes_keep_dims_example",
+ // "test_reduce_l1_negative_axes_keep_dims_random",
+ // "test_reduce_l2_default_axes_keepdims_example",
+ // "test_reduce_l2_default_axes_keepdims_random",
+ // "test_reduce_l2_do_not_keepdims_example",
+ // "test_reduce_l2_do_not_keepdims_random",
+ // "test_reduce_l2_keep_dims_example",
+ // "test_reduce_l2_keep_dims_random",
+ // "test_reduce_l2_negative_axes_keep_dims_example",
+ // "test_reduce_l2_negative_axes_keep_dims_random",
+ // "test_reduce_log_sum_asc_axes",
+ // "test_reduce_log_sum_default",
+ // "test_reduce_log_sum_desc_axes",
+ // tests "test_reduce_log_sum_exp_*" on opset17/opset18 are excluded because they use float64.
+ // "opset{7,8,9}/test_reduce_log_sum_exp_default_axes_keepdims_example",
+ // "opset{7,8,9}/test_reduce_log_sum_exp_default_axes_keepdims_random",
+ // "opset{7,8,9}/test_reduce_log_sum_exp_do_not_keepdims_example",
+ // "opset{7,8,9}/test_reduce_log_sum_exp_do_not_keepdims_random",
+ // "opset{7,8,9}/test_reduce_log_sum_exp_keepdims_example",
+ // "opset{7,8,9}/test_reduce_log_sum_exp_keepdims_random",
+ // "opset11/test_reduce_log_sum_exp_negative_axes_keepdims_example",
+ // "opset11/test_reduce_log_sum_exp_negative_axes_keepdims_random",
+ // "test_reduce_log_sum_negative_axes",
+ // "test_reduce_log_sum",
+ // "test_reduce_max_default_axes_keepdim_example",
+ // "test_reduce_max_default_axes_keepdims_random",
+ // "test_reduce_max_do_not_keepdims_example",
+ // "test_reduce_max_do_not_keepdims_random",
+ // "test_reduce_max_keepdims_example",
+ // "test_reduce_max_keepdims_random",
+ // "test_reduce_max_negative_axes_keepdims_example",
+ // "test_reduce_max_negative_axes_keepdims_random",
+ // "test_reduce_mean_default_axes_keepdims_example",
+ // "test_reduce_mean_default_axes_keepdims_random",
+ // "test_reduce_mean_do_not_keepdims_example",
+ // "test_reduce_mean_do_not_keepdims_random",
+ // "test_reduce_mean_keepdims_example",
+ // "test_reduce_mean_keepdims_random",
+ // "test_reduce_mean_negative_axes_keepdims_example",
+ // "test_reduce_mean_negative_axes_keepdims_random",
+ // "test_reduce_min_default_axes_keepdims_example",
+ // "test_reduce_min_default_axes_keepdims_random",
+ // "test_reduce_min_do_not_keepdims_example",
+ // "test_reduce_min_do_not_keepdims_random",
+ // "test_reduce_min_keepdims_example",
+ // "test_reduce_min_keepdims_random",
+ // "test_reduce_min_negative_axes_keepdims_example",
+ // "test_reduce_min_negative_axes_keepdims_random",
+ // "test_reduce_prod_default_axes_keepdims_example",
+ // "test_reduce_prod_default_axes_keepdims_random",
+ // "test_reduce_prod_do_not_keepdims_example",
+ // "test_reduce_prod_do_not_keepdims_random",
+ // "test_reduce_prod_keepdims_example",
+ // "test_reduce_prod_keepdims_random",
+ // "test_reduce_prod_negative_axes_keepdims_example",
+ // "test_reduce_prod_negative_axes_keepdims_random",
+ // "test_reduce_sum_default_axes_keepdims_example",
+ // "test_reduce_sum_default_axes_keepdims_random",
+ // "test_reduce_sum_do_not_keepdims_example",
+ // "test_reduce_sum_do_not_keepdims_random",
+ // "test_reduce_sum_empty_axes_input_noop_example",
+ // "test_reduce_sum_empty_axes_input_noop_random",
+ // "test_reduce_sum_keepdims_example",
+ // "test_reduce_sum_keepdims_random",
+ // "test_reduce_sum_negative_axes_keepdims_example",
+ // "test_reduce_sum_negative_axes_keepdims_random",
+ // "test_reduce_sum_square_default_axes_keepdims_example",
+ // "test_reduce_sum_square_default_axes_keepdims_random",
+ // "test_reduce_sum_square_do_not_keepdims_example",
+ // "test_reduce_sum_square_do_not_keepdims_random",
+ // "test_reduce_sum_square_keepdims_example",
+ // "test_reduce_sum_square_keepdims_random",
+ // "test_reduce_sum_square_negative_axes_keepdims_example",
+ // "test_reduce_sum_square_negative_axes_keepdims_random",
+ // "test_reflect_pad",
+ "test_relu",
+ // "test_reshape_allowzero_reordered",
+ // "test_reshape_extended_dims",
+ // "test_reshape_negative_dim",
+ // "test_reshape_negative_extended_dims",
+ // "test_reshape_one_dim",
+ // "test_reshape_reduced_dims",
+ // "test_reshape_reordered_all_dims",
+ // "test_reshape_reordered_dims",
+ // "test_reshape_reordered_last_dims",
+ // "test_reshape_zero_and_negative_dim",
+ // "test_reshape_zero_dim",
+ // "test_resize_downsample_linear",
+ // "test_resize_downsample_nearest",
+ // "test_resize_downsample_scales_cubic_A_n0p5_exclude_outside",
+ // "test_resize_downsample_scales_cubic_align_corners",
+ // "test_resize_downsample_scales_cubic",
+ // "test_resize_downsample_scales_linear_align_corners",
+ // "test_resize_downsample_scales_linear",
+ // "test_resize_downsample_scales_nearest",
+ // "test_resize_downsample_sizes_cubic",
+ // "test_resize_downsample_sizes_linear_pytorch_half_pixel",
+ // "test_resize_downsample_sizes_nearest_tf_half_pixel_for_nn",
+ // "test_resize_downsample_sizes_nearest",
+ // "test_resize_nearest",
+ // "test_resize_tf_crop_and_resize",
+ // "test_resize_upsample_linear",
+ // "test_resize_upsample_nearest",
+ // "test_resize_upsample_scales_cubic_A_n0p5_exclude_outside",
+ // "test_resize_upsample_scales_cubic_align_corners",
+ // "test_resize_upsample_scales_cubic_asymmetric",
+ // "test_resize_upsample_scales_cubic",
+ // "test_resize_upsample_scales_linear_align_corners",
+ // "test_resize_upsample_scales_linear",
+ // "test_resize_upsample_scales_nearest",
+ // "test_resize_upsample_sizes_cubic",
+ // "opset{12,13,17,18}/test_resize_upsample_sizes_nearest_ceil_half_pixel",
+ // "opset{12,13,17,18}/test_resize_upsample_sizes_nearest_floor_align_corners",
+ // "opset{12,13,17,18}/test_resize_upsample_sizes_nearest_round_prefer_ceil_asymmetric",
+ // "test_resize_upsample_sizes_nearest",
+ // // "test_reversesequence_batch",
+ // // "test_reversesequence_time",
+ // // "test_rnn_seq_length",
+ // // "test_roialign_aligned_false",
+ // // "test_roialign_aligned_true",
+ // // "test_roialign",
+ // // "test_round",
+ // // "test_scan_sum",
+ // // "test_scan9_sum",
+ // // "test_scatter_elements_with_axis",
+ // // "test_scatter_elements_with_duplicate_indices",
+ // // "test_scatter_elements_with_negative_indices",
+ // // "test_scatter_elements_without_axis",
+ // // "test_scatter_with_axis",
+ // // "test_scatter_without_axis",
+ // // "test_scatternd_add",
+ // // "test_scatternd_multiply",
+ // // "test_scatternd",
+ // // "test_sce_mean_3d_expanded",
+ // // "test_sce_mean_3d_log_prob_expanded",
+ // // "test_sce_mean_3d_log_prob",
+ // // "test_sce_mean_3d",
+ // // "test_sce_mean_expanded",
+ // // "test_sce_mean_log_prob_expanded",
+ // // "test_sce_mean_log_prob",
+ // // "test_sce_mean_no_weight_ii_3d_expanded",
+ // // "test_sce_mean_no_weight_ii_3d_log_prob_expanded",
+ // // "test_sce_mean_no_weight_ii_3d_log_prob",
+ // // "test_sce_mean_no_weight_ii_3d",
+ // // "test_sce_mean_no_weight_ii_4d_expanded",
+ // // "test_sce_mean_no_weight_ii_4d_log_prob_expanded",
+ // // "test_sce_mean_no_weight_ii_4d_log_prob",
+ // // "test_sce_mean_no_weight_ii_4d",
+ // // "test_sce_mean_no_weight_ii_expanded",
+ // // "test_sce_mean_no_weight_ii_log_prob_expanded",
+ // // "test_sce_mean_no_weight_ii_log_prob",
+ // // "test_sce_mean_no_weight_ii",
+ // // "test_sce_mean_weight_expanded",
+ // // "test_sce_mean_weight_ii_3d_expanded",
+ // // "test_sce_mean_weight_ii_3d_log_prob_expanded",
+ // // "test_sce_mean_weight_ii_3d_log_prob",
+ // // "test_sce_mean_weight_ii_3d",
+ // // "test_sce_mean_weight_ii_4d_expanded",
+ // // "test_sce_mean_weight_ii_4d_log_prob_expanded",
+ // // "test_sce_mean_weight_ii_4d_log_prob",
+ // // "test_sce_mean_weight_ii_4d",
+ // // "test_sce_mean_weight_ii_expanded",
+ // // "test_sce_mean_weight_ii_log_prob_expanded",
+ // // "test_sce_mean_weight_ii_log_prob",
+ // // "test_sce_mean_weight_ii",
+ // // "test_sce_mean_weight_log_prob_expanded",
+ // // "test_sce_mean_weight_log_prob",
+ // // "test_sce_mean_weight",
+ // // "test_sce_mean",
+ // // "test_sce_NCd1_mean_weight_negative_ii_expanded",
+ // // "test_sce_NCd1_mean_weight_negative_ii_log_prob_expanded",
+ // // "test_sce_NCd1_mean_weight_negative_ii_log_prob",
+ // // "test_sce_NCd1_mean_weight_negative_ii",
+ // // "test_sce_NCd1d2d3_none_no_weight_negative_ii_expanded",
+ // // "test_sce_NCd1d2d3_none_no_weight_negative_ii_log_prob_expanded",
+ // // "test_sce_NCd1d2d3_none_no_weight_negative_ii_log_prob",
+ // // "test_sce_NCd1d2d3_none_no_weight_negative_ii",
+ // // "test_sce_NCd1d2d3_sum_weight_high_ii_expanded",
+ // // "test_sce_NCd1d2d3_sum_weight_high_ii_log_prob_expanded",
+ // // "test_sce_NCd1d2d3_sum_weight_high_ii_log_prob",
+ // // "test_sce_NCd1d2d3_sum_weight_high_ii",
+ // // "test_sce_NCd1d2d3d4d5_mean_weight_expanded",
+ // // "test_sce_NCd1d2d3d4d5_mean_weight_log_prob_expanded",
+ // // "test_sce_NCd1d2d3d4d5_mean_weight_log_prob",
+ // // "test_sce_NCd1d2d3d4d5_mean_weight",
+ // // "test_sce_NCd1d2d3d4d5_none_no_weight_expanded",
+ // // "test_sce_NCd1d2d3d4d5_none_no_weight_log_prob_expanded",
+ // // "test_sce_NCd1d2d3d4d5_none_no_weight_log_prob",
+ // // "test_sce_NCd1d2d3d4d5_none_no_weight",
+ // // "test_sce_none_expanded",
+ // // "test_sce_none_log_prob_expanded",
+ // // "test_sce_none_log_prob",
+ // // "test_sce_none_weights_expanded",
+ // // "test_sce_none_weights_log_prob_expanded",
+ // // "test_sce_none_weights_log_prob",
+ // // "test_sce_none_weights",
+ // // "test_sce_none",
+ // // "test_sce_sum_expanded",
+ // // "test_sce_sum_log_prob_expanded",
+ // // "test_sce_sum_log_prob",
+ // // "test_sce_sum",
+ // "test_selu_default",
+ // "test_selu_example",
+ // "test_selu",
+ // // "test_sequence_insert_at_back",
+ // // "test_sequence_insert_at_front",
+ // // "test_sequence_map_add_1_sequence_1_tensor_expanded",
+ // // "test_sequence_map_add_1_sequence_1_tensor",
+ // // "test_sequence_map_add_2_sequences_expanded",
+ // // "test_sequence_map_add_2_sequences",
+ // // "test_sequence_map_extract_shapes_expanded",
+ // // "test_sequence_map_extract_shapes",
+ // // "test_sequence_map_identity_1_sequence_1_tensor_expanded",
+ // // "test_sequence_map_identity_1_sequence_1_tensor",
+ // // "test_sequence_map_identity_1_sequence_expanded",
+ // // "test_sequence_map_identity_1_sequence",
+ // // "test_sequence_map_identity_2_sequences_expanded",
+ // // "test_sequence_map_identity_2_sequences",
+ // "test_shrink_hard",
+ // "test_shrink_soft",
+ "test_sigmoid_example",
+ "test_sigmoid",
+ // "test_sign",
+ // "test_simple_rnn_batchwise",
+ // "test_simple_rnn_defaults",
+ // "test_simple_rnn_with_initial_bias",
+ "test_sin_example",
+ "test_sin",
+ // "test_sinh_example",
+ // "test_sinh",
+ // // "test_size_example",
+ // // "test_size",
+ // "test_slice_default_axes",
+ // "test_slice_default_steps",
+ // "test_slice_end_out_of_bounds",
+ // "test_slice_neg_steps",
+ // "test_slice_neg",
+ // "test_slice_negative_axes",
+ // "test_slice_start_out_of_bounds",
+ // "test_slice",
+ // "test_softmax_axis_0_expanded",
+ "test_softmax_axis_0",
+ // "test_softmax_axis_1_expanded",
+ "test_softmax_axis_1",
+ // "test_softmax_axis_2_expanded",
+ "test_softmax_axis_2",
+ // "test_softmax_cross_entropy_input_shape_is_NCd1_mean_weight_negative_ignore_index_expanded",
+ // "test_softmax_cross_entropy_input_shape_is_NCd1_mean_weight_negative_ignore_index_log_prob_expanded",
+ // "test_softmax_cross_entropy_input_shape_is_NCd1_mean_weight_negative_ignore_index_log_prob",
+ // "test_softmax_cross_entropy_input_shape_is_NCd1_mean_weight_negative_ignore_index",
+ // "test_softmax_cross_entropy_input_shape_is_NCd1d2d3_none_no_weight_negative_ignore_index_expanded",
+ // "test_softmax_cross_entropy_input_shape_is_NCd1d2d3_none_no_weight_negative_ignore_index_log_prob_expanded",
+ // "test_softmax_cross_entropy_input_shape_is_NCd1d2d3_none_no_weight_negative_ignore_index_log_prob",
+ // "test_softmax_cross_entropy_input_shape_is_NCd1d2d3_none_no_weight_negative_ignore_index",
+ // "test_softmax_cross_entropy_input_shape_is_NCd1d2d3_sum_weight_high_ignore_index_expanded",
+ // "test_softmax_cross_entropy_input_shape_is_NCd1d2d3_sum_weight_high_ignore_index_log_prob_expanded",
+ // "test_softmax_cross_entropy_input_shape_is_NCd1d2d3_sum_weight_high_ignore_index_log_prob",
+ // "test_softmax_cross_entropy_input_shape_is_NCd1d2d3_sum_weight_high_ignore_index",
+ // "test_softmax_cross_entropy_input_shape_is_NCd1d2d3d4d5_mean_weight_expanded",
+ // "test_softmax_cross_entropy_input_shape_is_NCd1d2d3d4d5_mean_weight_log_prob_expanded",
+ // "test_softmax_cross_entropy_input_shape_is_NCd1d2d3d4d5_mean_weight_log_prob",
+ // "test_softmax_cross_entropy_input_shape_is_NCd1d2d3d4d5_mean_weight",
+ // "test_softmax_cross_entropy_input_shape_is_NCd1d2d3d4d5_none_no_weight_expanded",
+ // "test_softmax_cross_entropy_input_shape_is_NCd1d2d3d4d5_none_no_weight_log_prob_expanded",
+ // "test_softmax_cross_entropy_input_shape_is_NCd1d2d3d4d5_none_no_weight_log_prob",
+ // "test_softmax_cross_entropy_input_shape_is_NCd1d2d3d4d5_none_no_weight",
+ // "test_softmax_cross_entropy_mean_3d_expanded",
+ // "test_softmax_cross_entropy_mean_3d_log_prob_expanded",
+ // "test_softmax_cross_entropy_mean_3d_log_prob",
+ // "test_softmax_cross_entropy_mean_3d",
+ // "test_softmax_cross_entropy_mean_expanded",
+ // "test_softmax_cross_entropy_mean_log_prob_expanded",
+ // "test_softmax_cross_entropy_mean_log_prob",
+ // "test_softmax_cross_entropy_mean_no_weight_ignore_index_3d_expanded",
+ // "test_softmax_cross_entropy_mean_no_weight_ignore_index_3d_log_prob_expanded",
+ // "test_softmax_cross_entropy_mean_no_weight_ignore_index_3d_log_prob",
+ // "test_softmax_cross_entropy_mean_no_weight_ignore_index_3d",
+ // "test_softmax_cross_entropy_mean_no_weight_ignore_index_4d_expanded",
+ // "test_softmax_cross_entropy_mean_no_weight_ignore_index_4d_log_prob_expanded",
+ // "test_softmax_cross_entropy_mean_no_weight_ignore_index_4d_log_prob",
+ // "test_softmax_cross_entropy_mean_no_weight_ignore_index_4d",
+ // "test_softmax_cross_entropy_mean_no_weight_ignore_index_expanded",
+ // "test_softmax_cross_entropy_mean_no_weight_ignore_index_log_prob_expanded",
+ // "test_softmax_cross_entropy_mean_no_weight_ignore_index_log_prob",
+ // "test_softmax_cross_entropy_mean_no_weight_ignore_index",
+ // "test_softmax_cross_entropy_mean_weight_expanded",
+ // "test_softmax_cross_entropy_mean_weight_ignore_index_3d_expanded",
+ // "test_softmax_cross_entropy_mean_weight_ignore_index_3d_log_prob_expanded",
+ // "test_softmax_cross_entropy_mean_weight_ignore_index_3d_log_prob",
+ // "test_softmax_cross_entropy_mean_weight_ignore_index_3d",
+ // "test_softmax_cross_entropy_mean_weight_ignore_index_4d_expanded",
+ // "test_softmax_cross_entropy_mean_weight_ignore_index_4d_log_prob_expanded",
+ // "test_softmax_cross_entropy_mean_weight_ignore_index_4d_log_prob",
+ // "test_softmax_cross_entropy_mean_weight_ignore_index_4d",
+ // "test_softmax_cross_entropy_mean_weight_ignore_index_expanded",
+ // "test_softmax_cross_entropy_mean_weight_ignore_index_log_prob_expanded",
+ // "test_softmax_cross_entropy_mean_weight_ignore_index_log_prob",
+ // "test_softmax_cross_entropy_mean_weight_ignore_index",
+ // "test_softmax_cross_entropy_mean_weight_log_prob_expanded",
+ // "test_softmax_cross_entropy_mean_weight_log_prob",
+ // "test_softmax_cross_entropy_mean_weight",
+ // "test_softmax_cross_entropy_mean",
+ // "test_softmax_cross_entropy_none_expanded",
+ // "test_softmax_cross_entropy_none_log_prob_expanded",
+ // "test_softmax_cross_entropy_none_log_prob",
+ // "test_softmax_cross_entropy_none_weights_expanded",
+ // "test_softmax_cross_entropy_none_weights_log_prob_expanded",
+ // "test_softmax_cross_entropy_none_weights_log_prob",
+ // "test_softmax_cross_entropy_none_weights",
+ // "test_softmax_cross_entropy_none",
+ // "test_softmax_cross_entropy_sum_expanded",
+ // "test_softmax_cross_entropy_sum_log_prob_expanded",
+ // "test_softmax_cross_entropy_sum_log_prob",
+ // "test_softmax_cross_entropy_sum",
+ // "opset13/test_softmax_default_axis_expanded",
+ "opset13/test_softmax_default_axis",
+ // "test_softmax_example_expanded",
+ "test_softmax_example",
+ // "test_softmax_large_number_expanded",
+ "test_softmax_large_number",
+ // "test_softmax_negative_axis_expanded",
+ "test_softmax_negative_axis",
+ // // "test_softplus_example",
+ // // "test_softplus",
+ // // "test_softsign_example",
+ // // "test_softsign",
+ // "test_spacetodepth_example",
+ // "test_spacetodepth",
+ // "test_split_equal_parts_1d",
+ // "test_split_equal_parts_2d",
+ // "test_split_equal_parts_default_axis",
+ // "test_split_variable_parts_1d",
+ // "test_split_variable_parts_2d",
+ // "test_split_variable_parts_default_axis",
+ // "test_split_zero_size_splits",
+ "test_sqrt_example",
+ "test_sqrt",
+ // "test_squeeze_negative_axes",
+ // "test_squeeze",
+ // // "test_stft_with_window",
+ // // "test_stft",
+ // // "test_strnormalizer_export_monday_casesensintive_lower",
+ // // "test_strnormalizer_export_monday_casesensintive_nochangecase",
+ // // "test_strnormalizer_export_monday_casesensintive_upper",
+ // // "test_strnormalizer_export_monday_empty_output",
+ // // "test_strnormalizer_export_monday_insensintive_upper_twodim",
+ // // "test_strnormalizer_nostopwords_nochangecase",
+ "test_sub_bcast",
+ "test_sub_example",
+ // "test_sub_uint8",
+ "test_sub",
+ // "test_sum_example",
+ // "test_sum_one_input",
+ // "test_sum_two_inputs",
+ "test_tan_example",
+ "test_tan",
+ "test_tanh_example",
+ "test_tanh",
+ // // "test_tfidfvectorizer_tf_batch_onlybigrams_skip0",
+ // // "test_tfidfvectorizer_tf_batch_onlybigrams_skip5",
+ // // "test_tfidfvectorizer_tf_batch_uniandbigrams_skip5",
+ // // "test_tfidfvectorizer_tf_only_bigrams_skip0",
+ // // "test_tfidfvectorizer_tf_onlybigrams_levelempty",
+ // // "test_tfidfvectorizer_tf_onlybigrams_skip5",
+ // // "test_tfidfvectorizer_tf_uniandbigrams_skip5",
+ // "test_thresholdedrelu_default",
+ // "test_thresholdedrelu_example",
+ // "test_thresholdedrelu",
+ // "test_tile_precomputed",
+ // "test_tile",
+ // // "test_top_k_negative_axis",
+ // // "test_top_k_smallest",
+ // // "test_top_k",
+ // // "test_training_dropout_default_mask",
+ // // "test_training_dropout_default",
+ // // "test_training_dropout_mask",
+ // // "test_training_dropout_zero_ratio_mask",
+ // // "test_training_dropout_zero_ratio",
+ // // "test_training_dropout",
+ "test_transpose_all_permutations_0",
+ "test_transpose_all_permutations_1",
+ "test_transpose_all_permutations_2",
+ "test_transpose_all_permutations_3",
+ "test_transpose_all_permutations_4",
+ "test_transpose_all_permutations_5",
+ "test_transpose_default"
+ // "test_tril_neg",
+ // "test_tril_one_row_neg",
+ // "test_tril_out_neg",
+ // "test_tril_out_pos",
+ // "test_tril_pos",
+ // "test_tril_square_neg",
+ // "test_tril_square",
+ // "test_tril_zero",
+ // "test_tril",
+ // "test_triu_neg",
+ // "test_triu_one_row",
+ // "test_triu_out_neg_out",
+ // "test_triu_out_pos",
+ // "test_triu_pos",
+ // "test_triu_square_neg",
+ // "test_triu_square",
+ // "test_triu_zero",
+ // "test_triu",
+ // // "test_unique_not_sorted_without_axis",
+ // // "test_unique_sorted_with_axis_3d",
+ // // "test_unique_sorted_with_axis",
+ // // "test_unique_sorted_with_negative_axis",
+ // // "test_unique_sorted_without_axis",
+ // "test_unsqueeze_axis_0",
+ // "test_unsqueeze_axis_1",
+ // "test_unsqueeze_axis_2",
+ // "test_unsqueeze_axis_3",
+ // "test_unsqueeze_negative_axes",
+ // "test_unsqueeze_three_axes",
+ // "test_unsqueeze_two_axes",
+ // "test_unsqueeze_unsorted_axes",
+ // "test_unsqueeze",
+ // "test_wrap_pad"
+ // "test_upsample_nearest",
+ // "test_where_example",
+ // "test_where_long_example",
+ // "test_xor_bcast3v1d",
+ // "test_xor_bcast3v2d",
+ // "test_xor_bcast4v2d",
+ // "test_xor_bcast4v3d",
+ // "test_xor_bcast4v4d",
+ // "test_xor2d",
+ // "test_xor3d",
+ // "test_xor4d"
],
"ops": []
}
diff --git a/js/web/test/test-main.ts b/js/web/test/test-main.ts
index 9bd0ec1425f95..2d83ce1e095ce 100644
--- a/js/web/test/test-main.ts
+++ b/js/web/test/test-main.ts
@@ -110,8 +110,7 @@ for (const group of ORT_WEB_TEST_CONFIG.model) {
let context: ModelTestContext;
before('prepare session', async () => {
- context = await ModelTestContext.create(
- test, ORT_WEB_TEST_CONFIG.profile, ORT_WEB_TEST_CONFIG.options.sessionOptions);
+ context = await ModelTestContext.create(test, ORT_WEB_TEST_CONFIG.profile, ORT_WEB_TEST_CONFIG.options);
});
after('release session', async () => {
diff --git a/js/web/test/test-runner.ts b/js/web/test/test-runner.ts
index 5e9b0910a2c68..6d5951be7b1e6 100644
--- a/js/web/test/test-runner.ts
+++ b/js/web/test/test-runner.ts
@@ -137,8 +137,9 @@ async function loadTensors(
}
async function initializeSession(
- modelFilePath: string, backendHint: string, ioBindingMode: Test.IOBindingMode, profile: boolean,
- sessionOptions: ort.InferenceSession.SessionOptions, fileCache?: FileCacheBuffer): Promise {
+ modelFilePath: string, backendHint: ort.InferenceSession.ExecutionProviderConfig, ioBindingMode: Test.IOBindingMode,
+ profile: boolean, sessionOptions: ort.InferenceSession.SessionOptions,
+ fileCache?: FileCacheBuffer): Promise {
const preloadModelData: Uint8Array|undefined =
fileCache && fileCache[modelFilePath] ? fileCache[modelFilePath] : undefined;
Logger.verbose(
@@ -232,9 +233,8 @@ export class ModelTestContext {
/**
* create a ModelTestContext object that used in every test cases in the given ModelTest.
*/
- static async create(
- modelTest: Test.ModelTest, profile: boolean,
- sessionOptions?: ort.InferenceSession.SessionOptions): Promise {
+ static async create(modelTest: Test.ModelTest, profile: boolean, testOptions?: Test.Options):
+ Promise {
if (this.initializing) {
throw new Error('cannot create a ModelTestContext object when the previous creation is not done');
}
@@ -243,8 +243,12 @@ export class ModelTestContext {
this.initializing = true;
const initStart = now();
+ const executionProviderConfig =
+ modelTest.backend === 'webnn' ? (testOptions?.webnnOptions || 'webnn') : modelTest.backend!;
const session = await initializeSession(
- modelTest.modelUrl, modelTest.backend!, modelTest.ioBinding, profile, sessionOptions || {}, this.cache);
+ modelTest.modelUrl, executionProviderConfig, modelTest.ioBinding, profile, testOptions?.sessionOptions || {},
+ this.cache);
+
const initEnd = now();
for (const testCase of modelTest.cases) {
diff --git a/js/web/test/test-types.ts b/js/web/test/test-types.ts
index 5bdc8d84cc7a5..cd008e82e570b 100644
--- a/js/web/test/test-types.ts
+++ b/js/web/test/test-types.ts
@@ -143,6 +143,7 @@ export declare namespace Test {
cudaFlags?: Record;
wasmOptions?: InferenceSession.WebAssemblyExecutionProviderOption;
webglOptions?: InferenceSession.WebGLExecutionProviderOption;
+ webnnOptions?: InferenceSession.WebNNExecutionProviderOption;
globalEnvFlags?: EnvOptions;
}
diff --git a/onnxruntime/contrib_ops/cpu/bert/attention_common.h b/onnxruntime/contrib_ops/cpu/bert/attention_common.h
index c9ed23895b60c..da489a6901512 100644
--- a/onnxruntime/contrib_ops/cpu/bert/attention_common.h
+++ b/onnxruntime/contrib_ops/cpu/bert/attention_common.h
@@ -133,6 +133,10 @@ constexpr const char* kMinSeqLenForFlashAttentionPackedQKV = "ORT_MIN_SEQ_LEN_FL
// Default value for the above setting.
constexpr int kDefaultMinSeqLenForFlashAttentionPackedQKV = 513;
+// Environment variable to enable loading more KV data in flight in
+// DecoderMaskedMultiHeadAttention/DecoderMaskedSelfAttention kernels
+constexpr const char* kDecoderMaskedAttentionLoadKVDataInFlight = "ORT_DECODER_MASKED_ATTENTION_LOAD_KV_DATA_IN_FLIGHT";
+
} // namespace attention
} // namespace contrib
diff --git a/onnxruntime/contrib_ops/cuda/bert/decoder_masked_multihead_attention.cc b/onnxruntime/contrib_ops/cuda/bert/decoder_masked_multihead_attention.cc
index 54aad9cbaf387..a9b60da0c96ca 100644
--- a/onnxruntime/contrib_ops/cuda/bert/decoder_masked_multihead_attention.cc
+++ b/onnxruntime/contrib_ops/cuda/bert/decoder_masked_multihead_attention.cc
@@ -70,6 +70,10 @@ Status DecoderMaskedMultiHeadAttention::ComputeInternal(OpKernelContext*
auto& device_prop = GetDeviceProp();
DecoderMaskedMultiHeadAttentionParams parameters;
+
+ parameters.kv_data_in_flight = ParseEnvironmentVariableWithDefault(
+ attention::kDecoderMaskedAttentionLoadKVDataInFlight, false);
+
bool is_dmmha_packing = (key == nullptr && value == nullptr);
ORT_RETURN_IF_ERROR(multihead_attention_helper::CheckInputs(query,
key,
diff --git a/onnxruntime/contrib_ops/cuda/bert/decoder_masked_self_attention.cc b/onnxruntime/contrib_ops/cuda/bert/decoder_masked_self_attention.cc
index 69ed07101e647..72ede2e22b557 100644
--- a/onnxruntime/contrib_ops/cuda/bert/decoder_masked_self_attention.cc
+++ b/onnxruntime/contrib_ops/cuda/bert/decoder_masked_self_attention.cc
@@ -52,6 +52,10 @@ Status DecoderMaskedSelfAttention::ComputeInternal(OpKernelContext* cont
auto& device_prop = GetDeviceProp();
DecoderMaskedMultiHeadAttentionParams parameters;
+
+ parameters.kv_data_in_flight = ParseEnvironmentVariableWithDefault(
+ attention::kDecoderMaskedAttentionLoadKVDataInFlight, false);
+
ORT_RETURN_IF_ERROR(CheckInputs(input->Shape(),
weights->Shape(),
bias->Shape(),
diff --git a/onnxruntime/contrib_ops/cuda/bert/fastertransformer_decoder_attention/decoder_masked_multihead_attention_impl.cu b/onnxruntime/contrib_ops/cuda/bert/fastertransformer_decoder_attention/decoder_masked_multihead_attention_impl.cu
index 33e7a33494778..9efb6f08e8e99 100644
--- a/onnxruntime/contrib_ops/cuda/bert/fastertransformer_decoder_attention/decoder_masked_multihead_attention_impl.cu
+++ b/onnxruntime/contrib_ops/cuda/bert/fastertransformer_decoder_attention/decoder_masked_multihead_attention_impl.cu
@@ -344,52 +344,148 @@ __global__ void masked_multihead_attention_kernel(DecoderMaskedMultiHeadAttentio
bool has_beams = params.cache_indir != nullptr && !params.is_cross_attention;
const int* beam_indices = has_beams ? ¶ms.cache_indir[bi_max_seq_length] : nullptr;
- for (int ti = ko; ti < ti_end; ti += K_PER_ITER) {
- bool is_masked = (params.mask != nullptr) && (params.mask[bi_total_seq_length + ti] == 0);
+ if (!params.kv_data_in_flight) {
+ for (int ti = ko; ti < ti_end; ti += K_PER_ITER) {
+ bool is_masked = (params.mask != nullptr) && (params.mask[bi_total_seq_length + ti] == 0);
- // The keys loaded from the key cache.
- K_vec_k k_vec[K_VECS_PER_THREAD];
- if (ti < tlength) {
- if (has_beams) {
- const int beam_offset = beam_indices[ti] * params.num_heads * params.max_sequence_length * head_size;
+ // The keys loaded from the key cache.
+ K_vec_k k_vec[K_VECS_PER_THREAD];
+ if (ti < tlength) {
+ if (has_beams) {
+ const int beam_offset = beam_indices[ti] * params.num_heads * params.max_sequence_length * head_size;
#pragma unroll
- for (int ii = 0; ii < K_VECS_PER_THREAD; ++ii) {
- int jj = ii * params.max_sequence_length + ti;
+ for (int ii = 0; ii < K_VECS_PER_THREAD; ++ii) {
+ int jj = ii * params.max_sequence_length + ti;
- k_vec[ii] = vec_conversion(
- (*reinterpret_cast(&k_cache_batch[beam_offset + jj * QK_ELTS_IN_16B])));
- }
- } else {
+ k_vec[ii] = vec_conversion(
+ (*reinterpret_cast(&k_cache_batch[beam_offset + jj * QK_ELTS_IN_16B])));
+ }
+ } else {
#pragma unroll
- for (int ii = 0; ii < K_VECS_PER_THREAD; ++ii) {
- int jj = ii * params.max_sequence_length + ti;
+ for (int ii = 0; ii < K_VECS_PER_THREAD; ++ii) {
+ int jj = ii * params.max_sequence_length + ti;
- k_vec[ii] = vec_conversion(
- (*reinterpret_cast(&k_cache_batch[jj * QK_ELTS_IN_16B])));
+ k_vec[ii] = vec_conversion(
+ (*reinterpret_cast(&k_cache_batch[jj * QK_ELTS_IN_16B])));
+ }
}
}
- }
- // Perform the dot product and normalize qk.
- // WARNING: ALL THE THREADS OF A WARP MUST ENTER!!!
- float qk = Qk_dot::dot(q_vec, k_vec) * inv_sqrt_dh;
+ // Perform the dot product and normalize qk.
+ // WARNING: ALL THE THREADS OF A WARP MUST ENTER!!!
+ float qk = Qk_dot::dot(q_vec, k_vec) * inv_sqrt_dh;
- // This is a deviation from FasterTransformer kernel implementation
- // but this aligns with ORT's other Attention kernels which strives to
- // mimic PyTorch when dealing with mask filter values
- if (is_masked) {
- qk += params.mask_filter_value;
+ // This is a deviation from FasterTransformer kernel implementation
+ // but this aligns with ORT's other Attention kernels which strives to
+ // mimic PyTorch when dealing with mask filter values
+ if (is_masked) {
+ qk += params.mask_filter_value;
+ }
+
+ // Store the product to shared memory. There's one qk value per timestep. Update the max.
+ if (ti < tlength && tidx % THREADS_PER_KEY == 0) {
+ if (params.relative_attention_bias != nullptr) {
+ qk = add_vec(qk,
+ reinterpret_cast(params.relative_attention_bias)[hi * params.sequence_length * params.total_sequence_length + ti]);
+ }
+ qk_max = fmaxf(qk_max, qk);
+ qk_smem[ti] = qk;
+ }
}
+ } else {
+ // TODO(hasesh): Tune this value for different workloads. Currently, it is tuned for Whisper model
+ // Also tune it for different architectures. This works best for Whisper on 80GB A100.
+ constexpr int K_CACHE_DATA_LOAD_UNROLL = 4;
- // Store the product to shared memory. There's one qk value per timestep. Update the max.
- if (ti < tlength && tidx % THREADS_PER_KEY == 0) {
- if (params.relative_attention_bias != nullptr) {
- qk = add_vec(qk,
- reinterpret_cast(params.relative_attention_bias)[hi * params.sequence_length * params.total_sequence_length + ti]);
+ for (int ti = ko; ti < ti_end; ti += (K_CACHE_DATA_LOAD_UNROLL * K_PER_ITER)) {
+ int is_masked[K_CACHE_DATA_LOAD_UNROLL];
+ int beam_offset[K_CACHE_DATA_LOAD_UNROLL];
+ int time_step[K_CACHE_DATA_LOAD_UNROLL];
+ bool time_bounds_cond[K_CACHE_DATA_LOAD_UNROLL];
+
+#pragma unroll
+ for (int k_unroll = 0; k_unroll < K_CACHE_DATA_LOAD_UNROLL; ++k_unroll) {
+ is_masked[k_unroll] = 1;
+ beam_offset[k_unroll] = 0;
+ time_step[k_unroll] = ti + k_unroll * K_PER_ITER;
+ time_bounds_cond[k_unroll] = (time_step[k_unroll] < tlength);
+ }
+
+#pragma unroll
+ for (int k_unroll = 0; k_unroll < K_CACHE_DATA_LOAD_UNROLL; ++k_unroll) {
+ if (time_bounds_cond[k_unroll] && params.mask != nullptr) {
+ is_masked[k_unroll] = params.mask[bi_total_seq_length + time_step[k_unroll]];
+ }
+ }
+
+ if (has_beams) {
+ int head_maxlength_headsize_prod = params.num_heads * params.max_sequence_length * head_size;
+
+#pragma unroll
+ for (int k_unroll = 0; k_unroll < K_CACHE_DATA_LOAD_UNROLL; ++k_unroll) {
+ if (time_bounds_cond[k_unroll]) {
+ beam_offset[k_unroll] = beam_indices[time_step[k_unroll]] * head_maxlength_headsize_prod;
+ }
+ }
+ }
+
+ // The keys loaded from the key cache.
+ K_vec_k k_vec[K_CACHE_DATA_LOAD_UNROLL][K_VECS_PER_THREAD];
+
+#pragma unroll
+ for (int k_unroll = 0; k_unroll < K_CACHE_DATA_LOAD_UNROLL; ++k_unroll) {
+ if (time_bounds_cond[k_unroll]) {
+ if (has_beams) {
+#pragma unroll
+ for (int ii = 0; ii < K_VECS_PER_THREAD; ++ii) {
+ int jj = ii * params.max_sequence_length + time_step[k_unroll];
+
+ k_vec[k_unroll][ii] = vec_conversion(
+ (*reinterpret_cast(&k_cache_batch[beam_offset[k_unroll] + jj * QK_ELTS_IN_16B])));
+ }
+ } else {
+#pragma unroll
+ for (int ii = 0; ii < K_VECS_PER_THREAD; ++ii) {
+ int jj = ii * params.max_sequence_length + time_step[k_unroll];
+
+ k_vec[k_unroll][ii] = vec_conversion(
+ (*reinterpret_cast(&k_cache_batch[jj * QK_ELTS_IN_16B])));
+ }
+ }
+ }
+ }
+
+ // Perform the dot product and normalize qk.
+ // WARNING: ALL THE THREADS OF A WARP MUST ENTER!!!
+ float qk[K_CACHE_DATA_LOAD_UNROLL];
+#pragma unroll
+ for (int k_unroll = 0; k_unroll < K_CACHE_DATA_LOAD_UNROLL; ++k_unroll) {
+ qk[k_unroll] = Qk_dot::dot(q_vec, k_vec[k_unroll]) * inv_sqrt_dh;
+ }
+
+// This is a deviation from FasterTransformer kernel implementation
+// but this aligns with ORT's other Attention kernels which strives to
+// mimic PyTorch when dealing with mask filter values
+#pragma unroll
+ for (int k_unroll = 0; k_unroll < K_CACHE_DATA_LOAD_UNROLL; ++k_unroll) {
+ if (time_bounds_cond[k_unroll] && is_masked[k_unroll] == 0) {
+ qk[k_unroll] += params.mask_filter_value;
+ }
+ }
+
+// Store the product to shared memory. There's one qk value per timestep. Update the max.
+#pragma unroll
+ for (int k_unroll = 0; k_unroll < K_CACHE_DATA_LOAD_UNROLL; ++k_unroll) {
+ if (time_bounds_cond[k_unroll] && (tidx % THREADS_PER_KEY == 0)) {
+ if (params.relative_attention_bias != nullptr) {
+ qk[k_unroll] = add_vec(qk[k_unroll],
+ reinterpret_cast(params.relative_attention_bias)[hi * params.sequence_length * params.total_sequence_length + time_step[k_unroll]]);
+ }
+ qk_max = fmaxf(qk_max, qk[k_unroll]);
+ qk_smem[time_step[k_unroll]] = qk[k_unroll];
+ }
}
- qk_max = fmaxf(qk_max, qk);
- qk_smem[ti] = qk;
}
}
@@ -504,18 +600,80 @@ __global__ void masked_multihead_attention_kernel(DecoderMaskedMultiHeadAttentio
V_vec_acum out;
zero(out);
- // Loop over the timesteps to compute the partial outputs.
- for (int ti = vo; ti < tlength; ti += V_PER_ITER) {
- // Fetch offset based on cache_indir when beam sampling
- const int beam_src = has_beams ? params.cache_indir[bi_max_seq_length + ti] : 0;
- const int beam_offset = has_beams ? beam_src * params.num_heads * params.max_sequence_length * head_size : 0;
+ if (!params.kv_data_in_flight) {
+ // Loop over the timesteps to compute the partial outputs.
+ for (int ti = vo; ti < tlength; ti += V_PER_ITER) {
+ // Fetch offset based on cache_indir when beam sampling
+ const int beam_src = has_beams ? params.cache_indir[bi_max_seq_length + ti] : 0;
+ const int beam_offset = has_beams ? beam_src * params.num_heads * params.max_sequence_length * head_size : 0;
+
+ // Load the values from the cache.
+ V_vec_k v = vec_conversion(*reinterpret_cast(&v_cache_batch[beam_offset + ti * head_size]));
+
+ // Load the logits from shared memory.
+ T logit = logits_smem[ti];
+ out = fma(logit, v, out);
+ }
+ } else {
+ // Loop over the timesteps to compute the partial outputs.
+
+ // TODO(hasesh): Tune this value for different workloads. Currently, it is tuned for Whisper model
+ // Also tune it for different architectures. This works best for Whisper on 80GB A100.
+ constexpr int V_CACHE_DATA_LOAD_UNROLL = 8;
+
+ for (int ti = vo; ti < tlength; ti += V_CACHE_DATA_LOAD_UNROLL * V_PER_ITER) {
+ int beam_src[V_CACHE_DATA_LOAD_UNROLL];
+ int beam_offset[V_CACHE_DATA_LOAD_UNROLL];
+ int time_step[V_CACHE_DATA_LOAD_UNROLL];
+ bool time_bounds_cond[V_CACHE_DATA_LOAD_UNROLL];
+
+#pragma unroll
+ for (int v_unroll = 0; v_unroll < V_CACHE_DATA_LOAD_UNROLL; ++v_unroll) {
+ beam_src[v_unroll] = 0;
+ beam_offset[v_unroll] = 0;
+ time_step[v_unroll] = ti + v_unroll * V_PER_ITER;
+ time_bounds_cond[v_unroll] = (time_step[v_unroll] < tlength);
+ }
+
+ int head_maxlength_headsize_prod = params.num_heads * params.max_sequence_length * head_size;
+
+ if (has_beams) {
+// Do the global memory read and corresponding compute in separate unrolled loops
+#pragma unroll
+ for (int v_unroll = 0; v_unroll < V_CACHE_DATA_LOAD_UNROLL; ++v_unroll) {
+ if (time_bounds_cond[v_unroll]) {
+ beam_src[v_unroll] = params.cache_indir[bi_max_seq_length + time_step[v_unroll]];
+ }
+ }
+
+#pragma unroll
+ for (int v_unroll = 0; v_unroll < V_CACHE_DATA_LOAD_UNROLL; ++v_unroll) {
+ if (time_bounds_cond[v_unroll]) {
+ beam_offset[v_unroll] = beam_src[v_unroll] * head_maxlength_headsize_prod;
+ }
+ }
+ }
- // Load the values from the cache.
- V_vec_k v = vec_conversion(*reinterpret_cast(&v_cache_batch[beam_offset + ti * head_size]));
+ // Load the values from the V-cache and logits from shared memory.
+ V_vec_k v[V_CACHE_DATA_LOAD_UNROLL];
+ T logits[V_CACHE_DATA_LOAD_UNROLL];
- // Load the logits from shared memory.
- T logit = logits_smem[ti];
- out = fma(logit, v, out);
+// Do the global memory read and compute in separate unrolled loops
+#pragma unroll
+ for (int v_unroll = 0; v_unroll < V_CACHE_DATA_LOAD_UNROLL; ++v_unroll) {
+ if (time_bounds_cond[v_unroll]) {
+ v[v_unroll] = vec_conversion(*reinterpret_cast(&v_cache_batch[beam_offset[v_unroll] + time_step[v_unroll] * head_size]));
+ logits[v_unroll] = logits_smem[time_step[v_unroll]];
+ }
+ }
+
+#pragma unroll
+ for (int v_unroll = 0; v_unroll < V_CACHE_DATA_LOAD_UNROLL; ++v_unroll) {
+ if (time_bounds_cond[v_unroll]) {
+ out = fma(logits[v_unroll], v[v_unroll], out);
+ }
+ }
+ }
}
// One group of threads computes the product(s) for the current timestep.
diff --git a/onnxruntime/contrib_ops/cuda/bert/fastertransformer_decoder_attention/decoder_masked_multihead_attention_impl.h b/onnxruntime/contrib_ops/cuda/bert/fastertransformer_decoder_attention/decoder_masked_multihead_attention_impl.h
index 4b408dafa2d81..1a17757d1ec2d 100644
--- a/onnxruntime/contrib_ops/cuda/bert/fastertransformer_decoder_attention/decoder_masked_multihead_attention_impl.h
+++ b/onnxruntime/contrib_ops/cuda/bert/fastertransformer_decoder_attention/decoder_masked_multihead_attention_impl.h
@@ -22,6 +22,12 @@ struct DecoderMaskedMultiHeadAttentionParams : AttentionParameters {
bool is_cross_attention = false;
bool is_packed_qkv = false;
+ // Useful to better use global memory bandwidth on certain CUDA architectures.
+ // Turned off by default for now until we fully understand performance implications
+ // for all types of workloads.
+ // Can be turned on by appropriate environment variable (see attention_common.h).
+ bool kv_data_in_flight = false;
+
void* q = nullptr;
void* q_bias = nullptr;
@@ -62,4 +68,4 @@ void mmha_launch_kernel(const DecoderMaskedMultiHeadAttentionParams& params, cud
} // namespace cuda
} // namespace contrib
-} // namespace onnxruntime
+} // namespace onnxruntime
\ No newline at end of file
diff --git a/onnxruntime/contrib_ops/rocm/fused_conv.cc b/onnxruntime/contrib_ops/rocm/fused_conv.cc
index d597e0d57fbcb..63804f79a32fb 100644
--- a/onnxruntime/contrib_ops/rocm/fused_conv.cc
+++ b/onnxruntime/contrib_ops/rocm/fused_conv.cc
@@ -76,7 +76,12 @@ struct FNVHash {
void HashConvolutionDescriptor(miopenConvolutionDescriptor_t cdesc) {
int spatial_dim = 1;
#if ROCM_VERSION >= 50500
- miopenGetConvolutionSpatialDim(cdesc, &spatial_dim);
+ MIOPEN_CALL(miopenGetConvolutionSpatialDim(cdesc, &spatial_dim));
+ std::vector pads{spatial_dim};
+ std::vector strides{spatial_dim};
+ std::vector dilations{spatial_dim};
+ miopenConvolutionMode_t mode;
+ MIOPEN_CALL(miopenGetConvolutionNdDescriptor(cdesc, spatial_dim, &spatial_dim, pads.data(), strides.data(), dilations.data(), &mode));
#else
// Previous versions of MIOpen doesn't provide API to probe the dimension of a
// miopenConvolutionDescriptor_t, so we have to guess.
@@ -100,11 +105,12 @@ struct FNVHash {
pads.resize(spatial_dim);
strides.resize(spatial_dim);
dilations.resize(spatial_dim);
+#endif
(*this) << spatial_dim;
(*this) << pads;
(*this) << strides;
(*this) << dilations;
-#endif
+ (*this) << mode;
}
private:
@@ -313,6 +319,8 @@ class FusedConv : public onnxruntime::rocm::Conv {
auto ret = miopenCompileFusionPlan(handle, fusion->plan);
if (miopenStatusSuccess == ret) {
fusion->compiled_on.insert(handle);
+ } else {
+ return ret;
}
return miopenStatusSuccess;
}
diff --git a/onnxruntime/core/framework/session_options.h b/onnxruntime/core/framework/session_options.h
index 40c59cfcf699d..796a018ac0f68 100644
--- a/onnxruntime/core/framework/session_options.h
+++ b/onnxruntime/core/framework/session_options.h
@@ -65,6 +65,11 @@ struct FreeDimensionOverride {
* Configuration information for a session.
*/
struct SessionOptions {
+#if defined(__wasm__) && defined(__EMSCRIPTEN_PTHREADS__)
+ static constexpr bool DEFAULT_USE_PER_SESSION_THREADS = false;
+#else
+ static constexpr bool DEFAULT_USE_PER_SESSION_THREADS = true;
+#endif
ExecutionMode execution_mode = ExecutionMode::ORT_SEQUENTIAL;
// set the execution order of the graph
@@ -129,7 +134,8 @@ struct SessionOptions {
// By default the session uses its own set of threadpools, unless this is set to false.
// Use this in conjunction with the CreateEnvWithGlobalThreadPools API.
- bool use_per_session_threads = true;
+ bool use_per_session_threads = DEFAULT_USE_PER_SESSION_THREADS;
+
bool thread_pool_allow_spinning = true;
// Deterministic compute is likely not as performant. This option is default to false.
diff --git a/onnxruntime/core/mlas/lib/sqnbitgemm.h b/onnxruntime/core/mlas/lib/sqnbitgemm.h
index f8f7dcd43699f..90fdd710e2773 100644
--- a/onnxruntime/core/mlas/lib/sqnbitgemm.h
+++ b/onnxruntime/core/mlas/lib/sqnbitgemm.h
@@ -232,7 +232,7 @@ MlasSQNBitGemmOperation(
size_t RowsRemaining = RangeCountM;
while (RowsRemaining > 0) {
-#if defined(MLAS_TARGET_AMD64_IX86) || defined(MLAS_TARGET_POWER)
+#if defined(MLAS_TARGET_AMD64_IX86) || defined(MLAS_TARGET_POWER) || defined(MLAS_TARGET_LARCH64)
auto RowsHandled = GetMlasPlatform().GemmFloatKernel(
a_row, dequant_b, c_blk, K, RowsRemaining, CountN, lda, ldc, 1.f, true
);
diff --git a/onnxruntime/core/optimizer/qdq_transformer/ensure_unique_dq_for_node_unit.cc b/onnxruntime/core/optimizer/qdq_transformer/ensure_unique_dq_for_node_unit.cc
index cc0f7854791d4..9d53e28921784 100644
--- a/onnxruntime/core/optimizer/qdq_transformer/ensure_unique_dq_for_node_unit.cc
+++ b/onnxruntime/core/optimizer/qdq_transformer/ensure_unique_dq_for_node_unit.cc
@@ -53,7 +53,7 @@ Status DuplicateDQForOutputEdge(const graph_utils::GraphEdge& original_dq_output
MakeString("Added by ", kTransformerName),
dq_inputs,
{&new_dq_output_nodearg},
- nullptr, // attributes
+ &original_dq_node.GetAttributes(),
original_dq_node.Domain());
// set up edges
diff --git a/onnxruntime/core/providers/acl/math/gemm.h b/onnxruntime/core/providers/acl/math/gemm.h
index d2f297e83aedb..f5288d7f231b0 100644
--- a/onnxruntime/core/providers/acl/math/gemm.h
+++ b/onnxruntime/core/providers/acl/math/gemm.h
@@ -49,11 +49,18 @@ class Gemm : public onnxruntime::Gemm {
}
Status Compute(OpKernelContext* context) const override {
+#ifdef ACL_2308
+ if (this->packed_b_) {
+ // Prepacked RHS not supported, defaulting to cpu execution provider
+ return onnxruntime::Gemm::Compute(context);
+ }
+#endif
const auto A = context->Input(0);
const auto B = context->Input(1);
const auto C = context->Input(2);
- GemmHelper helper(A->Shape(), trans_A_ != CblasNoTrans, B->Shape(), trans_B_ != CblasNoTrans, C->Shape());
+ GemmHelper helper(A->Shape(), trans_A_ != CblasNoTrans, B->Shape(), trans_B_ != CblasNoTrans,
+ C != nullptr ? C->Shape() : TensorShape({}));
if (!helper.State().IsOK())
return helper.State();
@@ -70,7 +77,7 @@ class Gemm : public onnxruntime::Gemm {
return onnxruntime::Gemm::Compute(context);
}
- arm_compute::TensorShape cShape = ACLTensorShape(C->Shape());
+ arm_compute::TensorShape cShape = ACLTensorShape(C != nullptr ? C->Shape() : TensorShape({}));
if (useC &&
(cShape.num_dimensions() > 2 ||
(cShape.num_dimensions() == 2 && cShape[0] > 1 && cShape[1] > 1))) { // Multi-dimensional Bias
@@ -89,8 +96,13 @@ class Gemm : public onnxruntime::Gemm {
(cShape[1] == 1 && cShape[0] != (long unsigned int)N)) {
return onnxruntime::Gemm::Compute(context);
}
+#ifdef ACL_2308
+ cShape = arm_compute::TensorShape(N);
+ LOGS_DEFAULT(VERBOSE) << "Bias reshaped to: {" << N << "}";
+#else
cShape = arm_compute::TensorShape(1, N);
LOGS_DEFAULT(VERBOSE) << "Bias reshaped to: {1," << N << "}";
+#endif
}
int64_t K = helper.K();
diff --git a/onnxruntime/core/providers/acl/nn/batch_norm.cc b/onnxruntime/core/providers/acl/nn/batch_norm.cc
index da7fff730c96f..eb6a10074f1db 100755
--- a/onnxruntime/core/providers/acl/nn/batch_norm.cc
+++ b/onnxruntime/core/providers/acl/nn/batch_norm.cc
@@ -44,6 +44,16 @@ Status BatchNorm::Compute(OpKernelContext* context) const {
const Tensor* M = context->Input(3); // mean
const Tensor* V = context->Input(4); // var
+ if (S->Shape().NumDimensions() > 1) {
+ LOGS_DEFAULT(WARNING) << "ACL does not support scale with dimension greater than 1; defaulting to cpu implementation";
+ return onnxruntime::BatchNorm::Compute(context);
+ }
+
+ if (this->is_train_) {
+ LOGS_DEFAULT(WARNING) << "ACL does not have batchnorm training support; defaulting to cpu implementation";
+ return onnxruntime::BatchNorm::Compute(context);
+ }
+
ORT_RETURN_IF_ERROR(BatchNormHelper::ValidateInputs(X, S, B, M, V));
LOGS_DEFAULT(VERBOSE) << "BatchNorm ACL:";
@@ -70,7 +80,23 @@ Status BatchNorm::Compute(OpKernelContext* context) const {
auto layer = std::make_shared();
+#ifdef ACL_2308
+ arm_compute::TensorShape in_x_shape;
+ const TensorShape& x_shape = X->Shape();
+ const auto& dims_vec = x_shape.GetDims();
+ in_x_shape.set(3, onnxruntime::narrow(dims_vec[0])); // N
+ in_x_shape.set(1, 1); // H
+ size_t W = 1;
+ for (size_t i = 2; i < dims_vec.size(); ++i) {
+ W *= narrow(dims_vec[i]);
+ }
+ in_x_shape.set(0, W); // W
+ in_x_shape.set(2, onnxruntime::narrow(dims_vec[1])); // C
+
+ tbatch_norm.in->allocator()->init(arm_compute::TensorInfo(in_x_shape, arm_compute::Format::F32));
+#else
tbatch_norm.in->allocator()->init(arm_compute::TensorInfo(ACLTensorShape(X->Shape()), arm_compute::Format::F32));
+#endif
tbatch_norm.out->allocator()->init(arm_compute::TensorInfo(tbatch_norm.in->info()->tensor_shape(), arm_compute::Format::F32));
tbatch_norm.scale->allocator()->init(arm_compute::TensorInfo(ACLTensorShape(S->Shape()), arm_compute::Format::F32));
@@ -132,11 +158,7 @@ ONNX_OPERATOR_VERSIONED_KERNEL_EX(
7, 9,
kAclExecutionProvider,
KernelDefBuilder()
- .TypeConstraint("X", DataTypeImpl::GetTensorType())
- .TypeConstraint("scale", DataTypeImpl::GetTensorType())
- .TypeConstraint("B", DataTypeImpl::GetTensorType())
- .TypeConstraint("mean", DataTypeImpl::GetTensorType())
- .TypeConstraint("var", DataTypeImpl::GetTensorType()),
+ .TypeConstraint("T", DataTypeImpl::GetTensorType()),
BatchNorm);
} // namespace acl
diff --git a/onnxruntime/core/providers/acl/nn/batch_norm.h b/onnxruntime/core/providers/acl/nn/batch_norm.h
index c9ec08b67a779..264301976e6dc 100755
--- a/onnxruntime/core/providers/acl/nn/batch_norm.h
+++ b/onnxruntime/core/providers/acl/nn/batch_norm.h
@@ -31,9 +31,9 @@ typedef struct {
typedef std::map::iterator BatchNormLayersIterator;
template
-class BatchNorm final : public OpKernel {
+class BatchNorm : public onnxruntime::BatchNorm {
public:
- explicit BatchNorm(const OpKernelInfo& info) : OpKernel(info) {
+ explicit BatchNorm(const OpKernelInfo& info) : onnxruntime::BatchNorm(info) {
auto st = info.GetAttr("epsilon", &epsilon_);
ORT_ENFORCE(st.IsOK(), st.ErrorMessage());
diff --git a/onnxruntime/core/providers/acl/nn/conv.cc b/onnxruntime/core/providers/acl/nn/conv.cc
index 1613d927d0f74..85bd0cfe96279 100644
--- a/onnxruntime/core/providers/acl/nn/conv.cc
+++ b/onnxruntime/core/providers/acl/nn/conv.cc
@@ -105,7 +105,11 @@ Status Conv::Compute(OpKernelContext* context) const {
TensorShapeVector Y_dims;
Y_dims.insert(Y_dims.begin(), {N, M});
TensorShape input_shape = X->Shape().Slice(2);
+#ifdef ACL_2308
+ ORT_RETURN_IF_ERROR(conv_attrs_.InferPadsAndOutputShape(input_shape, kernel_shape, strides, dilations, pads, Y_dims));
+#else
ORT_RETURN_IF_ERROR(conv_attrs_.InferOutputShape(input_shape, kernel_shape, strides, dilations, pads, Y_dims));
+#endif
Tensor* Y = context->Output(0, TensorShape(Y_dims));
LOGS_DEFAULT(VERBOSE) << "Y " << Y->Shape().ToString().c_str();
@@ -222,6 +226,15 @@ Status Conv::Compute(OpKernelContext* context) const {
1 /* depth multiplier */,
acl_activ_enabled ? arm_compute::ActivationLayerInfo(acl_activ_func, conv_attrs_.alpha) : arm_compute::ActivationLayerInfo(),
arm_compute::Size2D(aclDilation0, dilations[0])));
+#elif defined(ACL_2308)
+ bool optimizable = bool(arm_compute::NEDepthwiseConvolutionLayer::validate(tconv.in->info(),
+ tconv.k->info(),
+ (B != nullptr) ? tconv.b->info() : nullptr,
+ tconv.out->info(),
+ aclPadStride,
+ 1 /* depth multiplier */,
+ acl_activ_enabled ? arm_compute::ActivationLayerInfo(acl_activ_func, conv_attrs_.alpha) : arm_compute::ActivationLayerInfo(),
+ arm_compute::Size2D(aclDilation0, dilations[0])));
#endif
if (optimizable) {
@@ -230,7 +243,7 @@ Status Conv::Compute(OpKernelContext* context) const {
auto layer = std::make_shared();
#elif defined(ACL_1908)
auto layer = std::make_shared();
-#elif defined(ACL_2002)
+#elif defined(ACL_2002) || defined(ACL_2308)
auto layer = std::make_shared();
#endif
@@ -238,7 +251,7 @@ Status Conv::Compute(OpKernelContext* context) const {
layer->configure(tconv.in.get(), tconv.k.get(), (B != nullptr) ? tconv.b.get() : nullptr, tconv.out.get(),
aclPadStride, 1 /* depth multiplier */,
acl_activ_enabled ? arm_compute::ActivationLayerInfo(acl_activ_func, conv_attrs_.alpha) : arm_compute::ActivationLayerInfo());
-#elif defined(ACL_1905) || defined(ACL_1908) || defined(ACL_2002)
+#elif defined(ACL_1905) || defined(ACL_1908) || defined(ACL_2002) || defined(ACL_2308)
layer->configure(tconv.in.get(), tconv.k.get(), (B != nullptr) ? tconv.b.get() : nullptr, tconv.out.get(),
aclPadStride, 1 /* depth multiplier */,
acl_activ_enabled ? arm_compute::ActivationLayerInfo(acl_activ_func, conv_attrs_.alpha) : arm_compute::ActivationLayerInfo(),
diff --git a/onnxruntime/core/providers/acl/nn/conv.h b/onnxruntime/core/providers/acl/nn/conv.h
index ecb11fb3c8f4e..660d47b4172df 100644
--- a/onnxruntime/core/providers/acl/nn/conv.h
+++ b/onnxruntime/core/providers/acl/nn/conv.h
@@ -8,6 +8,9 @@
#include "core/providers/acl/acl_execution_provider.h"
// ACL
+#ifdef ACL_2308
+#include "arm_compute/runtime/Tensor.h"
+#endif
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/runtime/TensorAllocator.h"
#include "arm_compute/runtime/Allocator.h"
diff --git a/onnxruntime/core/providers/acl/nn/pool.cc b/onnxruntime/core/providers/acl/nn/pool.cc
index dc79ae65bf21e..8fbcba3ed87a7 100644
--- a/onnxruntime/core/providers/acl/nn/pool.cc
+++ b/onnxruntime/core/providers/acl/nn/pool.cc
@@ -61,7 +61,14 @@ ACLNEPool PoolOperation(onnxruntime::OpKernelContext* context,
tpool.out->allocator()->init(arm_compute::TensorInfo(ACLTensorShape(Y->Shape(), PREF_DIM), arm_compute::Format::F32));
if (pool_attrs.global_pooling) {
- layer->configure(tpool.in.get(), tpool.out.get(), arm_compute::PoolingLayerInfo(pool_type));
+ layer->configure(tpool.in.get(),
+ tpool.out.get(),
+ arm_compute::PoolingLayerInfo(pool_type
+#ifdef ACL_2308
+ ,
+ arm_compute::DataLayout::NCHW
+#endif
+ ));
} else {
TensorShapeVector aclStrides(2);
aclStrides[0] = (strides.size() == 2) ? strides[1] : 1;
@@ -104,7 +111,13 @@ ACLNEPool PoolOperation(onnxruntime::OpKernelContext* context,
LOGS_DEFAULT(VERBOSE) << "strides: {" << aclStrides[0] << "," << aclStrides[1] << "}";
LOGS_DEFAULT(VERBOSE) << "excludePadding: " << excludePadding;
- arm_compute::PoolingLayerInfo pool_info(pool_type, aclSize, aclPadStride, excludePadding);
+ arm_compute::PoolingLayerInfo pool_info(pool_type,
+ aclSize,
+#ifdef ACL_2308
+ arm_compute::DataLayout::NCHW,
+#endif
+ aclPadStride,
+ excludePadding);
layer->configure(tpool.in.get(), tpool.out.get(), pool_info);
}
diff --git a/onnxruntime/core/providers/acl/tensor/concat.cc b/onnxruntime/core/providers/acl/tensor/concat.cc
index 081472729cfcf..75eedaac80aea 100644
--- a/onnxruntime/core/providers/acl/tensor/concat.cc
+++ b/onnxruntime/core/providers/acl/tensor/concat.cc
@@ -10,6 +10,8 @@
#include "core/providers/acl/acl_common.h"
#include "core/providers/acl/acl_fwd.h"
+#include
+
#define PREF_DIM 4
namespace onnxruntime {
@@ -22,17 +24,27 @@ Status Concat::Compute(OpKernelContext* ctx) const {
return onnxruntime::Concat::Compute(ctx);
}
+ if (axis_ < 0) {
+ LOGS_DEFAULT(WARNING) << "ACL does not have support for negative axis; defaulting to cpu implementation";
+ return onnxruntime::Concat::Compute(ctx);
+ }
+
// Number of input tensors to concatenate
auto input_count = Node().InputArgCount().front();
// Hold pointers to the input tensors to be used in the PrepareForCompute() step
std::vector input_tensors;
- input_tensors.reserve(input_count);
+ int empty_tensors = 0;
for (int i = 0; i < input_count; ++i) {
+ if (ctx->Input(i)->Shape().Size() == 0) {
+ empty_tensors++;
+ continue;
+ }
input_tensors.push_back(ctx->Input(i));
}
+ input_count -= empty_tensors;
- auto output_dims = input_tensors[0]->Shape().AsShapeVector();
+ auto output_dims = ctx->Input(0)->Shape().AsShapeVector();
// 'Concat' mode
if (!is_stack_) {
@@ -64,7 +76,11 @@ Status Concat::Compute(OpKernelContext* ctx) const {
LOGS_DEFAULT(VERBOSE) << "Concat ACL:";
arm_compute::Tensor output;
+#ifdef ACL_2308
+ std::vector inputs_vector;
+#else
std::vector inputs_vector;
+#endif
for (int i = 0; i < input_count; i++) {
arm_compute::Tensor* input = new arm_compute::Tensor();
auto X = input_tensors[i];
@@ -75,7 +91,9 @@ Status Concat::Compute(OpKernelContext* ctx) const {
}
arm_compute::NEConcatenateLayer layer;
- layer.configure(inputs_vector, &output, 3 - axis_);
+ if (input_count > 0) {
+ layer.configure(inputs_vector, &output, 3 - axis_);
+ }
LOGS_DEFAULT(VERBOSE) << "axis: " << axis_;
LOGS_DEFAULT(VERBOSE) << std::endl;
@@ -83,7 +101,11 @@ Status Concat::Compute(OpKernelContext* ctx) const {
for (int i = 0; i < input_count; i++) {
auto X = input_tensors[i];
const T* x_data = X->Data();
+#ifdef ACL_2308
+ arm_compute::Tensor* in = const_cast(static_cast(inputs_vector[i]));
+#else
arm_compute::Tensor* in = static_cast(inputs_vector[i]);
+#endif
if (X->Shape().Size() != 0 && in->info()->has_padding()) {
in->allocator()->allocate();
@@ -101,7 +123,9 @@ Status Concat::Compute(OpKernelContext* ctx) const {
ACLImportMemory(output.allocator(), (void*)y_data, Y->Shape().Size() * 4);
}
- layer.run();
+ if (input_count > 0) {
+ layer.run();
+ }
if (Y->Shape().Size() != 0 && output.info()->has_padding()) {
importDataFromTensor(&output, y_data);
diff --git a/onnxruntime/core/providers/cpu/cpu_execution_provider.cc b/onnxruntime/core/providers/cpu/cpu_execution_provider.cc
index 75f6f8d2eddd5..9cd0b3d0620af 100644
--- a/onnxruntime/core/providers/cpu/cpu_execution_provider.cc
+++ b/onnxruntime/core/providers/cpu/cpu_execution_provider.cc
@@ -989,6 +989,7 @@ class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain,
class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 20, Float8E5M2FNUZ, IsNaN);
#endif
class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 20, IsInf);
+class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 20, StringConcat);
class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 20, RegexFullMatch);
// !!PLEASE READ BELOW!! Following that, add new entries above this comment
@@ -2448,6 +2449,7 @@ Status RegisterOnnxOperatorKernels(KernelRegistry& kernel_registry) {
BuildKernelCreateInfo,
#endif
BuildKernelCreateInfo,
+ BuildKernelCreateInfo,
BuildKernelCreateInfo,
};
diff --git a/onnxruntime/core/providers/cpu/text/string_concat.cc b/onnxruntime/core/providers/cpu/text/string_concat.cc
new file mode 100644
index 0000000000000..bc626f8e055aa
--- /dev/null
+++ b/onnxruntime/core/providers/cpu/text/string_concat.cc
@@ -0,0 +1,60 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
+#include "string_concat.h"
+#include "core/providers/cpu/math/element_wise_ops.h"
+#include "core/common/common.h"
+
+namespace onnxruntime {
+ONNX_CPU_OPERATOR_KERNEL(StringConcat, 20,
+ KernelDefBuilder().TypeConstraint("T", DataTypeImpl::GetTensorType()),
+ StringConcat);
+
+Status StringConcat::Compute(OpKernelContext* context) const {
+ ProcessBroadcastSpanFuncs broadcast_funcs{[](BroadcastHelper& broadcast_helper) {
+ auto x = broadcast_helper.ScalarInput0();
+ auto y = broadcast_helper.SpanInput1();
+ auto y_iter = y.begin();
+ auto output_iter = broadcast_helper.OutputSpan().begin();
+ const auto x_size = x.length();
+ while (y_iter != y.end()) {
+ output_iter->reserve(x_size + y_iter->length());
+ output_iter->append(x);
+ output_iter->append(*y_iter);
+ y_iter++;
+ output_iter++;
+ }
+ },
+ [](BroadcastHelper& broadcast_helper) {
+ auto x = broadcast_helper.SpanInput0();
+ auto x_iter = x.begin();
+ auto y = broadcast_helper.ScalarInput1();
+ auto output_iter = broadcast_helper.OutputSpan().begin();
+ const auto y_size = y.length();
+ while (x_iter != x.end()) {
+ output_iter->reserve(y_size + x_iter->length());
+ output_iter->append(*x_iter);
+ output_iter->append(y);
+ x_iter++;
+ output_iter++;
+ }
+ },
+ [](BroadcastHelper& broadcast_helper) {
+ auto x_iter = broadcast_helper.SpanInput0().begin();
+ auto y_iter = broadcast_helper.SpanInput1().begin();
+ auto output = broadcast_helper.OutputSpan();
+ auto output_iter = output.begin();
+ while (output_iter != output.end()) {
+ output_iter->reserve(x_iter->length() + y_iter->length());
+ output_iter->append(*x_iter);
+ output_iter->append(*y_iter);
+ x_iter++;
+ y_iter++;
+ output_iter++;
+ }
+ }};
+ UntypedBroadcastTwo(*context, broadcast_funcs);
+ return Status::OK();
+}
+
+} // namespace onnxruntime
diff --git a/onnxruntime/core/providers/cpu/text/string_concat.h b/onnxruntime/core/providers/cpu/text/string_concat.h
new file mode 100644
index 0000000000000..63c1ea8a41146
--- /dev/null
+++ b/onnxruntime/core/providers/cpu/text/string_concat.h
@@ -0,0 +1,17 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
+#pragma once
+
+#include "core/framework/op_kernel.h"
+
+namespace onnxruntime {
+
+class StringConcat final : public OpKernel {
+ public:
+ StringConcat(const OpKernelInfo& info) : OpKernel(info) {}
+
+ Status Compute(OpKernelContext* context) const override;
+};
+
+} // namespace onnxruntime
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/GraphDescBuilder.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/GraphDescBuilder.cpp
index adb4fd131119f..c6a15e76f4736 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/GraphDescBuilder.cpp
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/GraphDescBuilder.cpp
@@ -360,7 +360,7 @@ namespace Dml::GraphDescBuilder
// The tensor description's size should be no larger than the constant input unless it was rounded to
// the required alignment.
assert(((constantInput->GetTensorByteSize() + 3) & ~3) >= tensorDesc->totalTensorSizeInBytes);
- size_t minimumConstantSize = std::min(constantInput->GetTensorByteSize(), tensorDesc->totalTensorSizeInBytes);
+ size_t minimumConstantSize = std::min(constantInput->GetTensorByteSize(), gsl::narrow_cast(tensorDesc->totalTensorSizeInBytes));
auto data = static_cast(constantInput->GetData());
std::vector tensorData(data, data + minimumConstantSize);
diff --git a/onnxruntime/core/providers/js/js_kernel.h b/onnxruntime/core/providers/js/js_kernel.h
index 5c2d1f0b881ba..b850bea4bc275 100644
--- a/onnxruntime/core/providers/js/js_kernel.h
+++ b/onnxruntime/core/providers/js/js_kernel.h
@@ -67,6 +67,7 @@ namespace js {
float value; \
ORT_ENFORCE(info.GetAttr(#attr_name, &value));, \
, ({#attr_name : $1}), static_cast(value))
+#define JSEP_HEAP_PTR(ptr) reinterpret_cast(ptr)
// TODO:
// class JsMultiProgramKernel : public OpKernel { /* TBD */ };
diff --git a/onnxruntime/core/providers/js/operators/conv.h b/onnxruntime/core/providers/js/operators/conv.h
index 5c0fbf93a4004..98a530c6b77f6 100644
--- a/onnxruntime/core/providers/js/operators/conv.h
+++ b/onnxruntime/core/providers/js/operators/conv.h
@@ -54,13 +54,13 @@ class ConvBase : public JsKernel {
static_cast(conv_attrs_.group),
static_cast(kernel_shape_0),
static_cast(local_pads.size()),
- reinterpret_cast(local_pads.size() > 0 ? local_pads.data() : nullptr) >> 2,
+ JSEP_HEAP_PTR(local_pads.size() > 0 ? local_pads.data() : nullptr) >> 2,
static_cast(conv_attrs_.strides.size() > 0 ? conv_attrs_.strides[0] : 0),
static_cast(channels_last),
- reinterpret_cast(&w_is_const_),
+ JSEP_HEAP_PTR(&w_is_const_),
conv_attrs_.activation.c_str(),
activation_params.size(),
- reinterpret_cast(activation_params_ptr) >> 2);
+ JSEP_HEAP_PTR(activation_params_ptr) >> 2);
} else {
JSEP_INIT_KERNEL_ATTRIBUTE(Conv, ({
"format" : $11 ? "NHWC" : "NCHW",
@@ -81,14 +81,14 @@ class ConvBase : public JsKernel {
static_cast(kernel_shape_0),
static_cast(kernel_shape_1),
static_cast(local_pads.size()),
- reinterpret_cast(local_pads.size() > 0 ? local_pads.data() : nullptr) >> 2,
+ JSEP_HEAP_PTR(local_pads.size() > 0 ? local_pads.data() : nullptr) >> 2,
static_cast(conv_attrs_.strides.size() > 0 ? conv_attrs_.strides[0] : 0),
static_cast(conv_attrs_.strides.size() > 1 ? conv_attrs_.strides[1] : 0),
static_cast(channels_last),
- reinterpret_cast(&w_is_const_),
+ JSEP_HEAP_PTR(&w_is_const_),
conv_attrs_.activation.c_str(),
activation_params.size(),
- reinterpret_cast(activation_params_ptr) >> 2);
+ JSEP_HEAP_PTR(activation_params_ptr) >> 2);
}
}
diff --git a/onnxruntime/core/providers/js/operators/conv_transpose.h b/onnxruntime/core/providers/js/operators/conv_transpose.h
index 5d30dc851e00f..353a946e95c21 100644
--- a/onnxruntime/core/providers/js/operators/conv_transpose.h
+++ b/onnxruntime/core/providers/js/operators/conv_transpose.h
@@ -64,11 +64,11 @@ class ConvTranspose : public JsKernel {
static_cast(pads_1),
static_cast(strides),
static_cast(channels_last),
- reinterpret_cast(&w_is_const_),
+ JSEP_HEAP_PTR(&w_is_const_),
gsl::narrow_cast(local_output_padding.size()),
- reinterpret_cast(local_output_padding_ptr) >> 2,
+ JSEP_HEAP_PTR(local_output_padding_ptr) >> 2,
gsl::narrow_cast(local_output_shape.size()),
- reinterpret_cast(local_output_shape_ptr) >> 2,
+ JSEP_HEAP_PTR(local_output_shape_ptr) >> 2,
conv_transpose_attrs_.activation.c_str());
} else {
constexpr size_t pads_vec_size = 4;
@@ -114,17 +114,17 @@ class ConvTranspose : public JsKernel {
"activation" : UTF8ToString($13)
}),
static_cast(conv_transpose_attrs_.auto_pad),
- reinterpret_cast(local_dilations.data()) >> 2,
+ JSEP_HEAP_PTR(local_dilations.data()) >> 2,
static_cast(conv_transpose_attrs_.group),
- reinterpret_cast(local_kernel_shape.data()) >> 2,
- reinterpret_cast(local_pads.data()) >> 2,
- reinterpret_cast(local_strides.data()) >> 2,
+ JSEP_HEAP_PTR(local_kernel_shape.data()) >> 2,
+ JSEP_HEAP_PTR(local_pads.data()) >> 2,
+ JSEP_HEAP_PTR(local_strides.data()) >> 2,
static_cast(channels_last),
- reinterpret_cast(&w_is_const_),
+ JSEP_HEAP_PTR(&w_is_const_),
gsl::narrow_cast(local_output_padding.size()),
- reinterpret_cast(local_output_padding_ptr) >> 2,
+ JSEP_HEAP_PTR(local_output_padding_ptr) >> 2,
gsl::narrow_cast(local_output_shape.size()),
- reinterpret_cast(local_output_shape_ptr) >> 2,
+ JSEP_HEAP_PTR(local_output_shape_ptr) >> 2,
conv_transpose_attrs_.activation.c_str());
}
}
diff --git a/onnxruntime/core/providers/js/operators/pad.h b/onnxruntime/core/providers/js/operators/pad.h
index 19168f40b4722..bf808be949cf8 100644
--- a/onnxruntime/core/providers/js/operators/pad.h
+++ b/onnxruntime/core/providers/js/operators/pad.h
@@ -26,7 +26,7 @@ class Pad : public JsKernel, public PadBase {
static_cast(mode_),
static_cast(value_),
gsl::narrow_cast(pads.size()),
- reinterpret_cast((pads.size() > 0) ? pads.data() : nullptr) >> 2);
+ JSEP_HEAP_PTR((pads.size() > 0) ? pads.data() : nullptr) >> 2);
}
};
diff --git a/onnxruntime/core/providers/js/operators/reduce.h b/onnxruntime/core/providers/js/operators/reduce.h
index a5a4aa834c2ca..95c4f2bec230d 100644
--- a/onnxruntime/core/providers/js/operators/reduce.h
+++ b/onnxruntime/core/providers/js/operators/reduce.h
@@ -8,29 +8,29 @@
namespace onnxruntime {
namespace js {
-#define JSEP_DEFINE_REDUCE_KERNEL(ReduceKernel) \
- template \
- class ReduceKernel : public JsKernel, public ReduceKernelBase { \
- public: \
- using ReduceKernelBase::axes_; \
- using ReduceKernelBase::noop_with_empty_axes_; \
- using ReduceKernelBase::keepdims_; \
- ReduceKernel(const OpKernelInfo& info) : JsKernel(info), ReduceKernelBase(info) { \
- std::vector axes(axes_.size()); \
- if (axes_.size() > 0) { \
- std::transform(axes_.begin(), axes_.end(), axes.begin(), \
- [](int64_t axis) { return gsl::narrow_cast(axis); }); \
- } \
- JSEP_INIT_KERNEL_ATTRIBUTE(ReduceKernel, ({ \
- "keepDims" : !!$1, \
- "noopWithEmptyAxes" : !!$2, \
- "axes" : $3 ? (Array.from(HEAP32.subarray($4, $4 + $3))) : [], \
- }), \
- static_cast(keepdims_), \
- static_cast(noop_with_empty_axes_), \
- gsl::narrow_cast(axes.size()), \
- reinterpret_cast((axes.size() > 0) ? axes.data() : nullptr) >> 2); \
- } \
+#define JSEP_DEFINE_REDUCE_KERNEL(ReduceKernel) \
+ template \
+ class ReduceKernel : public JsKernel, public ReduceKernelBase { \
+ public: \
+ using ReduceKernelBase::axes_; \
+ using ReduceKernelBase::noop_with_empty_axes_; \
+ using ReduceKernelBase::keepdims_; \
+ ReduceKernel(const OpKernelInfo& info) : JsKernel(info), ReduceKernelBase(info) { \
+ std::vector axes(axes_.size()); \
+ if (axes_.size() > 0) { \
+ std::transform(axes_.begin(), axes_.end(), axes.begin(), \
+ [](int64_t axis) { return gsl::narrow_cast(axis); }); \
+ } \
+ JSEP_INIT_KERNEL_ATTRIBUTE(ReduceKernel, ({ \
+ "keepDims" : !!$1, \
+ "noopWithEmptyAxes" : !!$2, \
+ "axes" : $3 ? (Array.from(HEAP32.subarray($4, $4 + $3))) : [], \
+ }), \
+ static_cast(keepdims_), \
+ static_cast(noop_with_empty_axes_), \
+ gsl::narrow_cast(axes.size()), \
+ JSEP_HEAP_PTR((axes.size() > 0) ? axes.data() : nullptr) >> 2); \
+ } \
};
JSEP_DEFINE_REDUCE_KERNEL(ReduceMax);
diff --git a/onnxruntime/core/providers/js/operators/resize.h b/onnxruntime/core/providers/js/operators/resize.h
index 65854222ba988..4b1c288ae3015 100644
--- a/onnxruntime/core/providers/js/operators/resize.h
+++ b/onnxruntime/core/providers/js/operators/resize.h
@@ -34,7 +34,7 @@ class Resize : public JsKernel, public UpsampleBase {
}),
static_cast(antialias_),
gsl::narrow_cast(axes.size()),
- reinterpret_cast((axes.size() > 0) ? axes.data() : nullptr) >> 2,
+ JSEP_HEAP_PTR((axes.size() > 0) ? axes.data() : nullptr) >> 2,
resize_coordinate_transformation_mode.c_str(),
static_cast(cubic_coeff_a_),
static_cast(exclude_outside_),
diff --git a/onnxruntime/core/providers/js/operators/slice.h b/onnxruntime/core/providers/js/operators/slice.h
index 6792997025d65..989adabf029a5 100644
--- a/onnxruntime/core/providers/js/operators/slice.h
+++ b/onnxruntime/core/providers/js/operators/slice.h
@@ -24,11 +24,11 @@ class Slice : public JsKernel, public SliceBase {
"ends" : $3 ? Array.from(HEAP32.subarray($4, $4 + $3)) : [],
"axes" : $5 ? Array.from(HEAP32.subarray($6, $6 + $5)) : []}),
gsl::narrow_cast(starts.size()),
- reinterpret_cast