diff --git a/.github/workflows/cffconvert.yml b/.github/workflows/cffconvert.yml index 6851c52d380ec..7144363717749 100644 --- a/.github/workflows/cffconvert.yml +++ b/.github/workflows/cffconvert.yml @@ -11,7 +11,7 @@ jobs: runs-on: ubuntu-latest steps: - name: Check out a copy of the repository - uses: actions/checkout@v3 + uses: actions/checkout@v4 - name: Check whether the citation metadata from CITATION.cff is valid uses: citation-file-format/cffconvert-github-action@2.0.0 diff --git a/.github/workflows/codeql.yml b/.github/workflows/codeql.yml index 2fe66013ebbbc..d3ecf44fe5733 100644 --- a/.github/workflows/codeql.yml +++ b/.github/workflows/codeql.yml @@ -33,7 +33,7 @@ jobs: steps: - name: Checkout repository - uses: actions/checkout@v3 + uses: actions/checkout@v4 # Initializes the CodeQL tools for scanning. - name: Initialize CodeQL diff --git a/.github/workflows/gradle-wrapper-validation.yml b/.github/workflows/gradle-wrapper-validation.yml index 07346b38b2151..03ea773a25130 100644 --- a/.github/workflows/gradle-wrapper-validation.yml +++ b/.github/workflows/gradle-wrapper-validation.yml @@ -10,5 +10,5 @@ jobs: name: "Validation" runs-on: ubuntu-latest steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - uses: gradle/wrapper-validation-action@v1 diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml index 91f9a8ee3df40..432c789e943b5 100644 --- a/.github/workflows/lint.yml +++ b/.github/workflows/lint.yml @@ -12,7 +12,7 @@ jobs: name: Optional Lint runs-on: ubuntu-latest steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - name: misspell # Check spellings as well uses: reviewdog/action-misspell@v1 with: @@ -34,7 +34,7 @@ jobs: name: Python format runs-on: ubuntu-latest steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - name: Setup Python uses: actions/setup-python@v4 with: @@ -100,7 +100,7 @@ jobs: name: Lint JavaScript runs-on: ubuntu-latest steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - uses: reviewdog/action-eslint@v1 with: reporter: github-pr-check diff --git a/.github/workflows/linux.yml b/.github/workflows/linux.yml index dceb15b446a8a..7b314d845d9b4 100644 --- a/.github/workflows/linux.yml +++ b/.github/workflows/linux.yml @@ -14,7 +14,7 @@ jobs: Onnxruntime-TVM: runs-on: ubuntu-latest steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 with: submodules: true - uses: actions/setup-python@v4 diff --git a/.github/workflows/publish-c-apidocs.yml b/.github/workflows/publish-c-apidocs.yml index 2fbd8e521aeee..0a3e9ed2594c1 100644 --- a/.github/workflows/publish-c-apidocs.yml +++ b/.github/workflows/publish-c-apidocs.yml @@ -24,19 +24,19 @@ jobs: name: Generate C/C++ API docs runs-on: ubuntu-latest steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - name: Install doxygen and dependencies run: | sudo apt update sudo apt-get install libclang-dev sudo apt-get install libclang-cpp14 - wget https://www.doxygen.nl/files/doxygen-1.9.6.linux.bin.tar.gz - tar xvzf doxygen-1.9.6.linux.bin.tar.gz + wget https://www.doxygen.nl/files/doxygen-1.9.8.linux.bin.tar.gz + tar xvzf doxygen-1.9.8.linux.bin.tar.gz - name: Run doxygen run: | mkdir -p build/doxygen cd docs/c_cxx - ../../doxygen-1.9.6/bin/doxygen + ../../doxygen-1.9.8/bin/doxygen - name: Log source commit run: git rev-parse --short HEAD > build/doxygen/html/source-version.txt - name: Move C/C++ docs into site diff --git a/.github/workflows/publish-csharp-apidocs.yml b/.github/workflows/publish-csharp-apidocs.yml index 
097d4a1cdff5e..9b9ca924bd008 100644 --- a/.github/workflows/publish-csharp-apidocs.yml +++ b/.github/workflows/publish-csharp-apidocs.yml @@ -24,7 +24,7 @@ jobs: env: DOCFXVERSION: 2.62.2 steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - name: Setup .NET uses: actions/setup-dotnet@v3 with: diff --git a/.github/workflows/publish-java-apidocs.yml b/.github/workflows/publish-java-apidocs.yml index cea350ba54de0..9ea9bda7e7c53 100644 --- a/.github/workflows/publish-java-apidocs.yml +++ b/.github/workflows/publish-java-apidocs.yml @@ -23,7 +23,7 @@ jobs: name: Generate Java docs runs-on: ubuntu-latest steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - name: Set up JDK 11 uses: actions/setup-java@v3 with: diff --git a/.github/workflows/publish-js-apidocs.yml b/.github/workflows/publish-js-apidocs.yml index 5668be77c98a4..ba8bfd718abfa 100644 --- a/.github/workflows/publish-js-apidocs.yml +++ b/.github/workflows/publish-js-apidocs.yml @@ -23,7 +23,7 @@ jobs: name: Generate JS API docs runs-on: ubuntu-latest steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - name: Setup Node.js uses: actions/setup-node@v3 with: diff --git a/.github/workflows/publish-objectivec-apidocs.yml b/.github/workflows/publish-objectivec-apidocs.yml index b966793cc0d06..1b327eebfa8a8 100644 --- a/.github/workflows/publish-objectivec-apidocs.yml +++ b/.github/workflows/publish-objectivec-apidocs.yml @@ -23,7 +23,7 @@ jobs: name: Generate Objective-C API docs runs-on: macos-13 steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - name: Install Jazzy run: | diff --git a/.github/workflows/publish-python-apidocs.yml b/.github/workflows/publish-python-apidocs.yml index 4ca1249fc1d8e..ab9d4781afb83 100644 --- a/.github/workflows/publish-python-apidocs.yml +++ b/.github/workflows/publish-python-apidocs.yml @@ -24,7 +24,7 @@ jobs: name: Generate Python API docs runs-on: ubuntu-latest steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - name: Install tools run: | sudo apt-get update diff --git a/.github/workflows/sca.yml b/.github/workflows/sca.yml deleted file mode 100644 index 1416f5a4d33a9..0000000000000 --- a/.github/workflows/sca.yml +++ /dev/null @@ -1,133 +0,0 @@ -name: Windows_SCA -on: - push: - branches: - - main - - rel-* - pull_request: - -concurrency: - group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }} - cancel-in-progress: true - -env: - AZCOPY_AUTO_LOGIN_TYPE: MSI - AZCOPY_MSI_CLIENT_ID: 63b63039-6328-442f-954b-5a64d124e5b4 - -jobs: - Onnxruntime-SCA-training-CUDA: - runs-on: ["self-hosted", "1ES.Pool=onnxruntime-github-vs2022-mms"] - steps: - - uses: actions/checkout@v3 - with: - submodules: false - - uses: actions/setup-python@v4 - with: - python-version: '3.11.x' - architecture: 'x64' - - - uses: actions/setup-node@v3 - with: - node-version: 18 - - - name: Download cuda - run: azcopy.exe cp --recursive "https://lotusscus.blob.core.windows.net/models/cuda_sdk/v11.8" cuda_sdk - - - - name: Delete build folder - run: | - if (Test-Path D:\b) { Remove-Item -Recurse -Force D:\b } - &tools\ci_build\github\windows\install_third_party_deps.ps1 -cpu_arch x64 -install_prefix D:\b\Debug\installed -build_config Debug - - # The build machine doesn't have a GPU. So the value of CMAKE_CUDA_ARCHITECTURES doesn't matter. 
- - name: Build code - env: - CAExcludePath: 'C:\Program Files;D:\b;${{ github.workspace }}\cmake' - run: python tools\ci_build\build.py --windows_sdk_version 10.0.22621.0 --enable_training --build_java --compile_no_warning_as_error --config Debug --build_dir D:\b --skip_submodule_sync --build_csharp --update --build --parallel --cmake_generator "Visual Studio 17 2022" --build_shared_lib --enable_pybind --cmake_extra_defines onnxruntime_USE_CUSTOM_STATIC_ANALYSIS_RULES=ON --cmake_extra_defines onnxruntime_ENABLE_STATIC_ANALYSIS=ON --cmake_extra_defines onnxruntime_REDIRECT_STATIC_ANALYSIS_OUTPUTS_TO_FILE=ON --use_cuda --cuda_home=${{ github.workspace }}\cuda_sdk\v11.8 --enable_cuda_profiling --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=75 - - - name: Generate sarif - working-directory: D:\b - run: npx @microsoft/sarif-multitool merge *.sarif --recurse --output-directory=${{ github.workspace }}\output --output-file=MergeResult.sarif --merge-runs && dir ${{ github.workspace }}\output - - - name: Upload SARIF to GitHub - uses: github/codeql-action/upload-sarif@v2 - continue-on-error: true - with: - sarif_file: ${{ github.workspace }}\output\MergeResult.sarif - category: VS_SCA - - # No python - Onnxruntime-SCA-win32-WINML-x64: - runs-on: ["self-hosted", "1ES.Pool=onnxruntime-github-vs2022-mms"] - steps: - - uses: actions/checkout@v3 - with: - submodules: false - - uses: actions/setup-python@v4 - with: - python-version: '3.11.x' - architecture: 'x64' - - - uses: actions/setup-node@v3 - with: - node-version: 18 - - - name: Delete build folder - run: | - if (Test-Path D:\b) { Remove-Item -Recurse -Force D:\b } - &tools\ci_build\github\windows\install_third_party_deps.ps1 -cpu_arch x64 -install_prefix D:\b\Debug\installed -build_config Debug - - # The build machine doesn't have a GPU. So the value of CMAKE_CUDA_ARCHITECTURES doesn't matter. 
- - name: Build code - env: - CAExcludePath: 'C:\Program Files;D:\b;${{ github.workspace }}\cmake' - run: python tools\ci_build\build.py --build_java --compile_no_warning_as_error --config Debug --build_dir D:\b --skip_submodule_sync --build_csharp --update --build --parallel --cmake_generator "Visual Studio 17 2022" --build_shared_lib --cmake_extra_defines onnxruntime_USE_CUSTOM_STATIC_ANALYSIS_RULES=ON --cmake_extra_defines onnxruntime_ENABLE_STATIC_ANALYSIS=ON --cmake_extra_defines onnxruntime_REDIRECT_STATIC_ANALYSIS_OUTPUTS_TO_FILE=ON --ms_experimental --use_dml --use_winml --disable_rtti --enable_wcos --build_shared_lib - - - name: Generate sarif - working-directory: D:\b - run: npx @microsoft/sarif-multitool merge *.sarif --recurse --output-directory=${{ github.workspace }}\output --output-file=MergeResult.sarif --merge-runs && dir ${{ github.workspace }}\output - - - name: Upload SARIF to GitHub - uses: github/codeql-action/upload-sarif@v2 - continue-on-error: true - with: - sarif_file: ${{ github.workspace }}\output\MergeResult.sarif - category: VS_SCA_WIN32_WINML_X64 - - # No java, No python - Onnxruntime-SCA-win32-WINML-x86: - runs-on: ["self-hosted", "1ES.Pool=onnxruntime-github-vs2022-mms"] - steps: - - uses: actions/checkout@v3 - with: - submodules: false - - uses: actions/setup-python@v4 - with: - python-version: '3.11.x' - architecture: 'x86' - - - uses: actions/setup-node@v3 - with: - node-version: 18 - - - name: Delete build folder - run: | - if (Test-Path D:\b) { Remove-Item -Recurse -Force D:\b } - &tools\ci_build\github\windows\install_third_party_deps.ps1 -cpu_arch x86 -install_prefix D:\b\Debug\installed -build_config Debug - - # The build machine doesn't have a GPU. So the value of CMAKE_CUDA_ARCHITECTURES doesn't matter. - - name: Build code - env: - CAExcludePath: 'C:\Program Files;D:\b;${{ github.workspace }}\cmake' - run: python tools\ci_build\build.py --compile_no_warning_as_error --config Debug --build_dir D:\b --skip_submodule_sync --build_csharp --update --build --parallel --cmake_generator "Visual Studio 17 2022" --build_shared_lib --cmake_extra_defines onnxruntime_USE_CUSTOM_STATIC_ANALYSIS_RULES=ON --cmake_extra_defines onnxruntime_ENABLE_STATIC_ANALYSIS=ON --cmake_extra_defines onnxruntime_REDIRECT_STATIC_ANALYSIS_OUTPUTS_TO_FILE=ON --ms_experimental --use_dml --use_winml --disable_rtti --enable_wcos --build_shared_lib - - - name: Generate sarif - working-directory: D:\b - run: npx @microsoft/sarif-multitool merge *.sarif --recurse --output-directory=${{ github.workspace }}\output --output-file=MergeResult.sarif --merge-runs && dir ${{ github.workspace }}\output - - - name: Upload SARIF to GitHub - uses: github/codeql-action/upload-sarif@v2 - continue-on-error: true - with: - sarif_file: ${{ github.workspace }}\output\MergeResult.sarif - category: VS_SCA_WIN32_WINML_X86 diff --git a/.github/workflows/windows.yml b/.github/workflows/windows.yml index 8cd62db77b744..ba24e7eebfb03 100644 --- a/.github/workflows/windows.yml +++ b/.github/workflows/windows.yml @@ -18,7 +18,7 @@ jobs: Windows-CUDA-12: runs-on: ["self-hosted", "1ES.Pool=onnxruntime-github-vs2022-mms"] steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 with: submodules: false - uses: actions/setup-python@v4 @@ -46,7 +46,7 @@ jobs: Onnxruntime-TVM: runs-on: windows-latest steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 with: submodules: true - uses: actions/setup-python@v4 diff --git a/cmake/external/onnxruntime_external_deps.cmake 
b/cmake/external/onnxruntime_external_deps.cmake index 2a100ac161b97..e1671bcf43ed9 100644 --- a/cmake/external/onnxruntime_external_deps.cmake +++ b/cmake/external/onnxruntime_external_deps.cmake @@ -43,8 +43,8 @@ if (onnxruntime_BUILD_UNIT_TESTS) FetchContent_Declare( googletest URL ${DEP_URL_googletest} + FIND_PACKAGE_ARGS 1.14.0...<2.0.0 NAMES GTest URL_HASH SHA1=${DEP_SHA1_googletest} - OVERRIDE_FIND_PACKAGE ) endif() diff --git a/cmake/patches/xnnpack/AddEmscriptenAndIosSupport.patch b/cmake/patches/xnnpack/AddEmscriptenAndIosSupport.patch index 7296f2f30f286..37bdbf9fb53f6 100644 --- a/cmake/patches/xnnpack/AddEmscriptenAndIosSupport.patch +++ b/cmake/patches/xnnpack/AddEmscriptenAndIosSupport.patch @@ -1,8 +1,8 @@ diff --git a/CMakeLists.txt b/CMakeLists.txt -index d53c48aa1..4c987bd7a 100755 +index d53c48aa1..77c3cf983 100755 --- a/CMakeLists.txt +++ b/CMakeLists.txt -@@ -105,7 +105,7 @@ ENDIF() +@@ -105,22 +105,12 @@ ENDIF() IF(NOT CMAKE_SYSTEM_NAME) MESSAGE(FATAL_ERROR "CMAKE_SYSTEM_NAME not defined") @@ -11,7 +11,22 @@ index d53c48aa1..4c987bd7a 100755 MESSAGE(FATAL_ERROR "Unrecognized CMAKE_SYSTEM_NAME = ${CMAKE_SYSTEM_NAME}") ENDIF() -@@ -7108,6 +7108,10 @@ IF(MSVC) + # ---[ Download deps + IF(NOT XNNPACK_USE_SYSTEM_LIBS) +- IF(NOT DEFINED CLOG_SOURCE_DIR) +- MESSAGE(STATUS "Downloading clog to ${CMAKE_BINARY_DIR}/clog-source (define CLOG_SOURCE_DIR to avoid it)") +- CONFIGURE_FILE(cmake/DownloadCLog.cmake "${CMAKE_BINARY_DIR}/clog-download/CMakeLists.txt") +- EXECUTE_PROCESS(COMMAND "${CMAKE_COMMAND}" -G "${CMAKE_GENERATOR}" . +- WORKING_DIRECTORY "${CMAKE_BINARY_DIR}/clog-download") +- EXECUTE_PROCESS(COMMAND "${CMAKE_COMMAND}" --build . +- WORKING_DIRECTORY "${CMAKE_BINARY_DIR}/clog-download") +- SET(CLOG_SOURCE_DIR "${CMAKE_BINARY_DIR}/clog-source" CACHE STRING "clog source directory") +- ENDIF() +- + IF(NOT DEFINED CPUINFO_SOURCE_DIR) + MESSAGE(STATUS "Downloading cpuinfo to ${CMAKE_BINARY_DIR}/cpuinfo-source (define CPUINFO_SOURCE_DIR to avoid it)") + CONFIGURE_FILE(cmake/DownloadCpuinfo.cmake "${CMAKE_BINARY_DIR}/cpuinfo-download/CMakeLists.txt") +@@ -7108,6 +7098,10 @@ IF(MSVC) SET_PROPERTY(SOURCE ${ALL_MICROKERNEL_SRCS} APPEND_STRING PROPERTY COMPILE_FLAGS "$<$>: /O2 >") SET_PROPERTY(SOURCE ${HOT_SRCS} APPEND_STRING PROPERTY COMPILE_FLAGS "$<$>: /O2 >") SET_PROPERTY(SOURCE ${COLD_SRCS} APPEND_STRING PROPERTY COMPILE_FLAGS "$<$>: /O1 >") @@ -22,3 +37,30 @@ index d53c48aa1..4c987bd7a 100755 ELSE() SET_PROPERTY(SOURCE ${ALL_MICROKERNEL_SRCS} APPEND_STRING PROPERTY COMPILE_FLAGS "$<$>: -O2 >") SET_PROPERTY(SOURCE ${HOT_SRCS} APPEND_STRING PROPERTY COMPILE_FLAGS "$<$>: -O2 >") +@@ -7142,26 +7136,6 @@ IF(LIBM) + TARGET_LINK_LIBRARIES(indirection PRIVATE ${LIBM}) + ENDIF() + +-# ---[ Configure clog +-IF(NOT TARGET clog) +- IF(NOT XNNPACK_USE_SYSTEM_LIBS) +- SET(CLOG_BUILD_TESTS OFF CACHE BOOL "") +- SET(CLOG_RUNTIME_TYPE "${CPUINFO_RUNTIME_TYPE}" CACHE STRING "") +- ADD_SUBDIRECTORY( +- "${CLOG_SOURCE_DIR}/deps/clog" +- "${CMAKE_BINARY_DIR}/clog") +- # We build static version of clog but a dynamic library may indirectly depend on it +- SET_PROPERTY(TARGET clog PROPERTY POSITION_INDEPENDENT_CODE ON) +- ELSE() +- ADD_LIBRARY(clog STATIC IMPORTED) +- FIND_LIBRARY(CLOG_LIBRARY clog) +- IF(NOT CLOG_LIBRARY) +- MESSAGE(FATAL_ERROR "Cannot find clog") +- ENDIF() +- SET_PROPERTY(TARGET clog PROPERTY IMPORTED_LOCATION "${CLOG_LIBRARY}") +- ENDIF() +-ENDIF() +- + # ---[ Configure cpuinfo + IF(NOT TARGET cpuinfo) + IF(NOT XNNPACK_USE_SYSTEM_LIBS) diff --git a/docs/c_cxx/Doxyfile 
b/docs/c_cxx/Doxyfile index 94b39d2045f69..aedb1fdcfee75 100644 --- a/docs/c_cxx/Doxyfile +++ b/docs/c_cxx/Doxyfile @@ -1,4 +1,4 @@ -# Doxyfile 1.9.2 +# Doxyfile 1.9.8 # This file describes the settings to be used by the documentation system # doxygen (www.doxygen.org) for a project. @@ -12,6 +12,16 @@ # For lists, items can also be appended using: # TAG += value [value, ...] # Values that contain spaces should be placed between quotes (\" \"). +# +# Note: +# +# Use doxygen to compare the used configuration file with the template +# configuration file: +# doxygen -x [configFile] +# Use doxygen to compare the used configuration file with the template +# configuration file without replacing the environment variables or CMake type +# replacement variables: +# doxygen -x_noenv [configFile] #--------------------------------------------------------------------------- # Project related configuration options @@ -60,16 +70,28 @@ PROJECT_LOGO = "../images/ONNX_Runtime_logo - Docs.png" OUTPUT_DIRECTORY = ../../build/doxygen -# If the CREATE_SUBDIRS tag is set to YES then doxygen will create 4096 sub- -# directories (in 2 levels) under the output directory of each output format and -# will distribute the generated files over these directories. Enabling this +# If the CREATE_SUBDIRS tag is set to YES then doxygen will create up to 4096 +# sub-directories (in 2 levels) under the output directory of each output format +# and will distribute the generated files over these directories. Enabling this # option can be useful when feeding doxygen a huge amount of source files, where # putting all generated files in the same directory would otherwise causes -# performance problems for the file system. +# performance problems for the file system. Adapt CREATE_SUBDIRS_LEVEL to +# control the number of sub-directories. # The default value is: NO. CREATE_SUBDIRS = NO +# Controls the number of sub-directories that will be created when +# CREATE_SUBDIRS tag is set to YES. Level 0 represents 16 directories, and every +# level increment doubles the number of directories, resulting in 4096 +# directories at level 8 which is the default and also the maximum value. The +# sub-directories are organized in 2 levels, the first level always has a fixed +# number of 16 directories. +# Minimum value: 0, maximum value: 8, default value: 8. +# This tag requires that the tag CREATE_SUBDIRS is set to YES. + +CREATE_SUBDIRS_LEVEL = 8 + # If the ALLOW_UNICODE_NAMES tag is set to YES, doxygen will allow non-ASCII # characters to appear in the names of generated files. If set to NO, non-ASCII # characters will be escaped, for example _xE3_x81_x84 will be used for Unicode @@ -81,14 +103,14 @@ ALLOW_UNICODE_NAMES = NO # The OUTPUT_LANGUAGE tag is used to specify the language in which all # documentation generated by doxygen is written. Doxygen will use this # information to generate all constant output in the proper language. -# Possible values are: Afrikaans, Arabic, Armenian, Brazilian, Catalan, Chinese, -# Chinese-Traditional, Croatian, Czech, Danish, Dutch, English (United States), -# Esperanto, Farsi (Persian), Finnish, French, German, Greek, Hungarian, -# Indonesian, Italian, Japanese, Japanese-en (Japanese with English messages), -# Korean, Korean-en (Korean with English messages), Latvian, Lithuanian, -# Macedonian, Norwegian, Persian (Farsi), Polish, Portuguese, Romanian, Russian, -# Serbian, Serbian-Cyrillic, Slovak, Slovene, Spanish, Swedish, Turkish, -# Ukrainian and Vietnamese. 
+# Possible values are: Afrikaans, Arabic, Armenian, Brazilian, Bulgarian, +# Catalan, Chinese, Chinese-Traditional, Croatian, Czech, Danish, Dutch, English +# (United States), Esperanto, Farsi (Persian), Finnish, French, German, Greek, +# Hindi, Hungarian, Indonesian, Italian, Japanese, Japanese-en (Japanese with +# English messages), Korean, Korean-en (Korean with English messages), Latvian, +# Lithuanian, Macedonian, Norwegian, Persian (Farsi), Polish, Portuguese, +# Romanian, Russian, Serbian, Serbian-Cyrillic, Slovak, Slovene, Spanish, +# Swedish, Turkish, Ukrainian and Vietnamese. # The default value is: English. OUTPUT_LANGUAGE = English @@ -341,6 +363,17 @@ MARKDOWN_SUPPORT = YES TOC_INCLUDE_HEADINGS = 5 +# The MARKDOWN_ID_STYLE tag can be used to specify the algorithm used to +# generate identifiers for the Markdown headings. Note: Every identifier is +# unique. +# Possible values are: DOXYGEN use a fixed 'autotoc_md' string followed by a +# sequence number starting at 0 and GITHUB use the lower case version of title +# with any whitespace replaced by '-' and punctuation characters removed. +# The default value is: DOXYGEN. +# This tag requires that the tag MARKDOWN_SUPPORT is set to YES. + +MARKDOWN_ID_STYLE = DOXYGEN + # When enabled doxygen tries to link words that correspond to documented # classes, or namespaces to their corresponding documentation. Such a link can # be prevented in individual cases by putting a % sign in front of the word or @@ -437,7 +470,7 @@ INLINE_SIMPLE_STRUCTS = NO # types are typedef'ed and only the typedef is referenced, never the tag name. # The default value is: NO. -TYPEDEF_HIDES_STRUCT = YES +TYPEDEF_HIDES_STRUCT = NO # The size of the symbol lookup cache can be set using LOOKUP_CACHE_SIZE. This # cache is used to resolve symbols given their name and scope. Since this can be @@ -452,7 +485,7 @@ TYPEDEF_HIDES_STRUCT = YES LOOKUP_CACHE_SIZE = 0 -# The NUM_PROC_THREADS specifies the number threads doxygen is allowed to use +# The NUM_PROC_THREADS specifies the number of threads doxygen is allowed to use # during processing. When set to 0 doxygen will based this on the number of # cores available in the system. You can set it explicitly to a value larger # than 0 to get more control over the balance between CPU load and processing @@ -465,6 +498,14 @@ LOOKUP_CACHE_SIZE = 0 NUM_PROC_THREADS = 1 +# If the TIMESTAMP tag is set different from NO then each generated page will +# contain the date or date and time when the page was generated. Setting this to +# NO can help when comparing the output of multiple runs. +# Possible values are: YES, NO, DATETIME and DATE. +# The default value is: NO. + +TIMESTAMP = NO + #--------------------------------------------------------------------------- # Build related configuration options #--------------------------------------------------------------------------- @@ -546,7 +587,8 @@ HIDE_UNDOC_MEMBERS = NO # If the HIDE_UNDOC_CLASSES tag is set to YES, doxygen will hide all # undocumented classes that are normally visible in the class hierarchy. If set # to NO, these classes will be included in the various overviews. This option -# has no effect if EXTRACT_ALL is enabled. +# will also hide undocumented C++ concepts if enabled. This option has no effect +# if EXTRACT_ALL is enabled. # The default value is: NO. HIDE_UNDOC_CLASSES = NO @@ -577,14 +619,15 @@ INTERNAL_DOCS = NO # filesystem is case sensitive (i.e. 
it supports files in the same directory # whose names only differ in casing), the option must be set to YES to properly # deal with such files in case they appear in the input. For filesystems that -# are not case sensitive the option should be be set to NO to properly deal with +# are not case sensitive the option should be set to NO to properly deal with # output files written for symbols that only differ in casing, such as for two # classes, one named CLASS and the other named Class, and to also support # references to files without having to specify the exact matching casing. On # Windows (including Cygwin) and MacOS, users should typically set this option # to NO, whereas on Linux or other Unix flavors it should typically be set to # YES. -# The default value is: system dependent. +# Possible values are: SYSTEM, NO and YES. +# The default value is: SYSTEM. CASE_SENSE_NAMES = NO @@ -836,11 +879,26 @@ WARN_IF_INCOMPLETE_DOC = YES WARN_NO_PARAMDOC = YES +# If WARN_IF_UNDOC_ENUM_VAL option is set to YES, doxygen will warn about +# undocumented enumeration values. If set to NO, doxygen will accept +# undocumented enumeration values. If EXTRACT_ALL is set to YES then this flag +# will automatically be disabled. +# The default value is: NO. + +WARN_IF_UNDOC_ENUM_VAL = NO + # If the WARN_AS_ERROR tag is set to YES then doxygen will immediately stop when # a warning is encountered. If the WARN_AS_ERROR tag is set to FAIL_ON_WARNINGS # then doxygen will continue running as if WARN_AS_ERROR tag is set to NO, but # at the end of the doxygen process doxygen will return with a non-zero status. -# Possible values are: NO, YES and FAIL_ON_WARNINGS. +# If the WARN_AS_ERROR tag is set to FAIL_ON_WARNINGS_PRINT then doxygen behaves +# like FAIL_ON_WARNINGS but in case no WARN_LOGFILE is defined doxygen will not +# write the warning messages in between other messages but write them at the end +# of a run, in case a WARN_LOGFILE is defined the warning messages will be +# besides being in the defined file also be shown at the end of a run, unless +# the WARN_LOGFILE is defined as - i.e. standard output (stdout) in that case +# the behavior will remain as with the setting FAIL_ON_WARNINGS. +# Possible values are: NO, YES, FAIL_ON_WARNINGS and FAIL_ON_WARNINGS_PRINT. # The default value is: NO. WARN_AS_ERROR = YES @@ -851,13 +909,27 @@ WARN_AS_ERROR = YES # and the warning text. Optionally the format may contain $version, which will # be replaced by the version of the file (if it could be obtained via # FILE_VERSION_FILTER) +# See also: WARN_LINE_FORMAT # The default value is: $file:$line: $text. WARN_FORMAT = "$file:$line: $text" +# In the $text part of the WARN_FORMAT command it is possible that a reference +# to a more specific place is given. To make it easier to jump to this place +# (outside of doxygen) the user can define a custom "cut" / "paste" string. +# Example: +# WARN_LINE_FORMAT = "'vi $file +$line'" +# See also: WARN_FORMAT +# The default value is: at line $line of file $file. + +WARN_LINE_FORMAT = "at line $line of file $file" + # The WARN_LOGFILE tag can be used to specify a file to which warning and error # messages should be written. If left blank the output is written to standard -# error (stderr). +# error (stderr). In case the file specified cannot be opened for writing the +# warning and error messages are written to standard error. When as file - is +# specified the warning and error messages are written to standard output +# (stdout). 
WARN_LOGFILE = @@ -881,10 +953,21 @@ INPUT = ../../include/onnxruntime/core/session/onnxruntime_c_ap # libiconv (or the iconv built into libc) for the transcoding. See the libiconv # documentation (see: # https://www.gnu.org/software/libiconv/) for the list of possible encodings. +# See also: INPUT_FILE_ENCODING # The default value is: UTF-8. INPUT_ENCODING = UTF-8 +# This tag can be used to specify the character encoding of the source files +# that doxygen parses The INPUT_FILE_ENCODING tag can be used to specify +# character encoding on a per file pattern basis. Doxygen will compare the file +# name with each pattern and apply the encoding instead of the default +# INPUT_ENCODING) if there is a match. The character encodings are a list of the +# form: pattern=encoding (like *.php=ISO-8859-1). See cfg_input_encoding +# "INPUT_ENCODING" for further information on supported encodings. + +INPUT_FILE_ENCODING = + # If the value of the INPUT tag contains directories, you can use the # FILE_PATTERNS tag to specify one or more wildcard patterns (like *.cpp and # *.h) to filter out the source-files in the directories. @@ -896,18 +979,21 @@ INPUT_ENCODING = UTF-8 # Note the list of default checked file patterns might differ from the list of # default file extension mappings. # -# If left blank the following patterns are tested:*.c, *.cc, *.cxx, *.cpp, -# *.c++, *.java, *.ii, *.ixx, *.ipp, *.i++, *.inl, *.idl, *.ddl, *.odl, *.h, -# *.hh, *.hxx, *.hpp, *.h++, *.l, *.cs, *.d, *.php, *.php4, *.php5, *.phtml, -# *.inc, *.m, *.markdown, *.md, *.mm, *.dox (to be provided as doxygen C -# comment), *.py, *.pyw, *.f90, *.f95, *.f03, *.f08, *.f18, *.f, *.for, *.vhd, -# *.vhdl, *.ucf, *.qsf and *.ice. +# If left blank the following patterns are tested:*.c, *.cc, *.cxx, *.cxxm, +# *.cpp, *.cppm, *.c++, *.c++m, *.java, *.ii, *.ixx, *.ipp, *.i++, *.inl, *.idl, +# *.ddl, *.odl, *.h, *.hh, *.hxx, *.hpp, *.h++, *.ixx, *.l, *.cs, *.d, *.php, +# *.php4, *.php5, *.phtml, *.inc, *.m, *.markdown, *.md, *.mm, *.dox (to be +# provided as doxygen C comment), *.py, *.pyw, *.f90, *.f95, *.f03, *.f08, +# *.f18, *.f, *.for, *.vhd, *.vhdl, *.ucf, *.qsf and *.ice. FILE_PATTERNS = *.c \ *.cc \ *.cxx \ + *.cxxm \ *.cpp \ + *.cppm \ *.c++ \ + *.c++m \ *.java \ *.ii \ *.ixx \ @@ -922,6 +1008,8 @@ FILE_PATTERNS = *.c \ *.hxx \ *.hpp \ *.h++ \ + *.ixx \ + *.l \ *.cs \ *.d \ *.php \ @@ -984,10 +1072,7 @@ EXCLUDE_PATTERNS = # (namespaces, classes, functions, etc.) that should be excluded from the # output. The symbol name can be a fully qualified name, a word, or if the # wildcard * is used, a substring. Examples: ANamespace, AClass, -# AClass::ANamespace, ANamespace::*Test -# -# Note that the wildcards are matched against the file with absolute path, so to -# exclude all test directories use the pattern */test/* +# ANamespace::AClass, ANamespace::*Test EXCLUDE_SYMBOLS = @@ -1032,6 +1117,11 @@ IMAGE_PATH = # code is scanned, but not when the output code is generated. If lines are added # or removed, the anchors will not be placed correctly. # +# Note that doxygen will use the data processed and written to standard output +# for further processing, therefore nothing else, like debug statements or used +# commands (so in case of a Windows batch file always use @echo OFF), should be +# written to standard output. +# # Note that for custom extensions or not directly supported extensions you also # need to set EXTENSION_MAPPING for the extension otherwise the files are not # properly processed by doxygen. 
@@ -1073,6 +1163,15 @@ FILTER_SOURCE_PATTERNS = USE_MDFILE_AS_MAINPAGE = +# The Fortran standard specifies that for fixed formatted Fortran code all +# characters from position 72 are to be considered as comment. A common +# extension is to allow longer lines before the automatic comment starts. The +# setting FORTRAN_COMMENT_AFTER will also make it possible that longer lines can +# be processed before the automatic comment starts. +# Minimum value: 7, maximum value: 10000, default value: 72. + +FORTRAN_COMMENT_AFTER = 72 + #--------------------------------------------------------------------------- # Configuration options related to source browsing #--------------------------------------------------------------------------- @@ -1210,10 +1309,11 @@ CLANG_DATABASE_PATH = ALPHABETICAL_INDEX = YES -# In case all classes in a project start with a common prefix, all classes will -# be put under the same header in the alphabetical index. The IGNORE_PREFIX tag -# can be used to specify a prefix (or a list of prefixes) that should be ignored -# while generating the index headers. +# The IGNORE_PREFIX tag can be used to specify a prefix (or a list of prefixes) +# that should be ignored while generating the index headers. The IGNORE_PREFIX +# tag works for classes, function and member names. The entity will be placed in +# the alphabetical list under the first letter of the entity name that remains +# after removing the prefix. # This tag requires that the tag ALPHABETICAL_INDEX is set to YES. IGNORE_PREFIX = @@ -1292,7 +1392,12 @@ HTML_STYLESHEET = # Doxygen will copy the style sheet files to the output directory. # Note: The order of the extra style sheet files is of importance (e.g. the last # style sheet in the list overrules the setting of the previous ones in the -# list). For an example see the documentation. +# list). +# Note: Since the styling of scrollbars can currently not be overruled in +# Webkit/Chromium, the styling will be left out of the default doxygen.css if +# one or more extra stylesheets have been specified. So if scrollbar +# customization is desired it has to be added explicitly. For an example see the +# documentation. # This tag requires that the tag GENERATE_HTML is set to YES. HTML_EXTRA_STYLESHEET = @@ -1307,6 +1412,19 @@ HTML_EXTRA_STYLESHEET = HTML_EXTRA_FILES = +# The HTML_COLORSTYLE tag can be used to specify if the generated HTML output +# should be rendered with a dark or light theme. +# Possible values are: LIGHT always generate light mode output, DARK always +# generate dark mode output, AUTO_LIGHT automatically set the mode according to +# the user preference, use light mode if no preference is set (the default), +# AUTO_DARK automatically set the mode according to the user preference, use +# dark mode if no preference is set and TOGGLE allow to user to switch between +# light and dark mode via a button. +# The default value is: AUTO_LIGHT. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_COLORSTYLE = AUTO_LIGHT + # The HTML_COLORSTYLE_HUE tag controls the color of the HTML output. Doxygen # will adjust the colors in the style sheet and background images according to # this color. Hue is specified as an angle on a color-wheel, see @@ -1337,15 +1455,6 @@ HTML_COLORSTYLE_SAT = 100 HTML_COLORSTYLE_GAMMA = 80 -# If the HTML_TIMESTAMP tag is set to YES then the footer of each generated HTML -# page will contain the date and time when the page was generated. 
Setting this -# to YES can help to show when doxygen was last run and thus if the -# documentation is up to date. -# The default value is: NO. -# This tag requires that the tag GENERATE_HTML is set to YES. - -HTML_TIMESTAMP = NO - # If the HTML_DYNAMIC_MENUS tag is set to YES then the generated HTML # documentation will contain a main index with vertical navigation menus that # are dynamically created via JavaScript. If disabled, the navigation index will @@ -1365,6 +1474,13 @@ HTML_DYNAMIC_MENUS = YES HTML_DYNAMIC_SECTIONS = NO +# If the HTML_CODE_FOLDING tag is set to YES then classes and functions can be +# dynamically folded and expanded in the generated HTML source code. +# The default value is: YES. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_CODE_FOLDING = YES + # With HTML_INDEX_NUM_ENTRIES one can control the preferred number of entries # shown in the various tree structured indices initially; the user can expand # and collapse entries dynamically later on. Doxygen will expand the tree to @@ -1401,6 +1517,13 @@ GENERATE_DOCSET = NO DOCSET_FEEDNAME = "Doxygen generated docs" +# This tag determines the URL of the docset feed. A documentation feed provides +# an umbrella under which multiple documentation sets from a single provider +# (such as a company or product suite) can be grouped. +# This tag requires that the tag GENERATE_DOCSET is set to YES. + +DOCSET_FEEDURL = + # This tag specifies a string that should uniquely identify the documentation # set bundle. This should be a reverse domain-name style string, e.g. # com.mycompany.MyDocSet. Doxygen will append .docset to the name. @@ -1488,6 +1611,16 @@ BINARY_TOC = NO TOC_EXPAND = NO +# The SITEMAP_URL tag is used to specify the full URL of the place where the +# generated documentation will be placed on the server by the user during the +# deployment of the documentation. The generated sitemap is called sitemap.xml +# and placed on the directory specified by HTML_OUTPUT. In case no SITEMAP_URL +# is specified no sitemap is generated. For information about the sitemap +# protocol see https://www.sitemaps.org +# This tag requires that the tag GENERATE_HTML is set to YES. + +SITEMAP_URL = + # If the GENERATE_QHP tag is set to YES and both QHP_NAMESPACE and # QHP_VIRTUAL_FOLDER are set, an additional index file will be generated that # can be used as input for Qt's qhelpgenerator to generate a Qt Compressed Help @@ -1605,7 +1738,7 @@ GENERATE_TREEVIEW = YES # area (value NO) or if it should extend to the full height of the window (value # YES). Setting this to YES gives a layout similar to # https://docs.readthedocs.io with more room for contents, but less room for the -# project logo, title, and description. If either GENERATOR_TREEVIEW or +# project logo, title, and description. If either GENERATE_TREEVIEW or # DISABLE_INDEX is set to NO, this option has no effect. # The default value is: NO. # This tag requires that the tag GENERATE_HTML is set to YES. @@ -1636,6 +1769,13 @@ TREEVIEW_WIDTH = 250 EXT_LINKS_IN_WINDOW = NO +# If the OBFUSCATE_EMAILS tag is set to YES, doxygen will obfuscate email +# addresses. +# The default value is: YES. +# This tag requires that the tag GENERATE_HTML is set to YES. 
+ +OBFUSCATE_EMAILS = YES + # If the HTML_FORMULA_FORMAT option is set to svg, doxygen will use the pdf2svg # tool (see https://github.com/dawbarton/pdf2svg) or inkscape (see # https://inkscape.org) to generate formulas as SVG images instead of PNGs for @@ -1969,9 +2109,16 @@ PDF_HYPERLINKS = YES USE_PDFLATEX = YES -# If the LATEX_BATCHMODE tag is set to YES, doxygen will add the \batchmode -# command to the generated LaTeX files. This will instruct LaTeX to keep running -# if errors occur, instead of asking the user for help. +# The LATEX_BATCHMODE tag signals the behavior of LaTeX in case of an error. +# Possible values are: NO same as ERROR_STOP, YES same as BATCH, BATCH In batch +# mode nothing is printed on the terminal, errors are scrolled as if is +# hit at every error; missing files that TeX tries to input or request from +# keyboard input (\read on a not open input stream) cause the job to abort, +# NON_STOP In nonstop mode the diagnostic message will appear on the terminal, +# but there is no possibility of user interaction just like in batch mode, +# SCROLL In scroll mode, TeX will stop only for missing files to input or if +# keyboard input is necessary and ERROR_STOP In errorstop mode, TeX will stop at +# each error, asking for user intervention. # The default value is: NO. # This tag requires that the tag GENERATE_LATEX is set to YES. @@ -1992,14 +2139,6 @@ LATEX_HIDE_INDICES = NO LATEX_BIB_STYLE = plain -# If the LATEX_TIMESTAMP tag is set to YES then the footer of each generated -# page will contain the date and time when the page was generated. Setting this -# to NO can help when comparing the output of multiple runs. -# The default value is: NO. -# This tag requires that the tag GENERATE_LATEX is set to YES. - -LATEX_TIMESTAMP = NO - # The LATEX_EMOJI_DIRECTORY tag is used to specify the (relative or absolute) # path from which the emoji images will be read. If a relative path is entered, # it will be relative to the LATEX_OUTPUT directory. If left blank the @@ -2165,7 +2304,7 @@ DOCBOOK_OUTPUT = docbook #--------------------------------------------------------------------------- # If the GENERATE_AUTOGEN_DEF tag is set to YES, doxygen will generate an -# AutoGen Definitions (see http://autogen.sourceforge.net/) file that captures +# AutoGen Definitions (see https://autogen.sourceforge.net/) file that captures # the structure of the code including all documentation. Note that this feature # is still experimental and incomplete at the moment. # The default value is: NO. @@ -2176,6 +2315,28 @@ GENERATE_AUTOGEN_DEF = NO # Configuration options related to Sqlite3 output #--------------------------------------------------------------------------- +# If the GENERATE_SQLITE3 tag is set to YES doxygen will generate a Sqlite3 +# database with symbols found by doxygen stored in tables. +# The default value is: NO. + +GENERATE_SQLITE3 = NO + +# The SQLITE3_OUTPUT tag is used to specify where the Sqlite3 database will be +# put. If a relative path is entered the value of OUTPUT_DIRECTORY will be put +# in front of it. +# The default directory is: sqlite3. +# This tag requires that the tag GENERATE_SQLITE3 is set to YES. + +SQLITE3_OUTPUT = sqlite3 + +# The SQLITE3_OVERWRITE_DB tag is set to YES, the existing doxygen_sqlite3.db +# database file will be recreated with each doxygen run. If set to NO, doxygen +# will warn if an a database file is already found and not modify it. +# The default value is: YES. +# This tag requires that the tag GENERATE_SQLITE3 is set to YES. 
+ +SQLITE3_RECREATE_DB = YES + #--------------------------------------------------------------------------- # Configuration options related to the Perl module output #--------------------------------------------------------------------------- @@ -2250,7 +2411,8 @@ SEARCH_INCLUDES = YES # The INCLUDE_PATH tag can be used to specify one or more directories that # contain include files that are not input files but should be processed by the -# preprocessor. +# preprocessor. Note that the INCLUDE_PATH is not recursive, so the setting of +# RECURSIVE has no effect here. # This tag requires that the tag SEARCH_INCLUDES is set to YES. # onnxruntime-training and onnxruntime core headers are in different directories. @@ -2324,15 +2486,15 @@ TAGFILES = GENERATE_TAGFILE = -# If the ALLEXTERNALS tag is set to YES, all external class will be listed in -# the class index. If set to NO, only the inherited external classes will be -# listed. +# If the ALLEXTERNALS tag is set to YES, all external classes and namespaces +# will be listed in the class and namespace index. If set to NO, only the +# inherited external classes will be listed. # The default value is: NO. ALLEXTERNALS = NO # If the EXTERNAL_GROUPS tag is set to YES, all external groups will be listed -# in the modules index. If set to NO, only the current project's groups will be +# in the topic index. If set to NO, only the current project's groups will be # listed. # The default value is: YES. @@ -2346,16 +2508,9 @@ EXTERNAL_GROUPS = YES EXTERNAL_PAGES = YES #--------------------------------------------------------------------------- -# Configuration options related to the dot tool +# Configuration options related to diagram generator tools #--------------------------------------------------------------------------- -# You can include diagrams made with dia in doxygen documentation. Doxygen will -# then run dia to produce the diagram and insert it in the documentation. The -# DIA_PATH tag allows you to specify the directory where the dia binary resides. -# If left empty dia is assumed to be found in the default search path. - -DIA_PATH = - # If set to YES the inheritance and collaboration graphs will hide inheritance # and usage relations if the target is undocumented or is not a class. # The default value is: YES. @@ -2364,7 +2519,7 @@ HIDE_UNDOC_RELATIONS = YES # If you set the HAVE_DOT tag to YES then doxygen will assume the dot tool is # available from the path. This tool is part of Graphviz (see: -# http://www.graphviz.org/), a graph visualization toolkit from AT&T and Lucent +# https://www.graphviz.org/), a graph visualization toolkit from AT&T and Lucent # Bell Labs. The other options in this section have no effect if this option is # set to NO # The default value is: NO. @@ -2381,32 +2536,73 @@ HAVE_DOT = NO DOT_NUM_THREADS = 0 -# By default doxygen will tell dot to use the default font as specified with -# DOT_FONTNAME. If you specify a different font using DOT_FONTNAME you can set -# the path where dot can find it using this tag. +# DOT_COMMON_ATTR is common attributes for nodes, edges and labels of +# subgraphs. When you want a differently looking font in the dot files that +# doxygen generates you can specify fontname, fontcolor and fontsize attributes. 
+# For details please see Node, +# Edge and Graph Attributes specification You need to make sure dot is able +# to find the font, which can be done by putting it in a standard location or by +# setting the DOTFONTPATH environment variable or by setting DOT_FONTPATH to the +# directory containing the font. Default graphviz fontsize is 14. +# The default value is: fontname=Helvetica,fontsize=10. +# This tag requires that the tag HAVE_DOT is set to YES. + +DOT_COMMON_ATTR = "fontname=Helvetica,fontsize=10" + +# DOT_EDGE_ATTR is concatenated with DOT_COMMON_ATTR. For elegant style you can +# add 'arrowhead=open, arrowtail=open, arrowsize=0.5'. Complete documentation about +# arrows shapes. +# The default value is: labelfontname=Helvetica,labelfontsize=10. +# This tag requires that the tag HAVE_DOT is set to YES. + +DOT_EDGE_ATTR = "labelfontname=Helvetica,labelfontsize=10" + +# DOT_NODE_ATTR is concatenated with DOT_COMMON_ATTR. For view without boxes +# around nodes set 'shape=plain' or 'shape=plaintext' Shapes specification +# The default value is: shape=box,height=0.2,width=0.4. +# This tag requires that the tag HAVE_DOT is set to YES. + +DOT_NODE_ATTR = "shape=box,height=0.2,width=0.4" + +# You can set the path where dot can find font specified with fontname in +# DOT_COMMON_ATTR and others dot attributes. # This tag requires that the tag HAVE_DOT is set to YES. DOT_FONTPATH = -# If the CLASS_GRAPH tag is set to YES then doxygen will generate a graph for -# each documented class showing the direct and indirect inheritance relations. -# Setting this tag to YES will force the CLASS_DIAGRAMS tag to NO. +# If the CLASS_GRAPH tag is set to YES or GRAPH or BUILTIN then doxygen will +# generate a graph for each documented class showing the direct and indirect +# inheritance relations. In case the CLASS_GRAPH tag is set to YES or GRAPH and +# HAVE_DOT is enabled as well, then dot will be used to draw the graph. In case +# the CLASS_GRAPH tag is set to YES and HAVE_DOT is disabled or if the +# CLASS_GRAPH tag is set to BUILTIN, then the built-in generator will be used. +# If the CLASS_GRAPH tag is set to TEXT the direct and indirect inheritance +# relations will be shown as texts / links. +# Possible values are: NO, YES, TEXT, GRAPH and BUILTIN. # The default value is: YES. -# This tag requires that the tag HAVE_DOT is set to YES. CLASS_GRAPH = YES # If the COLLABORATION_GRAPH tag is set to YES then doxygen will generate a # graph for each documented class showing the direct and indirect implementation # dependencies (inheritance, containment, and class references variables) of the -# class with other documented classes. +# class with other documented classes. Explicit enabling a collaboration graph, +# when COLLABORATION_GRAPH is set to NO, can be accomplished by means of the +# command \collaborationgraph. Disabling a collaboration graph can be +# accomplished by means of the command \hidecollaborationgraph. # The default value is: YES. # This tag requires that the tag HAVE_DOT is set to YES. COLLABORATION_GRAPH = YES # If the GROUP_GRAPHS tag is set to YES then doxygen will generate a graph for -# groups, showing the direct groups dependencies. +# groups, showing the direct groups dependencies. Explicit enabling a group +# dependency graph, when GROUP_GRAPHS is set to NO, can be accomplished by means +# of the command \groupgraph. Disabling a directory graph can be accomplished by +# means of the command \hidegroupgraph. See also the chapter Grouping in the +# manual. 
# The default value is: YES. # This tag requires that the tag HAVE_DOT is set to YES. @@ -2466,7 +2662,9 @@ TEMPLATE_RELATIONS = NO # If the INCLUDE_GRAPH, ENABLE_PREPROCESSING and SEARCH_INCLUDES tags are set to # YES then doxygen will generate a graph for each documented file showing the # direct and indirect include dependencies of the file with other documented -# files. +# files. Explicit enabling an include graph, when INCLUDE_GRAPH is is set to NO, +# can be accomplished by means of the command \includegraph. Disabling an +# include graph can be accomplished by means of the command \hideincludegraph. # The default value is: YES. # This tag requires that the tag HAVE_DOT is set to YES. @@ -2475,7 +2673,10 @@ INCLUDE_GRAPH = YES # If the INCLUDED_BY_GRAPH, ENABLE_PREPROCESSING and SEARCH_INCLUDES tags are # set to YES then doxygen will generate a graph for each documented file showing # the direct and indirect include dependencies of the file with other documented -# files. +# files. Explicit enabling an included by graph, when INCLUDED_BY_GRAPH is set +# to NO, can be accomplished by means of the command \includedbygraph. Disabling +# an included by graph can be accomplished by means of the command +# \hideincludedbygraph. # The default value is: YES. # This tag requires that the tag HAVE_DOT is set to YES. @@ -2515,16 +2716,26 @@ GRAPHICAL_HIERARCHY = YES # If the DIRECTORY_GRAPH tag is set to YES then doxygen will show the # dependencies a directory has on other directories in a graphical way. The # dependency relations are determined by the #include relations between the -# files in the directories. +# files in the directories. Explicit enabling a directory graph, when +# DIRECTORY_GRAPH is set to NO, can be accomplished by means of the command +# \directorygraph. Disabling a directory graph can be accomplished by means of +# the command \hidedirectorygraph. # The default value is: YES. # This tag requires that the tag HAVE_DOT is set to YES. DIRECTORY_GRAPH = YES +# The DIR_GRAPH_MAX_DEPTH tag can be used to limit the maximum number of levels +# of child directories generated in directory dependency graphs by dot. +# Minimum value: 1, maximum value: 25, default value: 1. +# This tag requires that the tag DIRECTORY_GRAPH is set to YES. + +DIR_GRAPH_MAX_DEPTH = 1 + # The DOT_IMAGE_FORMAT tag can be used to set the image format of the images # generated by dot. For an explanation of the image formats see the section # output formats in the documentation of the dot tool (Graphviz (see: -# http://www.graphviz.org/)). +# https://www.graphviz.org/)). # Note: If you choose svg you need to set HTML_FILE_EXTENSION to xhtml in order # to make the SVG files visible in IE 9+ (other browsers do not have this # requirement). @@ -2561,11 +2772,12 @@ DOT_PATH = DOTFILE_DIRS = -# The MSCFILE_DIRS tag can be used to specify one or more directories that -# contain msc files that are included in the documentation (see the \mscfile -# command). +# You can include diagrams made with dia in doxygen documentation. Doxygen will +# then run dia to produce the diagram and insert it in the documentation. The +# DIA_PATH tag allows you to specify the directory where the dia binary resides. +# If left empty dia is assumed to be found in the default search path. 
-MSCFILE_DIRS = +DIA_PATH = # The DIAFILE_DIRS tag can be used to specify one or more directories that # contain dia files that are included in the documentation (see the \diafile @@ -2574,10 +2786,10 @@ MSCFILE_DIRS = DIAFILE_DIRS = # When using plantuml, the PLANTUML_JAR_PATH tag should be used to specify the -# path where java can find the plantuml.jar file. If left blank, it is assumed -# PlantUML is not used or called during a preprocessing step. Doxygen will -# generate a warning when it encounters a \startuml command in this case and -# will not generate output for the diagram. +# path where java can find the plantuml.jar file or to the filename of jar file +# to be used. If left blank, it is assumed PlantUML is not used or called during +# a preprocessing step. Doxygen will generate a warning when it encounters a +# \startuml command in this case and will not generate output for the diagram. PLANTUML_JAR_PATH = @@ -2627,6 +2839,8 @@ DOT_MULTI_TARGETS = NO # If the GENERATE_LEGEND tag is set to YES doxygen will generate a legend page # explaining the meaning of the various boxes and arrows in the dot generated # graphs. +# Note: This tag requires that UML_LOOK isn't set, i.e. the doxygen internal +# graphical representation for inheritance and collaboration diagrams is used. # The default value is: YES. # This tag requires that the tag HAVE_DOT is set to YES. @@ -2640,3 +2854,19 @@ GENERATE_LEGEND = YES # The default value is: YES. DOT_CLEANUP = YES + +# You can define message sequence charts within doxygen comments using the \msc +# command. If the MSCGEN_TOOL tag is left empty (the default), then doxygen will +# use a built-in version of mscgen tool to produce the charts. Alternatively, +# the MSCGEN_TOOL tag can also specify the name an external tool. For instance, +# specifying prog as the value, doxygen will call the tool as prog -T +# -o . The external tool should support +# output file formats "png", "eps", "svg", and "ismap". + +MSCGEN_TOOL = + +# The MSCFILE_DIRS tag can be used to specify one or more directories that +# contain msc files that are included in the documentation (see the \mscfile +# command). + +MSCFILE_DIRS = diff --git a/docs/c_cxx/doxygen-header.html b/docs/c_cxx/doxygen-header.html index 364f76f7f0580..6d95bf57ff98f 100644 --- a/docs/c_cxx/doxygen-header.html +++ b/docs/c_cxx/doxygen-header.html @@ -16,7 +16,7 @@ - + $treeview $search $mathjax diff --git a/include/onnxruntime/core/common/status.h b/include/onnxruntime/core/common/status.h index d6e1992944feb..8f171daabbb1e 100644 --- a/include/onnxruntime/core/common/status.h +++ b/include/onnxruntime/core/common/status.h @@ -19,7 +19,6 @@ limitations under the License. #ifdef _WIN32 #include #endif -#include "core/common/gsl.h" namespace onnxruntime { namespace common { @@ -121,10 +120,8 @@ class [[nodiscard]] Status { Status(StatusCategory category, int code); - GSL_SUPPRESS(r.11) Status(const Status& other) : state_((other.state_ == nullptr) ? nullptr : new State(*other.state_)) {} - GSL_SUPPRESS(r.11) Status& operator=(const Status& other) { if (state_ != other.state_) { if (other.state_ == nullptr) { diff --git a/js/common/lib/inference-session.ts b/js/common/lib/inference-session.ts index ec030084c9675..71a5912df2464 100644 --- a/js/common/lib/inference-session.ts +++ b/js/common/lib/inference-session.ts @@ -66,6 +66,13 @@ export declare namespace InferenceSession { */ interOpNumThreads?: number; + /** + * The free dimension override. 
+ * + * This setting is available only in ONNXRuntime (Node.js binding and react-native) or WebAssembly backend + */ + freeDimensionOverrides?: {readonly [dimensionName: string]: number}; + /** * The optimization level. * diff --git a/js/node/script/prepack.ts b/js/node/script/prepack.ts index be86c5687bec0..4c5941d8dae12 100644 --- a/js/node/script/prepack.ts +++ b/js/node/script/prepack.ts @@ -11,7 +11,7 @@ function updatePackageJson() { const packageCommon = fs.readJSONSync(commonPackageJsonPath); const packageSelf = fs.readJSONSync(selfPackageJsonPath); const version = packageCommon.version; - packageSelf.dependencies['onnxruntime-common'] = `~${version}`; + packageSelf.dependencies['onnxruntime-common'] = `${version}`; fs.writeJSONSync(selfPackageJsonPath, packageSelf, {spaces: 2}); console.log('=== finished updating package.json.'); } diff --git a/js/react_native/android/src/androidTest/java/ai/onnxruntime/reactnative/TensorHelperTest.java b/js/react_native/android/src/androidTest/java/ai/onnxruntime/reactnative/TensorHelperTest.java index 76fd608e4362b..72518488e6682 100644 --- a/js/react_native/android/src/androidTest/java/ai/onnxruntime/reactnative/TensorHelperTest.java +++ b/js/react_native/android/src/androidTest/java/ai/onnxruntime/reactnative/TensorHelperTest.java @@ -238,6 +238,34 @@ public void createInputTensor_double() throws Exception { outputTensor.close(); } + @Test + public void createInputTensor_bool() throws Exception { + OnnxTensor outputTensor = OnnxTensor.createTensor(ortEnvironment, new boolean[] {false, true}); + + JavaOnlyMap inputTensorMap = new JavaOnlyMap(); + + JavaOnlyArray dims = new JavaOnlyArray(); + dims.pushInt(2); + inputTensorMap.putArray("dims", dims); + + inputTensorMap.putString("type", TensorHelper.JsTensorTypeBool); + + ByteBuffer dataByteBuffer = ByteBuffer.allocate(2); + dataByteBuffer.put((byte)0); + dataByteBuffer.put((byte)1); + inputTensorMap.putMap("data", blobModule.testCreateData(dataByteBuffer.array())); + + OnnxTensor inputTensor = TensorHelper.createInputTensor(blobModule, inputTensorMap, ortEnvironment); + + Assert.assertEquals(inputTensor.getInfo().onnxType, TensorInfo.OnnxTensorType.ONNX_TENSOR_ELEMENT_DATA_TYPE_BOOL); + Assert.assertEquals(outputTensor.getInfo().onnxType, TensorInfo.OnnxTensorType.ONNX_TENSOR_ELEMENT_DATA_TYPE_BOOL); + Assert.assertEquals(inputTensor.toString(), outputTensor.toString()); + Assert.assertArrayEquals(inputTensor.getByteBuffer().array(), outputTensor.getByteBuffer().array()); + + inputTensor.close(); + outputTensor.close(); + } + @Test public void createOutputTensor_bool() throws Exception { MockitoSession mockSession = mockitoSession().mockStatic(Arguments.class).startMocking(); diff --git a/js/react_native/android/src/main/java/ai/onnxruntime/reactnative/TensorHelper.java b/js/react_native/android/src/main/java/ai/onnxruntime/reactnative/TensorHelper.java index d9c2e3bac5d9b..63cddace36640 100644 --- a/js/react_native/android/src/main/java/ai/onnxruntime/reactnative/TensorHelper.java +++ b/js/react_native/android/src/main/java/ai/onnxruntime/reactnative/TensorHelper.java @@ -174,7 +174,11 @@ private static OnnxTensor createInputTensor(TensorInfo.OnnxTensorType tensorType tensor = OnnxTensor.createTensor(ortEnvironment, buffer, dims, OnnxJavaType.UINT8); break; } - case ONNX_TENSOR_ELEMENT_DATA_TYPE_BOOL: + case ONNX_TENSOR_ELEMENT_DATA_TYPE_BOOL: { + ByteBuffer buffer = values; + tensor = OnnxTensor.createTensor(ortEnvironment, buffer, dims, OnnxJavaType.BOOL); + break; + } case 
ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT16: case ONNX_TENSOR_ELEMENT_DATA_TYPE_UINT16: case ONNX_TENSOR_ELEMENT_DATA_TYPE_UINT32: diff --git a/js/react_native/scripts/prepack.ts b/js/react_native/scripts/prepack.ts index 15ae69722108c..2e43294165a83 100644 --- a/js/react_native/scripts/prepack.ts +++ b/js/react_native/scripts/prepack.ts @@ -18,7 +18,7 @@ function updatePackageJson() { delete packageSelf.dependencies['onnxruntime-common']; } else { const version = packageCommon.version; - packageSelf.dependencies['onnxruntime-common'] = `~${version}`; + packageSelf.dependencies['onnxruntime-common'] = `${version}`; } fs.writeJSONSync(selfPackageJsonPath, packageSelf, {spaces: 2}); console.log('=== finished updating package.json.'); diff --git a/js/web/docs/webgpu-operators.md b/js/web/docs/webgpu-operators.md index de53f943bc9ef..71d98f5d73671 100644 --- a/js/web/docs/webgpu-operators.md +++ b/js/web/docs/webgpu-operators.md @@ -59,6 +59,7 @@ Do not modify directly.* | Mul | ai.onnx(7-12,13,14+) | | | Neg | ai.onnx(6-12,13+) | | | Not | ai.onnx(1+) | | +| Pad | ai.onnx(2-10,11-12,13-17,18,19+) | | | Pow | ai.onnx(7-11,12,13-14,15+) | | | Reciprocal | ai.onnx(6-12,13+) | | | ReduceL1 | ai.onnx(1-10,11-12,13-17,18+) | | diff --git a/js/web/karma.conf.js b/js/web/karma.conf.js index 63c6f5bb045a7..35f782d1fdca3 100644 --- a/js/web/karma.conf.js +++ b/js/web/karma.conf.js @@ -3,10 +3,22 @@ 'use strict'; -const bundleMode = require('minimist')(process.argv)['bundle-mode'] || 'dev'; // 'dev'|'perf'|undefined; -const karmaPlugins = require('minimist')(process.argv)['karma-plugins'] || undefined; -const timeoutMocha = require('minimist')(process.argv)['timeout-mocha'] || 60000; -const forceLocalHost = !!require('minimist')(process.argv)['force-localhost']; +const args = require('minimist')(process.argv, {}); +const bundleMode = args['bundle-mode'] || 'dev'; // 'dev'|'perf'|undefined; +const karmaPlugins = args['karma-plugins'] || undefined; +const timeoutMocha = args['timeout-mocha'] || 60000; +const forceLocalHost = !!args['force-localhost']; + +// parse chromium flags +let chromiumFlags = args['chromium-flags']; +if (!chromiumFlags) { + chromiumFlags = []; +} else if (typeof chromiumFlags === 'string') { + chromiumFlags = [chromiumFlags]; +} else if (!Array.isArray(chromiumFlags)) { + throw new Error(`Invalid command line arg: --chromium-flags: ${chromiumFlags}`); +} + const commonFile = bundleMode === 'dev' ? '../common/dist/ort-common.js' : '../common/dist/ort-common.min.js' const mainFile = bundleMode === 'dev' ? 'test/ort.dev.js' : 'test/ort.perf.js'; @@ -91,37 +103,10 @@ module.exports = function(config) { listenAddress, customLaunchers: { // the following flags are used to make sure Edge on CI agents to initialize WebGPU correctly. 
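Note: the new --chromium-flags handling in karma.conf.js above folds whatever minimist produces into a string array. A hedged sketch of the behaviour it relies on; the command lines and the helper name are illustrative, not part of this change:

// Single use of the flag yields a string, repeated use yields an array:
//   --chromium-flags=--enable-dawn-features=use_dxc          -> '--enable-dawn-features=use_dxc'
//   --chromium-flags=--flag-a --chromium-flags=--flag-b      -> ['--flag-a', '--flag-b']
// Both shapes are normalized to string[] before being handed to the custom launchers.
const normalizeChromiumFlags = (value: string|string[]|undefined): string[] => {
  if (!value) {
    return [];
  } else if (typeof value === 'string') {
    return [value];
  } else if (Array.isArray(value)) {
    return value;
  }
  throw new Error(`Invalid command line arg: --chromium-flags: ${value}`);
};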
- EdgeWebGpuTest: {base: 'Edge', flags: ['--ignore-gpu-blocklist', '--gpu-vendor-id=0x10de']}, - ChromeTest: {base: 'Chrome', flags: ['--enable-features=SharedArrayBuffer']}, - ChromeTestHeadless: {base: 'ChromeHeadless', flags: ['--enable-features=SharedArrayBuffer']}, - ChromeDebug: - {debug: true, base: 'Chrome', flags: ['--remote-debugging-port=9333', '--enable-features=SharedArrayBuffer']}, - ChromeCanaryTest: { - base: 'ChromeCanary', - flags: ['--enable-features=SharedArrayBuffer', '--enable-experimental-web-platform-features'] - }, - ChromeCanaryDebug: { - debug: true, - base: 'ChromeCanary', - flags: [ - '--remote-debugging-port=9333', '--enable-features=SharedArrayBuffer', - '--enable-experimental-web-platform-features' - ] - }, - ChromeWebGpuProfileTest: { - base: 'Chrome', - flags: - ['--window-size=1,1', '--enable-features=SharedArrayBuffer', '--disable-dawn-features=disallow_unsafe_apis'] - }, - ChromeWebGpuProfileDebug: { - debug: true, - base: 'Chrome', - flags: [ - '--remote-debugging-port=9333', - '--enable-features=SharedArrayBuffer', - '--disable-dawn-features=disallow_unsafe_apis', - ] - }, + EdgeTest: {base: 'Edge', flags: chromiumFlags}, + ChromeTest: {base: 'Chrome', flags: chromiumFlags}, + ChromeTestHeadless: {base: 'ChromeHeadless', flags: chromiumFlags}, + ChromeCanaryTest: {base: 'ChromeCanary', flags: chromiumFlags}, // // ==== BrowserStack browsers ==== // diff --git a/js/web/lib/wasm/binding/ort-wasm.d.ts b/js/web/lib/wasm/binding/ort-wasm.d.ts index 7f0430b7b28b9..59da1369e152e 100644 --- a/js/web/lib/wasm/binding/ort-wasm.d.ts +++ b/js/web/lib/wasm/binding/ort-wasm.d.ts @@ -54,6 +54,7 @@ export interface OrtWasmModule extends EmscriptenModule { enableProfiling: boolean, profileFilePrefix: number, logId: number, logSeverityLevel: number, logVerbosityLevel: number, optimizedModelFilePath: number): number; _OrtAppendExecutionProvider(sessionOptionsHandle: number, name: number): number; + _OrtAddFreeDimensionOverride(sessionOptionsHandle: number, name: number, dim: number): number; _OrtAddSessionConfigEntry(sessionOptionsHandle: number, configKey: number, configValue: number): number; _OrtReleaseSessionOptions(sessionOptionsHandle: number): void; diff --git a/js/web/lib/wasm/jsep/backend-webgpu.ts b/js/web/lib/wasm/jsep/backend-webgpu.ts index 653957a9a3489..5e77a0343b4ee 100644 --- a/js/web/lib/wasm/jsep/backend-webgpu.ts +++ b/js/web/lib/wasm/jsep/backend-webgpu.ts @@ -4,7 +4,7 @@ import {Env} from 'onnxruntime-common'; import {configureLogger, LOG_DEBUG} from './log'; -import {TensorView} from './tensor'; +import {TensorView} from './tensor-view'; import {createGpuDataManager, GpuDataManager} from './webgpu/gpu-data-manager'; import {RunFunction, WEBGPU_OP_RESOLVE_RULES} from './webgpu/op-resolve-rules'; import {ProgramManager} from './webgpu/program-manager'; @@ -110,6 +110,7 @@ export class WebGpuBackend { } this.env = env; + const requiredFeatures: GPUFeatureName[] = []; const deviceDescriptor: GPUDeviceDescriptor = { requiredLimits: { maxComputeWorkgroupStorageSize: adapter.limits.maxComputeWorkgroupStorageSize, @@ -121,13 +122,16 @@ export class WebGpuBackend { maxComputeWorkgroupSizeY: adapter.limits.maxComputeWorkgroupSizeY, maxComputeWorkgroupSizeZ: adapter.limits.maxComputeWorkgroupSizeZ, }, + requiredFeatures, }; // WebGPU Spec: Timestamp Queries Inside Passes // https://github.com/gpuweb/gpuweb/blob/main/proposals/timestamp-query-inside-passes.md if (adapter.features.has('timestamp-query-inside-passes')) { this.supportTimestampQuery = true; - // 
eslint-disable-next-line @typescript-eslint/no-explicit-any - deviceDescriptor.requiredFeatures = ['timestamp-query-inside-passes' as any]; + requiredFeatures.push('timestamp-query-inside-passes' as GPUFeatureName); + } + if (adapter.features.has('shader-f16')) { + requiredFeatures.push('shader-f16'); } this.device = await adapter.requestDevice(deviceDescriptor); diff --git a/js/web/lib/wasm/jsep/init.ts b/js/web/lib/wasm/jsep/init.ts index 24ff79cfad3ee..78316cbe1c825 100644 --- a/js/web/lib/wasm/jsep/init.ts +++ b/js/web/lib/wasm/jsep/init.ts @@ -8,7 +8,7 @@ import {DataType, getTensorElementSize} from '../wasm-common'; import {WebGpuBackend} from './backend-webgpu'; import {LOG_DEBUG} from './log'; -import {TensorView} from './tensor'; +import {TensorView} from './tensor-view'; import {ShapeUtil} from './util'; import {ComputeContext, ComputeContextInputsOutputsMapping, ProgramInfo, ProgramInfoLoader} from './webgpu/types'; diff --git a/js/web/lib/wasm/jsep/tensor-view.ts b/js/web/lib/wasm/jsep/tensor-view.ts new file mode 100644 index 0000000000000..69b9287f6de29 --- /dev/null +++ b/js/web/lib/wasm/jsep/tensor-view.ts @@ -0,0 +1,39 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +import {Tensor} from 'onnxruntime-common'; + +import {tensorTypeToTypedArrayConstructor} from '../wasm-common'; + +export const createView = (dataBuffer: ArrayBuffer, type: Tensor.Type): Int32Array|Uint32Array|BigInt64Array| + BigUint64Array|Uint8Array|Float32Array|Float64Array|Int8Array|Int16Array|Uint16Array => + new (tensorTypeToTypedArrayConstructor(type))(dataBuffer); + +/** + * a TensorView does not own the data. + */ +export interface TensorView { + readonly data: number; + readonly dataType: number; + readonly dims: readonly number[]; + + /** + * get a Float32Array data view of the tensor data. tensor data must be on CPU. + */ + getFloat32Array(): Float32Array; + + /** + * get a BigInt64Array data view of the tensor data. tensor data must be on CPU. + */ + getBigInt64Array(): BigInt64Array; + + /** + * get a Int32Array data view of the tensor data. tensor data must be on CPU. + */ + getInt32Array(): Int32Array; + + /** + * create a new tensor view with the same data but different dimensions. + */ + reshape(newDims: readonly number[]): TensorView; +} diff --git a/js/web/lib/wasm/jsep/tensor.ts b/js/web/lib/wasm/jsep/tensor.ts deleted file mode 100644 index abe61e07fc0a8..0000000000000 --- a/js/web/lib/wasm/jsep/tensor.ts +++ /dev/null @@ -1,115 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT License. 
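Note: tensor-view.ts above only defines the contract; the concrete views are created elsewhere in the wasm/jsep glue. A minimal CPU-backed sketch, purely to illustrate how the interface is meant to be used (the class name and constructor are hypothetical):

class CpuTensorView implements TensorView {
  constructor(
      readonly data: number,              // opaque pointer/offset into the wasm heap
      readonly dataType: number,
      readonly dims: readonly number[],
      private readonly buffer: ArrayBuffer) {}

  getFloat32Array(): Float32Array { return new Float32Array(this.buffer); }
  getBigInt64Array(): BigInt64Array { return new BigInt64Array(this.buffer); }
  getInt32Array(): Int32Array { return new Int32Array(this.buffer); }

  reshape(newDims: readonly number[]): TensorView {
    // same backing data, new dimensions; callers are expected to keep the element count unchanged
    return new CpuTensorView(this.data, this.dataType, newDims, this.buffer);
  }
}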
- -export declare namespace Tensor { - export interface DataTypeMap { - bool: Uint8Array; - float32: Float32Array; - float64: Float64Array; - string: string[]; - int8: Int8Array; - uint8: Uint8Array; - int16: Int16Array; - uint16: Uint16Array; - int32: Int32Array; - uint32: Uint32Array; - int64: BigInt64Array; - uint64: BigUint64Array; - } - - export type DataType = keyof DataTypeMap; - - export type StringType = Tensor.DataTypeMap['string']; - export type BooleanType = Tensor.DataTypeMap['bool']; - export type IntegerType = Tensor.DataTypeMap['int8']|Tensor.DataTypeMap['uint8']|Tensor.DataTypeMap['int16']| - Tensor.DataTypeMap['uint16']|Tensor.DataTypeMap['int32']|Tensor.DataTypeMap['uint32']| - Tensor.DataTypeMap['int64']|Tensor.DataTypeMap['uint64']; - export type FloatType = Tensor.DataTypeMap['float32']|Tensor.DataTypeMap['float64']; - export type NumberType = BooleanType|IntegerType|FloatType; - - export type Id = number; -} - -export const sizeof = (type: Tensor.DataType): number => { - switch (type) { - case 'bool': - case 'int8': - case 'uint8': - return 1; - case 'int16': - case 'uint16': - return 2; - case 'int32': - case 'uint32': - case 'float32': - return 4; - case 'int64': - case 'uint64': - case 'float64': - return 8; - default: - throw new Error(`cannot calculate sizeof() on type ${type}`); - } -}; - -const dataviewConstructor = (type: Tensor.DataType) => { - switch (type) { - case 'bool': - case 'uint8': - return Uint8Array; - case 'int8': - return Int8Array; - case 'int16': - return Int16Array; - case 'uint16': - return Uint16Array; - case 'int32': - return Int32Array; - case 'uint32': - return Uint32Array; - case 'int64': - return BigInt64Array; - case 'uint64': - return BigUint64Array; - case 'float32': - return Float32Array; - case 'float64': - return Float64Array; - default: - // should never run to here - throw new Error('unspecified error'); - } -}; - -export const createView = (dataBuffer: ArrayBuffer, type: Tensor.DataType): Int32Array|Uint32Array|BigInt64Array| - BigUint64Array|Uint8Array|Float32Array|Float64Array|Int8Array|Int16Array|Uint16Array => - new (dataviewConstructor(type))(dataBuffer); - -/** - * a TensorView does not own the data. - */ -export interface TensorView { - readonly data: number; - readonly dataType: number; - readonly dims: readonly number[]; - - /** - * get a Float32Array data view of the tensor data. tensor data must be on CPU. - */ - getFloat32Array(): Float32Array; - - /** - * get a BigInt64Array data view of the tensor data. tensor data must be on CPU. - */ - getBigInt64Array(): BigInt64Array; - - /** - * get a Int32Array data view of the tensor data. tensor data must be on CPU. - */ - getInt32Array(): Int32Array; - - /** - * create a new tensor view with the same data but different dimensions. 
- */ - reshape(newDims: readonly number[]): TensorView; -} diff --git a/js/web/lib/wasm/jsep/webgpu/op-resolve-rules.ts b/js/web/lib/wasm/jsep/webgpu/op-resolve-rules.ts index 9c46b97694903..e92e6696d9a78 100644 --- a/js/web/lib/wasm/jsep/webgpu/op-resolve-rules.ts +++ b/js/web/lib/wasm/jsep/webgpu/op-resolve-rules.ts @@ -14,6 +14,7 @@ import {gemm, parseGemmAttributes} from './ops/gemm'; import {instanceNorm, parseInstanceNormAttributes} from './ops/instance-norm'; import {layerNorm, parseLayerNormAttributes} from './ops/layer-norm'; import {matMul} from './ops/matmul'; +import {pad, parsePadAttributes} from './ops/pad'; import * as pool from './ops/pool'; import {parseReduceAttributes, reduceL1, reduceL2, reduceLogSum, reduceLogSumExp, reduceMax, reduceMean, reduceMin, reduceProd, reduceSum, reduceSumSquare} from './ops/reduce'; import {parseResizeAttributes, resize} from './ops/resize'; @@ -80,6 +81,7 @@ export const WEBGPU_OP_RESOLVE_RULES: Map = new ['Mul', [binaryOps.mul]], ['Neg', [unaryOps.neg]], ['Not', [unaryOps.not]], + ['Pad', [pad, parsePadAttributes]], ['Pow', [binaryOps.pow]], ['Reciprocal', [unaryOps.reciprocal]], ['ReduceMin', [reduceMin, parseReduceAttributes]], diff --git a/js/web/lib/wasm/jsep/webgpu/ops/3rd-party/conv2d_mm_webgpu.ts b/js/web/lib/wasm/jsep/webgpu/ops/3rd-party/conv2d_mm_webgpu.ts index f07f0bbb84ee6..80a80b4c18619 100644 --- a/js/web/lib/wasm/jsep/webgpu/ops/3rd-party/conv2d_mm_webgpu.ts +++ b/js/web/lib/wasm/jsep/webgpu/ops/3rd-party/conv2d_mm_webgpu.ts @@ -20,7 +20,7 @@ // modified to fit the needs of the project import {LOG_DEBUG} from '../../../log'; -import {TensorView} from '../../../tensor'; +import {TensorView} from '../../../tensor-view'; import {ShapeUtil} from '../../../util'; import {GpuDataType, ProgramInfo, ProgramMetadata} from '../../types'; import {ConvAttributes} from '../conv'; diff --git a/js/web/lib/wasm/jsep/webgpu/ops/3rd-party/conv_backprop_webgpu.ts b/js/web/lib/wasm/jsep/webgpu/ops/3rd-party/conv_backprop_webgpu.ts index 82fe3d5b6af43..ec6df438129fb 100644 --- a/js/web/lib/wasm/jsep/webgpu/ops/3rd-party/conv_backprop_webgpu.ts +++ b/js/web/lib/wasm/jsep/webgpu/ops/3rd-party/conv_backprop_webgpu.ts @@ -18,7 +18,7 @@ // sampled from [@tensorflow/tfjs] tfjs-backend-webgpu/src/conv_backprop_webgpu.ts import {LOG_DEBUG} from '../../../log'; -import {TensorView} from '../../../tensor'; +import {TensorView} from '../../../tensor-view'; import {ShapeUtil} from '../../../util'; import {GpuDataType, ProgramInfo, ProgramMetadata} from '../../types'; import {inputVariable, outputVariable, ShaderHelper} from '../common'; diff --git a/js/web/lib/wasm/jsep/webgpu/ops/3rd-party/matmul_packed_webgpu.ts b/js/web/lib/wasm/jsep/webgpu/ops/3rd-party/matmul_packed_webgpu.ts index e70226e55ee79..2d6067fdbfa49 100644 --- a/js/web/lib/wasm/jsep/webgpu/ops/3rd-party/matmul_packed_webgpu.ts +++ b/js/web/lib/wasm/jsep/webgpu/ops/3rd-party/matmul_packed_webgpu.ts @@ -19,7 +19,7 @@ // // modified to fit the needs of the project -import {TensorView} from '../../../tensor'; +import {TensorView} from '../../../tensor-view'; import {ShapeUtil} from '../../../util'; import {GpuDataType, ProgramInfo, ProgramMetadata} from '../../types'; import { diff --git a/js/web/lib/wasm/jsep/webgpu/ops/argminmax.ts b/js/web/lib/wasm/jsep/webgpu/ops/argminmax.ts index 12a13d9d8e0a0..412e61a3cc0f9 100644 --- a/js/web/lib/wasm/jsep/webgpu/ops/argminmax.ts +++ b/js/web/lib/wasm/jsep/webgpu/ops/argminmax.ts @@ -6,7 +6,7 @@ // a optimized codepath for this. 
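Note: with the new ['Pad', [pad, parsePadAttributes]] entry registered in op-resolve-rules.ts above, the backend can resolve and run the kernel by op name. A hedged sketch of how such an entry could be consumed; the actual dispatch lives in the JSEP kernel glue and is not part of this diff (rawAttributes and context are placeholders):

const rule = WEBGPU_OP_RESOLVE_RULES.get('Pad');
if (!rule) {
  throw new Error('Pad is not registered for the WebGPU backend');
}
// entries are either [runFn] or [runFn, attributeParser]
const [run, parseAttributes] = rule;
const attributes = parseAttributes ? parseAttributes(rawAttributes) : undefined;
run(context, attributes);  // context: ComputeContext supplied by the backend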
import {DataType} from '../../../wasm-common'; -import {TensorView} from '../../tensor'; +import {TensorView} from '../../tensor-view'; import {AttributeWithCacheKey, createAttributeWithCacheKey} from '../attribute-with-cache-key'; import {ComputeContext, GpuDataType, ProgramInfoLoader, ProgramMetadata} from '../types'; diff --git a/js/web/lib/wasm/jsep/webgpu/ops/binary-op.ts b/js/web/lib/wasm/jsep/webgpu/ops/binary-op.ts index b004ca37a2ea8..13d3a91bb339e 100644 --- a/js/web/lib/wasm/jsep/webgpu/ops/binary-op.ts +++ b/js/web/lib/wasm/jsep/webgpu/ops/binary-op.ts @@ -2,7 +2,7 @@ // Licensed under the MIT License. import {DataType} from '../../../wasm-common'; -import {TensorView} from '../../tensor'; +import {TensorView} from '../../tensor-view'; import {BroadcastUtil, ShapeUtil} from '../../util'; import {ComputeContext, GpuDataType, ProgramInfo, ProgramInfoLoader, ProgramMetadata} from '../types'; diff --git a/js/web/lib/wasm/jsep/webgpu/ops/common.ts b/js/web/lib/wasm/jsep/webgpu/ops/common.ts index c96f4858db2ae..c054da51a3098 100644 --- a/js/web/lib/wasm/jsep/webgpu/ops/common.ts +++ b/js/web/lib/wasm/jsep/webgpu/ops/common.ts @@ -192,11 +192,14 @@ export interface IndicesHelper { } const getWgslMappedType = (type: number, components: 1|2|3|4): string|[string, string] => { + if (components === 3) { + throw new Error('vec3 has same alignment as vec4, use vec4 instead'); + } + // return type is [ storage type, runtime type ] or a single string for both switch (type) { - // TODO: enable after "shader-f16" WSGL extension release - // case DataType.float16: - // return components > 1 ? `vec${components}` : 'f16'; + case DataType.float16: + return components > 1 ? `vec${components}` : 'f16'; case DataType.float: return components > 1 ? `vec${components}` : 'f32'; case DataType.int32: @@ -589,7 +592,8 @@ class ShaderHelperImpl implements ShaderHelper { const workgroupSizeZ = typeof workgroupSize === 'number' ? 1 : workgroupSize[2]; const is1DimensionDispatch = this.normalizedDispatchGroup[1] === 1 && this.normalizedDispatchGroup[2] === 1; - const paramList = is1DimensionDispatch ? '@builtin(global_invocation_id) global_id : vec3' : + const paramList = is1DimensionDispatch ? `@builtin(global_invocation_id) global_id : vec3, + @builtin(local_invocation_id) local_id : vec3` : `@builtin(local_invocation_index) local_index : u32, @builtin(workgroup_id) workgroup_id : vec3`; const globalIdxDefinition = is1DimensionDispatch ? diff --git a/js/web/lib/wasm/jsep/webgpu/ops/concat.ts b/js/web/lib/wasm/jsep/webgpu/ops/concat.ts index 9b294803d3787..279632c190ded 100644 --- a/js/web/lib/wasm/jsep/webgpu/ops/concat.ts +++ b/js/web/lib/wasm/jsep/webgpu/ops/concat.ts @@ -1,7 +1,7 @@ // Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. -import {TensorView} from '../../tensor'; +import {TensorView} from '../../tensor-view'; import {ShapeUtil} from '../../util'; import {AttributeWithCacheKey, createAttributeWithCacheKey} from '../attribute-with-cache-key'; import {ComputeContext, GpuDataType, ProgramInfo, ProgramInfoLoader, ProgramMetadata} from '../types'; diff --git a/js/web/lib/wasm/jsep/webgpu/ops/conv-grouped.ts b/js/web/lib/wasm/jsep/webgpu/ops/conv-grouped.ts index 8a794ce16a0b5..1b7b7e0b29a25 100644 --- a/js/web/lib/wasm/jsep/webgpu/ops/conv-grouped.ts +++ b/js/web/lib/wasm/jsep/webgpu/ops/conv-grouped.ts @@ -1,7 +1,7 @@ // Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. 
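Note: the getWgslMappedType change above enables f16 and rejects 3-component vectors. A small illustrative sketch of the same mapping rule and the reason behind the vec3 restriction (the function name is hypothetical):

// In WGSL, vec3<f32> has 16-byte alignment in storage buffers, so an array element of
// vec3 occupies the same space as vec4; mapping 3 components to vec3 would create silent
// padding mismatches between CPU-side buffers and the shader's view of them.
function toWgslType(scalar: 'f16'|'f32'|'i32'|'u32', components: 1|2|3|4): string {
  if (components === 3) {
    throw new Error('vec3 has same alignment as vec4, use vec4 instead');
  }
  return components === 1 ? scalar : `vec${components}<${scalar}>`;
}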
-import {TensorView} from '../../tensor'; +import {TensorView} from '../../tensor-view'; import {ShapeUtil} from '../../util'; import {GpuDataType, ProgramInfo, ProgramInfoLoader, ProgramMetadata} from '../types'; diff --git a/js/web/lib/wasm/jsep/webgpu/ops/conv-transpose.ts b/js/web/lib/wasm/jsep/webgpu/ops/conv-transpose.ts index 7503f664dfc13..3274969970a91 100644 --- a/js/web/lib/wasm/jsep/webgpu/ops/conv-transpose.ts +++ b/js/web/lib/wasm/jsep/webgpu/ops/conv-transpose.ts @@ -1,7 +1,8 @@ // Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. -import {TensorView} from '../../tensor'; +import {DataType} from '../../../wasm-common'; +import {TensorView} from '../../tensor-view'; import {createAttributeWithCacheKey} from '../attribute-with-cache-key'; import {ComputeContext, GpuDataType, ProgramInfoLoader, ProgramMetadata} from '../types'; diff --git a/js/web/lib/wasm/jsep/webgpu/ops/conv.ts b/js/web/lib/wasm/jsep/webgpu/ops/conv.ts index f205d4a06b176..a9642d85ede8f 100644 --- a/js/web/lib/wasm/jsep/webgpu/ops/conv.ts +++ b/js/web/lib/wasm/jsep/webgpu/ops/conv.ts @@ -1,7 +1,8 @@ // Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. -import {TensorView} from '../../tensor'; +import {DataType} from '../../../wasm-common'; +import {TensorView} from '../../tensor-view'; import {PoolConvUtil} from '../../util'; import {AttributeWithCacheKey, createAttributeWithCacheKey} from '../attribute-with-cache-key'; import {ComputeContext} from '../types'; diff --git a/js/web/lib/wasm/jsep/webgpu/ops/conv2d-mm.ts b/js/web/lib/wasm/jsep/webgpu/ops/conv2d-mm.ts index 0abece9559630..21c0b97042fbb 100644 --- a/js/web/lib/wasm/jsep/webgpu/ops/conv2d-mm.ts +++ b/js/web/lib/wasm/jsep/webgpu/ops/conv2d-mm.ts @@ -1,7 +1,7 @@ // Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. -import {TensorView} from '../../tensor'; +import {TensorView} from '../../tensor-view'; import {GpuDataType, ProgramInfoLoader, ProgramMetadata} from '../types'; import {createConv2DMatMulProgramInfo} from './3rd-party/conv2d_mm_webgpu'; diff --git a/js/web/lib/wasm/jsep/webgpu/ops/einsum.ts b/js/web/lib/wasm/jsep/webgpu/ops/einsum.ts index f0196f37c3153..fc9ebf004ad25 100644 --- a/js/web/lib/wasm/jsep/webgpu/ops/einsum.ts +++ b/js/web/lib/wasm/jsep/webgpu/ops/einsum.ts @@ -1,7 +1,7 @@ // Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. -import {TensorView} from '../../tensor'; +import {TensorView} from '../../tensor-view'; import {ShapeUtil} from '../../util'; import {AttributeWithCacheKey, createAttributeWithCacheKey} from '../attribute-with-cache-key'; import {ComputeContext, GpuDataType, ProgramInfo, ProgramInfoLoader, ProgramMetadata} from '../types'; diff --git a/js/web/lib/wasm/jsep/webgpu/ops/expand.ts b/js/web/lib/wasm/jsep/webgpu/ops/expand.ts index 2d845775f1c62..824ce682c0c4b 100644 --- a/js/web/lib/wasm/jsep/webgpu/ops/expand.ts +++ b/js/web/lib/wasm/jsep/webgpu/ops/expand.ts @@ -1,7 +1,7 @@ // Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. 
-import {TensorView} from '../../tensor'; +import {TensorView} from '../../tensor-view'; import {ShapeUtil} from '../../util'; import {ComputeContext, GpuDataType, ProgramInfo, ProgramMetadata} from '../types'; diff --git a/js/web/lib/wasm/jsep/webgpu/ops/gather-elements.ts b/js/web/lib/wasm/jsep/webgpu/ops/gather-elements.ts index 57c5fccfd8c26..a7d355bc13704 100644 --- a/js/web/lib/wasm/jsep/webgpu/ops/gather-elements.ts +++ b/js/web/lib/wasm/jsep/webgpu/ops/gather-elements.ts @@ -1,7 +1,7 @@ // Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. -import {TensorView} from '../../tensor'; +import {TensorView} from '../../tensor-view'; import {ShapeUtil} from '../../util'; import {AttributeWithCacheKey, createAttributeWithCacheKey} from '../attribute-with-cache-key'; import {ComputeContext, GpuDataType, ProgramInfo, ProgramMetadata} from '../types'; diff --git a/js/web/lib/wasm/jsep/webgpu/ops/gather.ts b/js/web/lib/wasm/jsep/webgpu/ops/gather.ts index a915a4bbd969c..0db060dbec54a 100644 --- a/js/web/lib/wasm/jsep/webgpu/ops/gather.ts +++ b/js/web/lib/wasm/jsep/webgpu/ops/gather.ts @@ -2,7 +2,7 @@ // Licensed under the MIT License. import {DataType} from '../../../wasm-common'; -import {TensorView} from '../../tensor'; +import {TensorView} from '../../tensor-view'; import {ShapeUtil} from '../../util'; import {AttributeWithCacheKey, createAttributeWithCacheKey} from '../attribute-with-cache-key'; import {ComputeContext, GpuDataType, ProgramInfo, ProgramMetadata} from '../types'; diff --git a/js/web/lib/wasm/jsep/webgpu/ops/gemm.ts b/js/web/lib/wasm/jsep/webgpu/ops/gemm.ts index 3ce963b54f3ee..1a36d4a7545d6 100644 --- a/js/web/lib/wasm/jsep/webgpu/ops/gemm.ts +++ b/js/web/lib/wasm/jsep/webgpu/ops/gemm.ts @@ -1,7 +1,7 @@ // Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. -import {TensorView} from '../../tensor'; +import {TensorView} from '../../tensor-view'; import {GemmUtil, ShapeUtil} from '../../util'; import {AttributeWithCacheKey, createAttributeWithCacheKey} from '../attribute-with-cache-key'; import {ComputeContext, GpuDataType, ProgramInfo, ProgramInfoLoader, ProgramMetadata} from '../types'; diff --git a/js/web/lib/wasm/jsep/webgpu/ops/instance-norm.ts b/js/web/lib/wasm/jsep/webgpu/ops/instance-norm.ts index f62c766aa9ed0..5a148bda0a9f7 100644 --- a/js/web/lib/wasm/jsep/webgpu/ops/instance-norm.ts +++ b/js/web/lib/wasm/jsep/webgpu/ops/instance-norm.ts @@ -1,83 +1,97 @@ // Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. 
-import {DataType} from '../../../wasm-common'; -import {TensorView} from '../../tensor'; +import {TensorView} from '../../tensor-view'; import {ShapeUtil} from '../../util'; import {AttributeWithCacheKey, createAttributeWithCacheKey} from '../attribute-with-cache-key'; import {ComputeContext, GpuDataType, ProgramInfo, ProgramMetadata} from '../types'; -import {ShaderHelper, tensorTypeToWsglStorageType} from './common'; +import {inputVariable, outputVariable, ShaderHelper, tensorTypeToWsglStorageType} from './common'; export interface InstanceNormAttributes extends AttributeWithCacheKey { epsilon: number; format: 'NHWC'|'NCHW'; } -const validateInputs = (inputs: readonly TensorView[]): void => { - if (!inputs || inputs.length !== 3) { - throw new Error('instanceNorm requires 3 inputs.'); - } - - if (inputs[0].dataType !== DataType.float || inputs[1].dataType !== DataType.float) { - throw new Error('inputs should be float type'); - } -}; - const createInstanceNormProgramInfo = (metadata: ProgramMetadata, inputs: readonly TensorView[], attributes: InstanceNormAttributes): ProgramInfo => { const xShape = inputs[0].dims; - const scale = inputs[1]; - const bias = inputs[2]; const outputShape = xShape; - const outputSize = ShapeUtil.size(outputShape); const axis = 2; const normCount = ShapeUtil.sizeToDimension(xShape, axis); const normSize = ShapeUtil.sizeFromDimension(xShape, axis); const C = xShape[1]; - - const scaleSize = ShapeUtil.size(scale.dims); - const biasSize = bias ? ShapeUtil.size(bias.dims) : 0; - if (scaleSize !== normSize || (bias && biasSize !== normSize)) { - throw new Error(`Size of X.shape()[axis:] == ${normSize}. - Size of scale and bias (if provided) must match this. - Got scale size of ${scaleSize} and bias size of ${biasSize}`); - } - - const dataType = tensorTypeToWsglStorageType(inputs[0].dataType); - + const x = inputVariable('x', inputs[0].dataType, [xShape[0], xShape[1], normSize]); + const scale = inputVariable('scale', inputs[1].dataType, inputs[1].dims); + const bias = inputVariable('bias', inputs[2].dataType, inputs[2].dims); + const output = outputVariable('output', inputs[0].dataType, [xShape[0], xShape[1], normSize]); + const variables = [x, scale, bias, output]; + const dataType = x.type.value; + const workgroupSize = 64; const getShaderSource = (shaderHelper: ShaderHelper) => ` + const C: u32 = ${C}; const normSize: u32 = ${normSize}; - const normSizeTyped: ${dataType} = ${normSize}; const epsilon: f32 = ${attributes.epsilon}; + var meanShared : ${dataType}; + var squaredNormShared : ${dataType}; + var workgroupShared : array<${dataType}, ${workgroupSize}>; + const workgroupSize = ${workgroupSize}u; + ${shaderHelper.declareVariables(...variables)} + ${shaderHelper.mainStart(workgroupSize)} + let norm = global_idx / workgroupSize; + let batch = norm / C; + let channel = norm % C; + let localIndex = local_id.x; + + // initialize workgroup memory + var initial: ${dataType} = 0; + for (var h = localIndex; h < normSize; h += workgroupSize) { + initial = initial + ${x.get('batch', 'channel', 'h')}; + } + workgroupShared[localIndex] = initial; + workgroupBarrier(); - @group(0) @binding(0) var x : array<${dataType}>; - @group(0) @binding(1) var scale : array<${dataType}>; - @group(0) @binding(2) var bias : array<${dataType}>; - @group(0) @binding(3) var output : array<${dataType}>; - - ${shaderHelper.mainStart()} - let offset = global_idx * normSize; - if (offset + normSize >= ${outputSize}) { return; } - var mean: ${dataType} = 0; + // Calculate the mean of current 
channel data. + for (var currSize = workgroupSize >> 1; currSize > 0; currSize = currSize >> 1) { + if (localIndex < currSize) { + workgroupShared[localIndex] = workgroupShared[localIndex] + workgroupShared[localIndex + currSize]; + } + workgroupBarrier(); + } + if (localIndex == 0) { + meanShared = workgroupShared[0] / ${dataType}(normSize); + } + workgroupBarrier(); - for (var h: u32 = 0u; h < normSize; h++) { - mean = mean + x[h + offset]; + // reinitialize workgroup memory. + initial = 0; + for (var h = localIndex; h < normSize; h += workgroupSize) { + let deviation = ${x.get('batch', 'channel', 'h')} - meanShared; + initial = initial + deviation * deviation; } - mean = mean / normSizeTyped; + workgroupShared[localIndex] = initial; + workgroupBarrier(); - var squaredNorm: ${dataType} = 0; - for (var h: u32 = 0u; h < normSize; h++) { - let deviation: f32 = x[h + offset] - mean; - squaredNorm = squaredNorm + deviation * deviation; + // Calculate the sum of square of deviation of current channel data. + for (var currSize = workgroupSize >> 1; currSize > 0; currSize = currSize >> 1) { + if (localIndex < currSize) { + workgroupShared[localIndex] = workgroupShared[localIndex] + workgroupShared[localIndex + currSize]; + } + workgroupBarrier(); } - let invStdDev = 1 / sqrt(squaredNorm / normSizeTyped + epsilon); - let channelScale = invStdDev * scale[global_idx % C]; - let channelShift = bias[global_idx % C] - mean * channelScale; - for (var j: u32 = 0; j < normSize; j++) { - output[j + offset] = x[j + offset] * channelScale + channelShift; + if (localIndex == 0) { + squaredNormShared = workgroupShared[0]; + } + workgroupBarrier(); + + let invStdDev = 1 / sqrt(squaredNormShared / ${dataType}(normSize) + epsilon); + let channelScale = invStdDev * ${scale.getByOffset('channel')}; + let channelShift = ${bias.getByOffset('channel')} - meanShared * channelScale; + for (var h = localIndex; h < normSize; h += workgroupSize) { + let value = ${x.get('batch', 'channel', 'h')} * channelScale + channelShift; + ${output.set('batch', 'channel', 'h', 'value')}; } }`; return { @@ -86,7 +100,7 @@ const createInstanceNormProgramInfo = {dims: outputShape, dataType: inputs[0].dataType, gpuDataType: GpuDataType.default}, ], getShaderSource, - dispatchGroup: () => ({x: Math.ceil(normCount / 64 /* workgroup size */)}) + dispatchGroup: () => ({x: normCount}) }; }; @@ -118,7 +132,7 @@ const createInstanceNormNHWCProgramInfo = ${shaderHelper.mainStart()} let currentImageNumber = global_idx / C; let currentChannelNumber = global_idx % C; - + // offset is channel num * N let offset = currentImageNumber * imageSize; if (offset >= ${outputSize}) { return; } @@ -156,8 +170,6 @@ export const parseInstanceNormAttributes = (attributes: InstanceNormAttributes): createAttributeWithCacheKey({epsilon: attributes.epsilon, format: attributes.format}); export const instanceNorm = (context: ComputeContext, attributes: InstanceNormAttributes): void => { - validateInputs(context.inputs); - const metadata = { name: 'InstanceNormalization', inputTypes: [GpuDataType.default, GpuDataType.default, GpuDataType.default], diff --git a/js/web/lib/wasm/jsep/webgpu/ops/layer-norm.ts b/js/web/lib/wasm/jsep/webgpu/ops/layer-norm.ts index 8a9927b25a52e..d6a79e9460c3f 100644 --- a/js/web/lib/wasm/jsep/webgpu/ops/layer-norm.ts +++ b/js/web/lib/wasm/jsep/webgpu/ops/layer-norm.ts @@ -2,7 +2,7 @@ // Licensed under the MIT License. 
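Note: the rewritten InstanceNormalization shader above computes the per-channel mean and variance with a shared-memory tree reduction instead of a single-threaded loop. A TypeScript stand-in for that reduction pattern, assuming the workgroup size is a power of two (64 in the shader):

// Each of the `workgroupSize` threads first accumulates a strided partial sum, then the
// partials are combined by halving the active range every step (a workgroupBarrier()
// separates the steps on the GPU).
function workgroupReduceSum(partials: Float32Array): number {
  const shared = Float32Array.from(partials);          // models `workgroupShared`
  for (let currSize = shared.length >> 1; currSize > 0; currSize >>= 1) {
    for (let localIndex = 0; localIndex < currSize; localIndex++) {
      shared[localIndex] += shared[localIndex + currSize];
    }
  }
  return shared[0];
}
// mean = workgroupReduceSum(partialSums) / normSize; the variance reuses the same
// reduction over squared deviations, then invStdDev = 1 / sqrt(variance + epsilon).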
import {DataType} from '../../../wasm-common'; -import {TensorView} from '../../tensor'; +import {TensorView} from '../../tensor-view'; import {ShapeUtil} from '../../util'; import {AttributeWithCacheKey, createAttributeWithCacheKey} from '../attribute-with-cache-key'; import {ComputeContext, GpuDataType, ProgramInfo, ProgramMetadata} from '../types'; diff --git a/js/web/lib/wasm/jsep/webgpu/ops/matmul.ts b/js/web/lib/wasm/jsep/webgpu/ops/matmul.ts index 8ed41bc09480d..bceaf244987c4 100644 --- a/js/web/lib/wasm/jsep/webgpu/ops/matmul.ts +++ b/js/web/lib/wasm/jsep/webgpu/ops/matmul.ts @@ -1,7 +1,8 @@ // Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. -import {TensorView} from '../../tensor'; +import {DataType} from '../../../wasm-common'; +import {TensorView} from '../../tensor-view'; import {BroadcastUtil} from '../../util'; import {ComputeContext, GpuDataType, ProgramInfoLoader} from '../types'; diff --git a/js/web/lib/wasm/jsep/webgpu/ops/pad.ts b/js/web/lib/wasm/jsep/webgpu/ops/pad.ts new file mode 100644 index 0000000000000..c2f89fd2845df --- /dev/null +++ b/js/web/lib/wasm/jsep/webgpu/ops/pad.ts @@ -0,0 +1,252 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +import {DataType} from '../../../wasm-common'; +import {TensorView} from '../../tensor-view'; +import {ShapeUtil} from '../../util'; +import {AttributeWithCacheKey, createAttributeWithCacheKey} from '../attribute-with-cache-key'; +import {ComputeContext, GpuDataType, ProgramInfo, ProgramInfoLoader, ProgramMetadata} from '../types'; + +import {IndicesHelper, inputVariable, outputVariable, ShaderHelper} from './common'; + +export interface PadAttributes extends AttributeWithCacheKey { + // 0-constant, 1-reflect, 2-edge, 3-wrap + readonly mode: number; + readonly value: number; + readonly pads: number[]; +} + +const validateInputs = (inputs: readonly TensorView[]): void => { + if (!inputs || inputs.length < 1) { + throw new Error('Too few inputs'); + } + if (inputs[0].dataType !== DataType.float) { + throw new Error('Input type must be float.'); + } + + if (inputs.length >= 2) { + let validPads = inputs[0].dims.length * 2 === inputs[1].dims[0]; + if (inputs.length === 4) { + validPads = inputs[3].dims[0] * 2 === inputs[1].dims[0]; + } + if (!validPads) { + throw new Error('The pads should be a 1D tensor of shape [2 * input_rank] or [2 * num_axes].'); + } + } +}; + +const getPadConstant = + (output: IndicesHelper, outputDims: readonly number[], inputDims: readonly number[], + inputStrides: readonly number[], pads: number[], dataType: string, constantValue: number): string => { + const inputRank = inputDims.length; + + let block = ''; + for (let i = inputRank - 1; i >= 0; --i) { + block += ` + k = i32(${output.indicesGet('indices', i)}) - ${pads[i]}; + if (k < 0) { + break; + } + if (k >= ${inputDims[i]}) { + break; + } + offset += k * ${inputStrides[i]}; + `; + } + + return ` + value = ${dataType}(${constantValue}); + for (var i = 0; i < 1; i++) { + var offset = 0; + var k = 0; + ${block} + value = x[offset]; + } + `; + }; + +const getPadReflect = + (output: IndicesHelper, outputDims: readonly number[], inputDims: readonly number[], + inputStrides: readonly number[], pads: number[]): string => { + const inputRank = inputDims.length; + + let block = ''; + for (let i = inputRank - 1; i >= 0; --i) { + block += ` + k = i32(${output.indicesGet('indices', i)}) - ${pads[i]}; + if (k < 0) { + k = -k; + } + { + let _2n_1 = ${2 * (inputDims[i] - 
1)}; + k = k % _2n_1; + if(k >= ${inputDims[i]}) { + k = _2n_1 - k; + } + } + offset += k * ${inputStrides[i]}; + `; + } + + return ` + var offset = 0; + var k = 0; + ${block} + value = x[offset]; + `; + }; + +const getPadEdge = + (output: IndicesHelper, outputDims: readonly number[], inputDims: readonly number[], + inputStrides: readonly number[], pads: number[]): string => { + const inputRank = inputDims.length; + + let block = ''; + for (let i = inputRank - 1; i >= 0; --i) { + block += ` + k = i32(${output.indicesGet('indices', i)}) - ${pads[i]}; + if (k < 0) { + k = 0; + } + if (k >= ${inputDims[i]}) { + k = ${inputDims[i] - 1}; + } + offset += k * ${inputStrides[i]}; + `; + } + + return ` + var offset = 0; + var k = 0; + ${block} + value = x[offset]; + `; + }; + +const getPadWrap = + (output: IndicesHelper, outputDims: readonly number[], inputDims: readonly number[], + inputStrides: readonly number[], pads: number[]): string => { + const inputRank = inputDims.length; + + let block = ''; + for (let i = inputRank - 1; i >= 0; --i) { + block += ` + k = i32(${output.indicesGet('indices', i)}) - ${pads[i]}; + if (k < 0) { + k += ${inputDims[i]}; + } + if (k >= ${inputDims[i]}) { + k -= ${inputDims[i]}; + } + offset += k * ${inputStrides[i]}; + `; + } + + return ` + var offset = 0; + var k = 0; + ${block} + value = x[offset]; + `; + }; + +const getPadSnippet = + (output: IndicesHelper, outputDims: readonly number[], inputDims: readonly number[], + inputStrides: readonly number[], attributes: PadAttributes, dataType: string): string => { + switch (attributes.mode) { + case 0: + return getPadConstant( + output, outputDims, inputDims, inputStrides, attributes.pads, dataType, attributes.value); + case 1: + return getPadReflect(output, outputDims, inputDims, inputStrides, attributes.pads); + case 2: + return getPadEdge(output, outputDims, inputDims, inputStrides, attributes.pads); + case 3: + return getPadWrap(output, outputDims, inputDims, inputStrides, attributes.pads); + default: + throw new Error('Invalid mode'); + } + }; + +const generatePadCode = + (shaderHelper: ShaderHelper, inputs: readonly TensorView[], attributes: PadAttributes, dataType: string): + string => { + const inputDims = inputs[0].dims; + const outputDims = ShapeUtil.padShape(inputDims.slice(), attributes.pads); + const outputSize = ShapeUtil.size(outputDims); + const inputStrides = ShapeUtil.computeStrides(inputDims); + + const output = outputVariable('output', inputs[0].dataType, outputDims); + const input = inputVariable('x', inputs[0].dataType, inputDims); + + const padSnippet = getPadSnippet(output, outputDims, inputDims, inputStrides, attributes, dataType); + const padCode = ` + ${shaderHelper.declareVariables(input, output)} + ${output.impl()} + ${shaderHelper.mainStart()} + ${shaderHelper.guardAgainstOutOfBoundsWorkgroupSizes(outputSize)} + + let indices = ${output.offsetToIndices('global_idx')}; + + var value = ${dataType}(0); + ${padSnippet} + output[global_idx] = value; + }`; + return padCode; + }; + +const createPadProgramInfo = + (inputs: readonly TensorView[], metadata: ProgramMetadata, attributes: PadAttributes): ProgramInfo => { + const outputShape = ShapeUtil.padShape(inputs[0].dims.slice(), attributes.pads); + return { + ...metadata, + outputs: [{dims: outputShape, dataType: inputs[0].dataType, gpuDataType: GpuDataType.default}], + getShaderSource: shaderHelper => generatePadCode(shaderHelper, inputs, attributes, 'f32'), + dispatchGroup: () => ({x: Math.ceil(ShapeUtil.size(outputShape) / 64 /* workgroup 
size */)}) + }; + }; + +const createPadAttributesFromInputs = (inputs: readonly TensorView[], attributes: PadAttributes): PadAttributes => { + if (inputs.length > 1) { + const bigInt64Pads = inputs[1].getBigInt64Array(); + const value = (inputs.length >= 3) ? inputs[2].getFloat32Array()[0] : 0.0; + + const inputRank = inputs[0].dims.length; + const updatePads = new Int32Array(2 * inputRank).fill(0); + if (inputs.length >= 4) { + const axes = inputs[3].getBigInt64Array(); + for (let i = 0; i < axes.length; i++) { + updatePads[Number(axes[i])] = Number(bigInt64Pads[i]); + updatePads[Number(axes[i]) + inputRank] = Number(bigInt64Pads[i + axes.length]); + } + } else { + bigInt64Pads.forEach((i, v) => updatePads[Number(i)] = (Number(v))); + } + + const pads: number[] = []; + updatePads.forEach(v => pads.push(v)); + + return createAttributeWithCacheKey({mode: attributes.mode, value, pads}); + } else { + return attributes; + } +}; + +const createPadProgramInfoLoader = (inputs: readonly TensorView[], attributes: PadAttributes): ProgramInfoLoader => { + const updatedAttributes = createPadAttributesFromInputs(inputs, attributes); + const metadata: + ProgramMetadata = {name: 'Pad', inputTypes: [GpuDataType.default], cacheHint: updatedAttributes.cacheKey}; + return {...metadata, get: () => createPadProgramInfo(inputs, metadata, updatedAttributes)}; +}; + +export const pad = (context: ComputeContext, attributes: PadAttributes): void => { + validateInputs(context.inputs); + context.compute(createPadProgramInfoLoader(context.inputs, attributes), {inputs: [0]}); +}; + +export const parsePadAttributes = (attributes: Record): PadAttributes => { + const mode = attributes.mode as number; + const value = attributes.value as number; + const pads = attributes.pads as number[]; + return createAttributeWithCacheKey({mode, value, pads}); +}; diff --git a/js/web/lib/wasm/jsep/webgpu/ops/pool.ts b/js/web/lib/wasm/jsep/webgpu/ops/pool.ts index 79071d32443d6..8c8c12fc54ddb 100644 --- a/js/web/lib/wasm/jsep/webgpu/ops/pool.ts +++ b/js/web/lib/wasm/jsep/webgpu/ops/pool.ts @@ -2,7 +2,7 @@ // Licensed under the MIT License. import {DataType} from '../../../wasm-common'; -import {TensorView} from '../../tensor'; +import {TensorView} from '../../tensor-view'; import {PoolConvUtil, ShapeUtil} from '../../util'; import {AttributeWithCacheKey, createAttributeWithCacheKey} from '../attribute-with-cache-key'; import {ComputeContext, GpuDataType, ProgramInfo, ProgramMetadata} from '../types'; diff --git a/js/web/lib/wasm/jsep/webgpu/ops/reduce.ts b/js/web/lib/wasm/jsep/webgpu/ops/reduce.ts index cb592c838dd97..0b8d03ea73b6b 100644 --- a/js/web/lib/wasm/jsep/webgpu/ops/reduce.ts +++ b/js/web/lib/wasm/jsep/webgpu/ops/reduce.ts @@ -2,7 +2,7 @@ // Licensed under the MIT License. import {DataType} from '../../../wasm-common'; -import {TensorView} from '../../tensor'; +import {TensorView} from '../../tensor-view'; import {ShapeUtil} from '../../util'; import {AttributeWithCacheKey, createAttributeWithCacheKey} from '../attribute-with-cache-key'; import {ComputeContext, GpuDataType, ProgramInfo, ProgramInfoLoader, ProgramMetadata} from '../types'; diff --git a/js/web/lib/wasm/jsep/webgpu/ops/resize.ts b/js/web/lib/wasm/jsep/webgpu/ops/resize.ts index 1d0b8229a76f7..8b9dbbf57ac75 100644 --- a/js/web/lib/wasm/jsep/webgpu/ops/resize.ts +++ b/js/web/lib/wasm/jsep/webgpu/ops/resize.ts @@ -2,7 +2,7 @@ // Licensed under the MIT License. 
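Note: the new Pad kernel relies on a pads layout of [begin_0, begin_1, ..., end_0, end_1, ...] (length 2 * rank), and ShapeUtil.padShape is assumed to expand each axis accordingly. A hedged sketch of that computation with a tiny worked example:

function padShape(inputDims: readonly number[], pads: readonly number[]): number[] {
  const rank = inputDims.length;
  // per axis: output = input + leading pad + trailing pad
  return inputDims.map((dim, i) => dim + pads[i] + pads[i + rank]);
}
// padShape([2, 3], [0, 1, 0, 2]) -> [2, 6]  (1 element before and 2 after on axis 1)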
-import {TensorView} from '../../tensor'; +import {TensorView} from '../../tensor-view'; import {ShapeUtil} from '../../util'; import {AttributeWithCacheKey, createAttributeWithCacheKey} from '../attribute-with-cache-key'; import {ComputeContext, GpuDataType, ProgramInfo, ProgramInfoLoader, ProgramMetadata} from '../types'; diff --git a/js/web/lib/wasm/jsep/webgpu/ops/skip-layer-norm.ts b/js/web/lib/wasm/jsep/webgpu/ops/skip-layer-norm.ts index 4b845bcf2121b..7bfdd73b8af18 100644 --- a/js/web/lib/wasm/jsep/webgpu/ops/skip-layer-norm.ts +++ b/js/web/lib/wasm/jsep/webgpu/ops/skip-layer-norm.ts @@ -2,7 +2,7 @@ // Licensed under the MIT License. import {DataType} from '../../../wasm-common'; -import {TensorView} from '../../tensor'; +import {TensorView} from '../../tensor-view'; import {ShapeUtil} from '../../util'; import {AttributeWithCacheKey, createAttributeWithCacheKey} from '../attribute-with-cache-key'; import {ComputeContext, GpuDataType, ProgramInfo, ProgramInfoLoader, ProgramMetadata} from '../types'; diff --git a/js/web/lib/wasm/jsep/webgpu/ops/slice.ts b/js/web/lib/wasm/jsep/webgpu/ops/slice.ts index 4211e526898e6..257b9ebc1fdac 100644 --- a/js/web/lib/wasm/jsep/webgpu/ops/slice.ts +++ b/js/web/lib/wasm/jsep/webgpu/ops/slice.ts @@ -2,7 +2,7 @@ // Licensed under the MIT License. import {DataType} from '../../../wasm-common'; -import {TensorView} from '../../tensor'; +import {TensorView} from '../../tensor-view'; import {ShapeUtil} from '../../util'; import {AttributeWithCacheKey, createAttributeWithCacheKey} from '../attribute-with-cache-key'; import {ComputeContext, GpuDataType, ProgramInfo, ProgramInfoLoader, ProgramMetadata, TensorInfo} from '../types'; diff --git a/js/web/lib/wasm/jsep/webgpu/ops/softmax.ts b/js/web/lib/wasm/jsep/webgpu/ops/softmax.ts index e2443b24410a5..495a4bcea4f47 100644 --- a/js/web/lib/wasm/jsep/webgpu/ops/softmax.ts +++ b/js/web/lib/wasm/jsep/webgpu/ops/softmax.ts @@ -5,7 +5,7 @@ // performance limitations when the reduced axis is long. Need to add // a optimized codepath for this. -import {TensorView} from '../../tensor'; +import {TensorView} from '../../tensor-view'; import {ShapeUtil} from '../../util'; import {AttributeWithCacheKey, createAttributeWithCacheKey} from '../attribute-with-cache-key'; import {ComputeContext, GpuDataType, ProgramInfo} from '../types'; diff --git a/js/web/lib/wasm/jsep/webgpu/ops/split.ts b/js/web/lib/wasm/jsep/webgpu/ops/split.ts index 9a150d21ea02e..3367091bbac23 100644 --- a/js/web/lib/wasm/jsep/webgpu/ops/split.ts +++ b/js/web/lib/wasm/jsep/webgpu/ops/split.ts @@ -1,7 +1,7 @@ // Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. -import {TensorView} from '../../tensor'; +import {TensorView} from '../../tensor-view'; import {ShapeUtil} from '../../util'; import {AttributeWithCacheKey, createAttributeWithCacheKey} from '../attribute-with-cache-key'; import {ComputeContext, GpuDataType, ProgramInfo, ProgramInfoLoader, ProgramMetadata, TensorInfo} from '../types'; diff --git a/js/web/lib/wasm/jsep/webgpu/ops/tile.ts b/js/web/lib/wasm/jsep/webgpu/ops/tile.ts index 99d9668757caa..109c29bfc8a80 100644 --- a/js/web/lib/wasm/jsep/webgpu/ops/tile.ts +++ b/js/web/lib/wasm/jsep/webgpu/ops/tile.ts @@ -2,7 +2,7 @@ // Licensed under the MIT License. 
import {DataType} from '../../../wasm-common'; -import {TensorView} from '../../tensor'; +import {TensorView} from '../../tensor-view'; import {ShapeUtil} from '../../util'; import {ComputeContext, GpuDataType, ProgramInfo, ProgramMetadata} from '../types'; diff --git a/js/web/lib/wasm/jsep/webgpu/ops/transpose.ts b/js/web/lib/wasm/jsep/webgpu/ops/transpose.ts index 9243b0e4af6b6..38dcaeab54c54 100644 --- a/js/web/lib/wasm/jsep/webgpu/ops/transpose.ts +++ b/js/web/lib/wasm/jsep/webgpu/ops/transpose.ts @@ -1,7 +1,7 @@ // Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. -import {TensorView} from '../../tensor'; +import {TensorView} from '../../tensor-view'; import {ShapeUtil} from '../../util'; import {AttributeWithCacheKey, createAttributeWithCacheKey} from '../attribute-with-cache-key'; import {ComputeContext, GpuDataType, ProgramInfo} from '../types'; diff --git a/js/web/lib/wasm/jsep/webgpu/ops/unary-op.ts b/js/web/lib/wasm/jsep/webgpu/ops/unary-op.ts index ef63d1177768c..7e52954734216 100644 --- a/js/web/lib/wasm/jsep/webgpu/ops/unary-op.ts +++ b/js/web/lib/wasm/jsep/webgpu/ops/unary-op.ts @@ -2,7 +2,7 @@ // Licensed under the MIT License. import {DataType} from '../../../wasm-common'; -import {TensorView} from '../../tensor'; +import {TensorView} from '../../tensor-view'; import {MAX_CLIP, MIN_CLIP, ShapeUtil} from '../../util'; import {AttributeWithCacheKey, createAttributeWithCacheKey} from '../attribute-with-cache-key'; import {ComputeContext, GpuDataType, ProgramInfo, ProgramInfoLoader, ProgramMetadata} from '../types'; diff --git a/js/web/lib/wasm/jsep/webgpu/program-manager.ts b/js/web/lib/wasm/jsep/webgpu/program-manager.ts index a02d2ebeebf78..cf2687e4c7382 100644 --- a/js/web/lib/wasm/jsep/webgpu/program-manager.ts +++ b/js/web/lib/wasm/jsep/webgpu/program-manager.ts @@ -4,7 +4,7 @@ import {tensorDataTypeEnumToString} from '../../wasm-common'; import {WebGpuBackend} from '../backend-webgpu'; import {LOG_DEBUG} from '../log'; -import {TensorView} from '../tensor'; +import {TensorView} from '../tensor-view'; import {createShaderHelper} from './ops/common'; import {Artifact, GpuData, ProgramInfo} from './types'; @@ -126,10 +126,13 @@ export class ProgramManager { } build(programInfo: ProgramInfo, normalizedDispatchGroupSize: [number, number, number]): Artifact { const device = this.backend.device; - + const extensions: string[] = []; + if (device.features.has('shader-f16')) { + extensions.push('enable f16;'); + } const shaderHelper = createShaderHelper(normalizedDispatchGroupSize); const userCode = programInfo.getShaderSource(shaderHelper); - const code = `${shaderHelper.additionalImplementations}\n${userCode}`; + const code = `${extensions.join('\n')}\n${shaderHelper.additionalImplementations}\n${userCode}`; const shaderModule = device.createShaderModule({code, label: programInfo.name}); LOG_DEBUG('verbose', () => `[WebGPU] shader code: ${code}`); diff --git a/js/web/lib/wasm/jsep/webgpu/types.ts b/js/web/lib/wasm/jsep/webgpu/types.ts index ddbb9afc275f2..78f80b89774e2 100644 --- a/js/web/lib/wasm/jsep/webgpu/types.ts +++ b/js/web/lib/wasm/jsep/webgpu/types.ts @@ -1,7 +1,7 @@ // Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. 
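Note: the ProgramManager change above prepends WGSL extension directives to the generated shader body when the device advertises 'shader-f16'. A hedged, self-contained sketch of that assembly step (the function name and its parameters are placeholders):

function assembleShader(device: GPUDevice, additionalImplementations: string, userCode: string): string {
  const extensions: string[] = [];
  if (device.features.has('shader-f16')) {
    extensions.push('enable f16;');  // WGSL 'enable' directives must precede any declarations
  }
  return `${extensions.join('\n')}\n${additionalImplementations}\n${userCode}`;
}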
-import {Tensor, TensorView} from '../tensor'; +import {TensorView} from '../tensor-view'; import {ShaderHelper} from './ops/common'; @@ -19,7 +19,6 @@ export interface GpuData { } export interface TensorInfo { - id?: Tensor.Id; dims: readonly number[]; dataType: number; gpuDataType: GpuDataType; diff --git a/js/web/lib/wasm/session-handler.ts b/js/web/lib/wasm/session-handler.ts index d35f295592685..d8c5ae7886fe4 100644 --- a/js/web/lib/wasm/session-handler.ts +++ b/js/web/lib/wasm/session-handler.ts @@ -9,6 +9,7 @@ import {SerializableModeldata} from './proxy-messages'; import {createSession, createSessionAllocate, createSessionFinalize, endProfiling, initializeRuntime, releaseSession, run} from './proxy-wrapper'; let runtimeInitialized: boolean; +let runtimeInitializationPromise: Promise|undefined; export class OnnxruntimeWebAssemblySessionHandler implements SessionHandler { private sessionId: number; @@ -29,7 +30,11 @@ export class OnnxruntimeWebAssemblySessionHandler implements SessionHandler { async loadModel(pathOrBuffer: string|Uint8Array, options?: InferenceSession.SessionOptions): Promise { if (!runtimeInitialized) { - await initializeRuntime(env); + if (!runtimeInitializationPromise) { + runtimeInitializationPromise = initializeRuntime(env); + } + await runtimeInitializationPromise; + runtimeInitializationPromise = undefined; runtimeInitialized = true; } diff --git a/js/web/lib/wasm/session-options.ts b/js/web/lib/wasm/session-options.ts index 38caa9076e3c0..2659b471733f5 100644 --- a/js/web/lib/wasm/session-options.ts +++ b/js/web/lib/wasm/session-options.ts @@ -143,6 +143,21 @@ export const setSessionOptions = (options?: InferenceSession.SessionOptions): [n setExecutionProviders(sessionOptionsHandle, sessionOptions.executionProviders, allocs); } + if (sessionOptions.freeDimensionOverrides) { + for (const [name, value] of Object.entries(sessionOptions.freeDimensionOverrides)) { + if (typeof name !== 'string') { + throw new Error(`free dimension override name must be a string: ${name}`); + } + if (typeof value !== 'number' || !Number.isInteger(value) || value < 0) { + throw new Error(`free dimension override value must be a non-negative integer: ${value}`); + } + const nameOffset = allocWasmString(name, allocs); + if (wasm._OrtAddFreeDimensionOverride(sessionOptionsHandle, nameOffset, value) !== 0) { + checkLastError(`Can't set a free dimension override: ${name} - ${value}.`); + } + } + } + if (sessionOptions.extra !== undefined) { iterateExtraOptions(sessionOptions.extra, '', new WeakSet>(), (key, value) => { const keyDataOffset = allocWasmString(key, allocs); diff --git a/js/web/script/prepack.ts b/js/web/script/prepack.ts index be86c5687bec0..4c5941d8dae12 100644 --- a/js/web/script/prepack.ts +++ b/js/web/script/prepack.ts @@ -11,7 +11,7 @@ function updatePackageJson() { const packageCommon = fs.readJSONSync(commonPackageJsonPath); const packageSelf = fs.readJSONSync(selfPackageJsonPath); const version = packageCommon.version; - packageSelf.dependencies['onnxruntime-common'] = `~${version}`; + packageSelf.dependencies['onnxruntime-common'] = `${version}`; fs.writeJSONSync(selfPackageJsonPath, packageSelf, {spaces: 2}); console.log('=== finished updating package.json.'); } diff --git a/js/web/script/test-runner-cli-args.ts b/js/web/script/test-runner-cli-args.ts index e34529fa1037d..7b41850948149 100644 --- a/js/web/script/test-runner-cli-args.ts +++ b/js/web/script/test-runner-cli-args.ts @@ -80,6 +80,7 @@ Options: --no-sandbox This flag will be passed to Chrome. 
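Note: with the session-options.ts wiring above, session options can now pin named free dimensions before the model is optimized. A hedged usage sketch (the model path and dimension names are hypothetical; values must be non-negative integers, matching the validation added above):

import * as ort from 'onnxruntime-web';

async function createSession(): Promise<ort.InferenceSession> {
  return ort.InferenceSession.create('model.onnx', {
    // pins symbolic dims so shapes are fully known at session creation time
    freeDimensionOverrides: {batch_size: 1, sequence_length: 128},
  });
}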
Sometimes Chrome need this flag to work together with Karma. + --chromium-flags=<...> This flag will be passed to Chrome and Edge browsers. Can be used multiple times. Examples: @@ -173,6 +174,7 @@ export interface TestRunnerCliArgs { webglOptions?: InferenceSession.WebGLExecutionProviderOption; globalEnvFlags?: Test.Options['globalEnvFlags']; noSandbox?: boolean; + chromiumFlags: string[]; } @@ -439,6 +441,17 @@ export function parseTestRunnerCliArgs(cmdlineArgs: string[]): TestRunnerCliArgs // Option: --no-sandbox const noSandbox = !!args['no-sandbox']; + // parse chromium flags + let chromiumFlags = args['chromium-flags']; + if (!chromiumFlags) { + chromiumFlags = []; + } else if (typeof chromiumFlags === 'string') { + chromiumFlags = [chromiumFlags]; + } else if (!Array.isArray(chromiumFlags)) { + throw new Error(`Invalid command line arg: --chromium-flags: ${chromiumFlags}`); + } + + npmlog.verbose('TestRunnerCli.Init', ` Mode: ${mode}`); npmlog.verbose('TestRunnerCli.Init', ` Env: ${env}`); npmlog.verbose('TestRunnerCli.Init', ` Debug: ${debug}`); @@ -462,6 +475,7 @@ export function parseTestRunnerCliArgs(cmdlineArgs: string[]): TestRunnerCliArgs webglOptions, wasmOptions, globalEnvFlags, - noSandbox + noSandbox, + chromiumFlags }; } diff --git a/js/web/script/test-runner-cli.ts b/js/web/script/test-runner-cli.ts index fa4312ee0aaf3..520ef62b2c719 100644 --- a/js/web/script/test-runner-cli.ts +++ b/js/web/script/test-runner-cli.ts @@ -475,10 +475,12 @@ async function main() { args.bundleMode === 'perf' ? 'perf' : args.debug ? 'debug' : 'test', - webgpu, webnn, config.options.globalEnvFlags?.webgpu?.profilingMode === 'default'); + webgpu, webnn); const karmaArgs = ['karma', 'start', `--browsers ${browser}`]; + const chromiumFlags = ['--enable-features=SharedArrayBuffer', ...args.chromiumFlags]; if (args.debug) { karmaArgs.push('--log-level info --timeout-mocha 9999999'); + chromiumFlags.push('--remote-debugging-port=9333'); } else { karmaArgs.push('--single-run'); } @@ -488,7 +490,22 @@ async function main() { if (webgpu || webnn) { karmaArgs.push('--force-localhost'); } + if (webgpu) { + if (browser.includes('Canary')) { + chromiumFlags.push('--enable-dawn-features=allow_unsafe_apis,use_dxc'); + } else { + chromiumFlags.push('--enable-dawn-features=use_dxc'); + chromiumFlags.push('--disable-dawn-features=disallow_unsafe_apis'); + } + } + if (webnn) { + chromiumFlags.push('--enable-experimental-web-platform-features'); + } + if (config.options.globalEnvFlags?.webgpu?.profilingMode === 'default') { + chromiumFlags.push('--disable-dawn-features=disallow_unsafe_apis'); + } karmaArgs.push(`--bundle-mode=${args.bundleMode}`); + karmaArgs.push(...chromiumFlags.map(flag => `--chromium-flags=${flag}`)); if (browser.startsWith('Edge')) { // There are currently 2 Edge browser launchers: // - karma-edge-launcher: used to launch the old Edge browser @@ -580,12 +597,12 @@ async function main() { } function getBrowserNameFromEnv( - env: TestRunnerCliArgs['env'], mode: 'debug'|'perf'|'test', webgpu: boolean, webnn: boolean, profile: boolean) { + env: TestRunnerCliArgs['env'], mode: 'debug'|'perf'|'test', webgpu: boolean, webnn: boolean) { switch (env) { case 'chrome': - return selectChromeBrowser(mode, webgpu, webnn, profile); + return selectChromeBrowser(mode, webgpu, webnn); case 'edge': - return webgpu ? 
'EdgeWebGpuTest' : 'Edge'; + return 'EdgeTest'; case 'firefox': return 'Firefox'; case 'electron': @@ -599,25 +616,14 @@ async function main() { } } - function selectChromeBrowser(mode: 'debug'|'perf'|'test', webgpu: boolean, webnn: boolean, profile: boolean) { - if (webgpu) { - switch (mode) { - case 'debug': - return profile ? 'ChromeWebGpuProfileDebug' : 'ChromeDebug'; - default: - return profile ? 'ChromeWebGpuProfileTest' : 'ChromeTest'; - } - } else if (webnn) { - switch (mode) { - case 'debug': - return 'ChromeCanaryDebug'; - default: - return 'ChromeCanaryTest'; - } + function selectChromeBrowser(mode: 'debug'|'perf'|'test', webgpu: boolean, webnn: boolean) { + if (webnn) { + return 'ChromeCanaryTest'; + } else if (webgpu) { + return 'ChromeTest'; } else { switch (mode) { case 'debug': - return 'ChromeDebug'; case 'perf': return 'ChromeTest'; default: diff --git a/js/web/test/data/ops/concat_int32.jsonc b/js/web/test/data/ops/concat_int32.jsonc new file mode 100644 index 0000000000000..6e2ce18c6f7c5 --- /dev/null +++ b/js/web/test/data/ops/concat_int32.jsonc @@ -0,0 +1,406 @@ +[ + { + "name": "Concat 2D axis=0", + "operator": "Concat", + "attributes": [{ "name": "axis", "data": 0, "type": "int" }], + "cases": [ + { + "name": "[4,4]", + "inputs": [ + { + "data": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16], + "dims": [4, 4], + "type": "int32" + }, + { + "data": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16], + "dims": [4, 4], + "type": "int32" + } + ], + "outputs": [ + { + "data": [ + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 16 + ], + "dims": [8, 4], + "type": "int32" + } + ] + }, + { + "name": "[2,4]", + "inputs": [ + { + "data": [1, 2, 5, 6, 3, 4, 7, 8], + "dims": [2, 4], + "type": "int32" + }, + { + "data": [1, 2, 3, 4, 5, 6, 7, 8], + "dims": [2, 4], + "type": "int32" + } + ], + "outputs": [ + { + "data": [1, 2, 5, 6, 3, 4, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8], + "dims": [4, 4], + "type": "int32" + } + ] + }, + { + "name": "[2,3]", + "inputs": [ + { + "data": [1, 2, 3, 4, 5, 6], + "dims": [2, 3], + "type": "int32" + }, + { + "data": [1, 2, 3, 4, 5, 6], + "dims": [2, 3], + "type": "int32" + } + ], + "outputs": [ + { + "data": [1, 2, 3, 4, 5, 6, 1, 2, 3, 4, 5, 6], + "dims": [4, 3], + "type": "int32" + } + ] + } + ] + }, + { + "name": "Concat 2D axis=1", + "operator": "Concat", + "attributes": [{ "name": "axis", "data": 1, "type": "int" }], + "cases": [ + { + "name": "[4,4]", + "inputs": [ + { + "data": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16], + "dims": [4, 4], + "type": "int32" + }, + { + "data": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16], + "dims": [4, 4], + "type": "int32" + } + ], + "outputs": [ + { + "data": [ + 1, 2, 3, 4, 1, 2, 3, 4, 5, 6, 7, 8, 5, 6, 7, 8, 9, 10, 11, 12, 9, 10, 11, 12, 13, 14, 15, 16, 13, 14, 15, + 16 + ], + "dims": [4, 8], + "type": "int32" + } + ] + }, + { + "name": "[2,4]", + "inputs": [ + { + "data": [1, 2, 5, 6, 3, 4, 7, 8], + "dims": [2, 4], + "type": "int32" + }, + { + "data": [1, 2, 3, 4, 5, 6, 7, 8], + "dims": [2, 4], + "type": "int32" + } + ], + "outputs": [ + { + "data": [1, 2, 5, 6, 1, 2, 3, 4, 3, 4, 7, 8, 5, 6, 7, 8], + "dims": [2, 8], + "type": "int32" + } + ] + }, + { + "name": "[2,3]", + "inputs": [ + { + "data": [1, 2, 3, 4, 5, 6], + "dims": [2, 3], + "type": "int32" + }, + { + "data": [1, 2, 3, 4, 5, 6], + "dims": [2, 3], + "type": "int32" + } + ], + "outputs": [ + { + "data": [1, 2, 3, 1, 2, 3, 4, 5, 6, 4, 5, 6], + "dims": [2, 6], + 
"type": "int32" + } + ] + } + ] + }, + { + "name": "Concat 3D axis=0", + "operator": "Concat", + "attributes": [{ "name": "axis", "data": 0, "type": "int" }], + "cases": [ + { + "name": "[2,2,4]", + "inputs": [ + { + "data": [1, 2, 5, 6, 3, 4, 7, 8, 9, 10, 13, 14, 11, 12, 15, 16], + "dims": [2, 2, 4], + "type": "int32" + }, + { + "data": [1, 2, 5, 6, 3, 4, 7, 8, 9, 10, 13, 14, 11, 12, 15, 16], + "dims": [2, 2, 4], + "type": "int32" + } + ], + "outputs": [ + { + "data": [ + 1, 2, 5, 6, 3, 4, 7, 8, 9, 10, 13, 14, 11, 12, 15, 16, 1, 2, 5, 6, 3, 4, 7, 8, 9, 10, 13, 14, 11, 12, 15, + 16 + ], + "dims": [4, 2, 4], + "type": "int32" + } + ] + } + ] + }, + { + "name": "Concat 3D axis=1", + "operator": "Concat", + "attributes": [{ "name": "axis", "data": 1, "type": "int" }], + "cases": [ + { + "name": "[2,2,4]", + "inputs": [ + { + "data": [1, 2, 5, 6, 3, 4, 7, 8, 9, 10, 13, 14, 11, 12, 15, 16], + "dims": [2, 2, 4], + "type": "int32" + }, + { + "data": [1, 2, 5, 6, 3, 4, 7, 8, 9, 10, 13, 14, 11, 12, 15, 16], + "dims": [2, 2, 4], + "type": "int32" + } + ], + "outputs": [ + { + "data": [ + 1, 2, 5, 6, 3, 4, 7, 8, 1, 2, 5, 6, 3, 4, 7, 8, 9, 10, 13, 14, 11, 12, 15, 16, 9, 10, 13, 14, 11, 12, 15, + 16 + ], + "dims": [2, 4, 4], + "type": "int32" + } + ] + } + ] + }, + { + "name": "Concat 3D axis=2", + "operator": "Concat", + "attributes": [{ "name": "axis", "data": 2, "type": "int" }], + "cases": [ + { + "name": "[2,2,4]", + "inputs": [ + { + "data": [1, 2, 5, 6, 3, 4, 7, 8, 9, 10, 13, 14, 11, 12, 15, 16], + "dims": [2, 2, 4], + "type": "int32" + }, + { + "data": [1, 2, 5, 6, 3, 4, 7, 8, 9, 10, 13, 14, 11, 12, 15, 16], + "dims": [2, 2, 4], + "type": "int32" + } + ], + "outputs": [ + { + "data": [ + 1, 2, 5, 6, 1, 2, 5, 6, 3, 4, 7, 8, 3, 4, 7, 8, 9, 10, 13, 14, 9, 10, 13, 14, 11, 12, 15, 16, 11, 12, 15, + 16 + ], + "dims": [2, 2, 8], + "type": "int32" + } + ] + } + ] + }, + { + "name": "Concat 4D axis=0", + "operator": "Concat", + "attributes": [{ "name": "axis", "data": 0, "type": "int" }], + "cases": [ + { + "name": "[2,2,2,4]", + "inputs": [ + { + "data": [ + 1, 2, 5, 6, 3, 4, 7, 8, 9, 10, 13, 14, 11, 12, 15, 16, 17, 18, 21, 22, 19, 20, 23, 24, 25, 26, 29, 30, 27, + 28, 31, 32 + ], + "dims": [2, 2, 2, 4], + "type": "int32" + }, + { + "data": [ + 1, 2, 5, 6, 3, 4, 7, 8, 9, 10, 13, 14, 11, 12, 15, 16, 17, 18, 21, 22, 19, 20, 23, 24, 25, 26, 29, 30, 27, + 28, 31, 32 + ], + "dims": [2, 2, 2, 4], + "type": "int32" + } + ], + "outputs": [ + { + "data": [ + 1, 2, 5, 6, 3, 4, 7, 8, 9, 10, 13, 14, 11, 12, 15, 16, 17, 18, 21, 22, 19, 20, 23, 24, 25, 26, 29, 30, 27, + 28, 31, 32, 1, 2, 5, 6, 3, 4, 7, 8, 9, 10, 13, 14, 11, 12, 15, 16, 17, 18, 21, 22, 19, 20, 23, 24, 25, 26, + 29, 30, 27, 28, 31, 32 + ], + "dims": [4, 2, 2, 4], + "type": "int32" + } + ] + } + ] + }, + { + "name": "Concat 4D axis=1", + "operator": "Concat", + "attributes": [{ "name": "axis", "data": 1, "type": "int" }], + "cases": [ + { + "name": "[2,2,2,4]", + "inputs": [ + { + "data": [ + 1, 2, 5, 6, 3, 4, 7, 8, 9, 10, 13, 14, 11, 12, 15, 16, 17, 18, 21, 22, 19, 20, 23, 24, 25, 26, 29, 30, 27, + 28, 31, 32 + ], + "dims": [2, 2, 2, 4], + "type": "int32" + }, + { + "data": [ + 1, 2, 5, 6, 3, 4, 7, 8, 9, 10, 13, 14, 11, 12, 15, 16, 17, 18, 21, 22, 19, 20, 23, 24, 25, 26, 29, 30, 27, + 28, 31, 32 + ], + "dims": [2, 2, 2, 4], + "type": "int32" + } + ], + "outputs": [ + { + "data": [ + 1, 2, 5, 6, 3, 4, 7, 8, 9, 10, 13, 14, 11, 12, 15, 16, 1, 2, 5, 6, 3, 4, 7, 8, 9, 10, 13, 14, 11, 12, 15, + 16, 17, 18, 21, 22, 19, 20, 23, 24, 25, 26, 29, 30, 27, 28, 31, 
32, 17, 18, 21, 22, 19, 20, 23, 24, 25, + 26, 29, 30, 27, 28, 31, 32 + ], + "dims": [2, 4, 2, 4], + "type": "int32" + } + ] + } + ] + }, + { + "name": "Concat 4D axis=2", + "operator": "Concat", + "attributes": [{ "name": "axis", "data": 2, "type": "int" }], + "cases": [ + { + "name": "[2,2,2,4]", + "inputs": [ + { + "data": [ + 1, 2, 5, 6, 3, 4, 7, 8, 9, 10, 13, 14, 11, 12, 15, 16, 17, 18, 21, 22, 19, 20, 23, 24, 25, 26, 29, 30, 27, + 28, 31, 32 + ], + "dims": [2, 2, 2, 4], + "type": "int32" + }, + { + "data": [ + 1, 2, 5, 6, 3, 4, 7, 8, 9, 10, 13, 14, 11, 12, 15, 16, 17, 18, 21, 22, 19, 20, 23, 24, 25, 26, 29, 30, 27, + 28, 31, 32 + ], + "dims": [2, 2, 2, 4], + "type": "int32" + } + ], + "outputs": [ + { + "data": [ + 1, 2, 5, 6, 3, 4, 7, 8, 1, 2, 5, 6, 3, 4, 7, 8, 9, 10, 13, 14, 11, 12, 15, 16, 9, 10, 13, 14, 11, 12, 15, + 16, 17, 18, 21, 22, 19, 20, 23, 24, 17, 18, 21, 22, 19, 20, 23, 24, 25, 26, 29, 30, 27, 28, 31, 32, 25, + 26, 29, 30, 27, 28, 31, 32 + ], + "dims": [2, 2, 4, 4], + "type": "int32" + } + ] + } + ] + }, + { + "name": "Concat 4D axis=3", + "operator": "Concat", + "attributes": [{ "name": "axis", "data": 3, "type": "int" }], + "cases": [ + { + "name": "[2,2,2,4]", + "inputs": [ + { + "data": [ + 1, 2, 5, 6, 3, 4, 7, 8, 9, 10, 13, 14, 11, 12, 15, 16, 17, 18, 21, 22, 19, 20, 23, 24, 25, 26, 29, 30, 27, + 28, 31, 32 + ], + "dims": [2, 2, 2, 4], + "type": "int32" + }, + { + "data": [ + 1, 2, 5, 6, 3, 4, 7, 8, 9, 10, 13, 14, 11, 12, 15, 16, 17, 18, 21, 22, 19, 20, 23, 24, 25, 26, 29, 30, 27, + 28, 31, 32 + ], + "dims": [2, 2, 2, 4], + "type": "int32" + } + ], + "outputs": [ + { + "data": [ + 1, 2, 5, 6, 1, 2, 5, 6, 3, 4, 7, 8, 3, 4, 7, 8, 9, 10, 13, 14, 9, 10, 13, 14, 11, 12, 15, 16, 11, 12, 15, + 16, 17, 18, 21, 22, 17, 18, 21, 22, 19, 20, 23, 24, 19, 20, 23, 24, 25, 26, 29, 30, 25, 26, 29, 30, 27, + 28, 31, 32, 27, 28, 31, 32 + ], + "dims": [2, 2, 2, 8], + "type": "int32" + } + ] + } + ] + } +] diff --git a/js/web/test/data/ops/instance-norm.jsonc b/js/web/test/data/ops/instance-norm.jsonc new file mode 100644 index 0000000000000..6a4e6912405ee --- /dev/null +++ b/js/web/test/data/ops/instance-norm.jsonc @@ -0,0 +1,79 @@ +[ + { + "name": "Simple test with NHWC", + "operator": "InstanceNormalization", + "inputShapeDefinitions": "rankOnly", + "opset": { "domain": "", "version": 17 }, + "cases": [ + { + "name": "Simple test", + "inputs": [ + { + "data": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 9, 8, 7, 6, 5, 4], + "dims": [1, 4, 2, 2], + "type": "float32" + }, + { + "data": [1, 2, 3, 4], + "dims": [4], + "type": "float32" + }, + { + "data": [4, 5, 6, 7], + "dims": [4], + "type": "float32" + } + ], + "outputs": [ + { + "data": [ + 2.6583645343780518, 3.552788257598877, 4.447211742401123, 5.341635704040527, 2.3167295455932617, + 4.105576515197754, 5.8944244384765625, 7.683271408081055, 6, 10.242595672607422, 6, 1.7574005126953125, + 12.36654281616211, 8.788846969604492, 5.211153030395508, 1.633458137512207 + ], + "dims": [1, 4, 2, 2], + "type": "float32" + } + ] + } + ] + }, + { + "name": "Simple test with NCHW", + "operator": "InstanceNormalization", + "opset": { "domain": "", "version": 17 }, + "cases": [ + { + "name": "Simple test", + "inputs": [ + { + "data": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 9, 8, 7, 6, 5, 4], + "dims": [1, 4, 2, 2], + "type": "float32" + }, + { + "data": [1, 2, 3, 4], + "dims": [4], + "type": "float32" + }, + { + "data": [4, 5, 6, 7], + "dims": [4], + "type": "float32" + } + ], + "outputs": [ + { + "data": [ + 2.6583645343780518, 3.552788257598877, 4.447211742401123, 
5.341635704040527, 2.3167295455932617, + 4.105576515197754, 5.8944244384765625, 7.683271408081055, 6, 10.242595672607422, 6, 1.7574005126953125, + 12.36654281616211, 8.788846969604492, 5.211153030395508, 1.633458137512207 + ], + "dims": [1, 4, 2, 2], + "type": "float32" + } + ] + } + ] + } +] diff --git a/js/web/test/suite-test-list.jsonc b/js/web/test/suite-test-list.jsonc index f53da708b8f6f..94592884ccad6 100644 --- a/js/web/test/suite-test-list.jsonc +++ b/js/web/test/suite-test-list.jsonc @@ -257,6 +257,7 @@ "greater.jsonc", //"identity.jsonc", "image-scaler.jsonc", + "instance-norm.jsonc", "less.jsonc", "log.jsonc", "matmul.jsonc", @@ -432,18 +433,18 @@ // // "test_compress_1", // // "test_compress_default_axis", // // "test_compress_negative_axis", - // "test_concat_1d_axis_0", - // "test_concat_1d_axis_negative_1", - // "test_concat_2d_axis_0", - // "test_concat_2d_axis_1", - // "test_concat_2d_axis_negative_1", - // "test_concat_2d_axis_negative_2", - // "test_concat_3d_axis_0", - // "test_concat_3d_axis_1", - // "test_concat_3d_axis_2", - // "test_concat_3d_axis_negative_1", - // "test_concat_3d_axis_negative_2", - // "test_concat_3d_axis_negative_3", + "test_concat_1d_axis_0", + "test_concat_1d_axis_negative_1", + "test_concat_2d_axis_0", + "test_concat_2d_axis_1", + "test_concat_2d_axis_negative_1", + "test_concat_2d_axis_negative_2", + "test_concat_3d_axis_0", + "test_concat_3d_axis_1", + "test_concat_3d_axis_2", + "test_concat_3d_axis_negative_1", + "test_concat_3d_axis_negative_2", + "test_concat_3d_axis_negative_3", "test_conv_with_autopad_same", "test_conv_with_strides_and_asymmetric_padding", "test_conv_with_strides_no_padding", @@ -505,7 +506,7 @@ // // "test_dynamicquantizelinear_min_adjusted_expanded", // // "test_dynamicquantizelinear_min_adjusted", // // "test_dynamicquantizelinear", - // // "test_edge_pad", + "test_edge_pad", // "test_einsum_batch_diagonal", // "test_einsum_batch_matmul", // "test_einsum_inner_prod", @@ -965,7 +966,7 @@ "test_reduce_sum_square_keepdims_random", "test_reduce_sum_square_negative_axes_keepdims_example", "test_reduce_sum_square_negative_axes_keepdims_random", - // // "test_reflect_pad", + "test_reflect_pad", "test_relu", // "test_reshape_allowzero_reordered", "test_reshape_extended_dims", @@ -1308,7 +1309,8 @@ "test_unsqueeze_three_axes", "test_unsqueeze_two_axes", "test_unsqueeze_unsorted_axes", - "test_unsqueeze" + "test_unsqueeze", + "test_wrap_pad" // "test_upsample_nearest", // "test_where_example", // "test_where_long_example", @@ -1330,7 +1332,8 @@ //"and.jsonc", "asin.jsonc", "ceil.jsonc", - //"concat.jsonc", + "concat.jsonc", + "concat_int32.jsonc", "cast.jsonc", "conv.jsonc", "cos.jsonc", @@ -1345,6 +1348,7 @@ "gemm.jsonc", "global-average-pool.jsonc", "greater.jsonc", + "instance-norm.jsonc", "less.jsonc", "log.jsonc", "matmul.jsonc", @@ -1360,8 +1364,8 @@ "reduce-min.jsonc", "relu.jsonc", "gelu.jsonc", - //"pad.jsonc", - //"pad-big.jsonc", + "pad.jsonc", + "pad-big.jsonc", "pow.jsonc", "pow_int32.jsonc", "pow-big-number.jsonc", diff --git a/js/web/test/test-main.ts b/js/web/test/test-main.ts index e614cc8e67e71..49d0ac225be2f 100644 --- a/js/web/test/test-main.ts +++ b/js/web/test/test-main.ts @@ -110,9 +110,9 @@ for (const group of ORT_WEB_TEST_CONFIG.model) { test, ORT_WEB_TEST_CONFIG.profile, ORT_WEB_TEST_CONFIG.options.sessionOptions); }); - after('release session', () => { + after('release session', async () => { if (context) { - context.release(); + await context.release(); } }); diff --git a/js/web/test/test-runner.ts 
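
The expected outputs in the new instance-norm.jsonc cases above follow directly from the InstanceNormalization definition, y = scale * (x - mean) / sqrt(variance + epsilon) + bias, computed per channel over the spatial dimensions. A minimal sketch that reproduces channel 0 of the first case, assuming the ONNX default epsilon of 1e-5:

#include <cmath>
#include <cstdio>

int main() {
  // Channel 0 of the [1,4,2,2] input holds {1,2,3,4}; scale = 1, bias = 4.
  const float x[4] = {1.0f, 2.0f, 3.0f, 4.0f};
  const float scale = 1.0f, bias = 4.0f, epsilon = 1e-5f;

  float mean = 0.0f;
  for (float v : x) mean += v;
  mean /= 4.0f;  // 2.5

  float variance = 0.0f;
  for (float v : x) variance += (v - mean) * (v - mean);
  variance /= 4.0f;  // 1.25 (population variance over the channel)

  // Prints ~2.65836 3.55279 4.44721 5.34164, matching the first four expected values.
  for (float v : x) {
    std::printf("%.5f ", scale * (v - mean) / std::sqrt(variance + epsilon) + bias);
  }
  return 0;
}
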
b/js/web/test/test-runner.ts index 9802f00f7a866..46d80a9f56f35 100644 --- a/js/web/test/test-runner.ts +++ b/js/web/test/test-runner.ts @@ -210,11 +210,12 @@ export class ModelTestContext { Logger.verbose('TestRunner.Perf', '***Perf Data End'); } - release(): void { + async release(): Promise { if (this.profile) { this.session.endProfiling(); } this.logPerfData(); + await this.session.release(); } /** diff --git a/onnxruntime/core/framework/tuning_context.h b/onnxruntime/core/framework/tuning_context.h index b6569a21e4c91..aae70d85814bc 100644 --- a/onnxruntime/core/framework/tuning_context.h +++ b/onnxruntime/core/framework/tuning_context.h @@ -3,6 +3,7 @@ #pragma once +#include #include #include "core/common/common.h" diff --git a/onnxruntime/core/optimizer/qdq_transformer/selectors_actions/qdq_selectors.cc b/onnxruntime/core/optimizer/qdq_transformer/selectors_actions/qdq_selectors.cc index 565afcc67e7df..02a7fb733813c 100644 --- a/onnxruntime/core/optimizer/qdq_transformer/selectors_actions/qdq_selectors.cc +++ b/onnxruntime/core/optimizer/qdq_transformer/selectors_actions/qdq_selectors.cc @@ -330,6 +330,31 @@ bool WhereNodeGroupSelector::Check(const GraphViewer& graph_viewer, const Node& dt_input_1 == dt_output; } +bool PadNodeGroupSelector::Check(const GraphViewer& graph_viewer, const Node& node, + const std::vector& dq_nodes, + const std::vector& q_nodes) const { + // Pad can have 1 or 2 dq input, the optional input constant_value can be quantized or non-quantized. + // QNN supports data input quantized with constant_value input non-quantized. + int num_dq_inputs = static_cast(dq_nodes.size()); + if (num_dq_inputs > 2) { + return false; + } + + if (!CheckQDQNodes(graph_viewer, node, dq_nodes, q_nodes, num_dq_inputs)) { + return false; + } + + const int32_t dt_input_1 = dq_nodes[0]->InputDefs()[0]->TypeAsProto()->tensor_type().elem_type(); + const int32_t dt_output = q_nodes[0]->OutputDefs()[0]->TypeAsProto()->tensor_type().elem_type(); + if (dq_nodes.size() > 1) { + const int32_t dt_input_2 = dq_nodes[1]->InputDefs()[0]->TypeAsProto()->tensor_type().elem_type(); + return dt_input_1 == dt_input_2 && + dt_input_1 == dt_output; + } else { + return dt_input_1 == dt_output; + } +} + bool InstanceAndLayerNormalizationNodeGroupSelector::Check(const GraphViewer& graph_viewer, const Node& node, const std::vector& dq_nodes, diff --git a/onnxruntime/core/optimizer/qdq_transformer/selectors_actions/qdq_selectors.h b/onnxruntime/core/optimizer/qdq_transformer/selectors_actions/qdq_selectors.h index ab9ad45697dfa..58ebf81508962 100644 --- a/onnxruntime/core/optimizer/qdq_transformer/selectors_actions/qdq_selectors.h +++ b/onnxruntime/core/optimizer/qdq_transformer/selectors_actions/qdq_selectors.h @@ -110,6 +110,16 @@ class WhereNodeGroupSelector : public NodeGroupSelector { const std::vector& q_nodes) const override; }; +class PadNodeGroupSelector : public NodeGroupSelector { + public: + PadNodeGroupSelector() = default; + + private: + bool Check(const GraphViewer& graph_viewer, const Node& node, + const std::vector& dq_nodes, + const std::vector& q_nodes) const override; +}; + // 2 DQ nodes for input -> node -> optional Q if QLinearMatMul, MatMulIntegerToFloat if not // The lack of a trailing Q isn't really a QDQ node group, so we default support for that to off. 
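
The PadNodeGroupSelector added above accepts a DQ -> Pad -> Q group with one or two DQ inputs (the optional constant_value input may arrive either quantized or non-quantized) and requires every DQ input to carry the same element type as the Q output. A condensed sketch of that type-consistency rule, using placeholder structs rather than the real GraphViewer/Node types:

#include <vector>

// Placeholder for a node's tensor element type; stands in for the
// TypeAsProto()->tensor_type().elem_type() lookups in the selector.
struct NodeTypeInfo { int elem_type; };

// Mirrors the acceptance rule: 1 or 2 DQ inputs, all matching the Q output type.
bool PadGroupTypesMatch(const std::vector<NodeTypeInfo>& dq_nodes, const NodeTypeInfo& q_node) {
  if (dq_nodes.empty() || dq_nodes.size() > 2) return false;
  for (const NodeTypeInfo& dq : dq_nodes) {
    if (dq.elem_type != q_node.elem_type) return false;
  }
  return true;
}
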
class MatMulNodeGroupSelector : public NodeGroupSelector { diff --git a/onnxruntime/core/optimizer/qdq_transformer/selectors_actions/shared/utils.cc b/onnxruntime/core/optimizer/qdq_transformer/selectors_actions/shared/utils.cc index 7783d3b3f36b7..f1bdd7a99c329 100644 --- a/onnxruntime/core/optimizer/qdq_transformer/selectors_actions/shared/utils.cc +++ b/onnxruntime/core/optimizer/qdq_transformer/selectors_actions/shared/utils.cc @@ -123,6 +123,9 @@ static const OpVersionsAndSelector::OpVersionsMap GetLogicalComparisonOpVersions static const OpVersionsAndSelector::OpVersionsMap GetWhereOpVersionsMap() { return {{"Where", {}}}; } +static const OpVersionsAndSelector::OpVersionsMap GetPadOpVersionsMap() { + return {{"Pad", {}}}; +} /* Selector rules registration related */ void RegisterMiscSelectors(Selectors& qdq_selectors) { @@ -217,6 +220,13 @@ void RegisterWhereSelectors(Selectors& qdq_selectors) { std::move(selector)); } +void RegisterPadSelectors(Selectors& qdq_selectors) { + /* register selectors for Pad ops */ + std::unique_ptr selector = std::make_unique(); + qdq_selectors.RegisterSelector(GetPadOpVersionsMap(), + std::move(selector)); +} + void SelectorManager::CreateSelectors() { RegisterMiscSelectors(qdq_selectors_); RegisterDropDQSelectors(qdq_selectors_); @@ -231,6 +241,7 @@ void SelectorManager::CreateSelectors() { RegisterBatchNormalizationSelector(qdq_selectors_); RegisterLogicalComparisonSelectors(qdq_selectors_); RegisterWhereSelectors(qdq_selectors_); + RegisterPadSelectors(qdq_selectors_); } void SelectorManager::InitializeSelectorsMap() { diff --git a/onnxruntime/core/providers/cpu/nn/string_normalizer.cc b/onnxruntime/core/providers/cpu/nn/string_normalizer.cc index 330b92d4a8f78..27407e999945a 100644 --- a/onnxruntime/core/providers/cpu/nn/string_normalizer.cc +++ b/onnxruntime/core/providers/cpu/nn/string_normalizer.cc @@ -201,7 +201,16 @@ class Utf8Converter { #endif -const std::string default_locale("en_US.UTF-8"); // All non-MS +#if defined(__APPLE__) +#include +#if TARGET_OS_IPHONE || TARGET_OS_SIMULATOR +const std::string default_locale("en-US.UTF-8"); +#else +const std::string default_locale("en_US.UTF-8"); // Other kinds of Apple Platforms including MacOS, etc +#endif +#else +const std::string default_locale("en_US.UTF-8"); // All non-MS and not Apple +#endif #endif // _MSC_VER diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/inc/IWinmlExecutionProvider.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/inc/IWinmlExecutionProvider.h index 232a022d869f4..04381b6ce355c 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/inc/IWinmlExecutionProvider.h +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/inc/IWinmlExecutionProvider.h @@ -80,7 +80,7 @@ namespace Windows::AI::MachineLearning::Adapter // Either nodesAsOperatorDesc or nodesAsIDMLOperator can have non-zero size. 
struct DmlGraphNodeCreateInfo { - uint32_t nodeCount; + uint32_t nodeCount = 0; std::vector> nodesAsOperatorDesc; std::vector> nodesAsIDMLOperator; std::vector inputEdges; diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/BucketizedBufferAllocator.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/BucketizedBufferAllocator.cpp index 5dbea41901b80..c24257071eda5 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/BucketizedBufferAllocator.cpp +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/BucketizedBufferAllocator.cpp @@ -212,15 +212,6 @@ namespace Dml ORT_THROW_HR(E_INVALIDARG); } const auto* allocInfo = static_cast(opaqueHandle); - - auto owner = allocInfo->GetOwner(); - //The owner can be null if the resource was wrapped via CreateGPUAllocationFromD3DResource - if (owner != nullptr && owner != this) - { - // This allocation doesn't belong to this allocator! - ORT_THROW_HR(E_INVALIDARG); - } - return allocInfo; } diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/BucketizedBufferAllocator.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/BucketizedBufferAllocator.h index 4c24cb174f6ed..196fba5d7689d 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/BucketizedBufferAllocator.h +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/BucketizedBufferAllocator.h @@ -83,16 +83,16 @@ namespace Dml std::vector m_pool; size_t m_currentAllocationId = 0; uint64_t m_currentResourceId = 0; - - // Unless specifically requested, allocation sizes are not rounded to enable pooling - // until SetDefaultRoundingMode is called. This should be done at completion of session + + // Unless specifically requested, allocation sizes are not rounded to enable pooling + // until SetDefaultRoundingMode is called. This should be done at completion of session // initialization. 
AllocatorRoundingMode m_defaultRoundingMode = AllocatorRoundingMode::Disabled; std::shared_ptr m_context; std::unique_ptr m_subAllocator; - #if _DEBUG + #ifndef NDEBUG // Useful for debugging; keeps track of all allocations that haven't been freed yet std::map m_outstandingAllocationsById; #endif diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlGraphFusionTransformer.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlGraphFusionTransformer.cpp index a9d19a022d3e7..4813707cdf50c 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlGraphFusionTransformer.cpp +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlGraphFusionTransformer.cpp @@ -38,6 +38,16 @@ namespace Dml bool& modified, int graph_level, const onnxruntime::logging::Logger& logger) const + { + return ApplyImplHelper(graph, modified, graph_level, logger, {}); + } + + onnxruntime::common::Status DmlGraphFusionTransformer::ApplyImplHelper( + onnxruntime::Graph& graph, + bool& modified, + int graph_level, + const onnxruntime::logging::Logger& logger, + const std::unordered_map& implicitInputDefs) const { onnxruntime::ProviderType provider_type = onnxruntime::kDmlExecutionProvider; const gsl::not_null registry = m_providerImpl->GetKernelRegistry().get(); @@ -49,6 +59,30 @@ namespace Dml std::vector> compiledPartitionInfos; std::vector additionalSplittingNodes; + onnxruntime::GraphViewer graph_viewer(graph); + const auto& node_topology_list = graph_viewer.GetNodesInTopologicalOrder(); + + for (auto node_index : node_topology_list) + { + auto* node = graph.GetNode(node_index); + if (!node) + { + continue; // node was removed + } + + std::unordered_map subgraphImplicitInputDefs; + for (const onnxruntime::NodeArg* inputDef : node->ImplicitInputDefs()) + { + subgraphImplicitInputDefs[inputDef->Name()] = inputDef; + } + + for (auto& entry : node->GetAttributeNameToMutableSubgraphMap()) + { + auto& subgraph = *entry.second; + ORT_RETURN_IF_ERROR(ApplyImplHelper(subgraph, modified, graph_level + 1, logger, subgraphImplicitInputDefs)); + } + } + do { // Initializers needed by any graph partition @@ -62,7 +96,8 @@ namespace Dml m_providerImpl->GetSupportedDeviceDataTypeMask(), graphNodePropertyMap, requiredInitializerMap, - additionalSplittingNodes); + additionalSplittingNodes, + implicitInputDefs); // Reset the splitting nodes for the current iteration additionalSplittingNodes.clear(); diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlGraphFusionTransformer.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlGraphFusionTransformer.h index b546f29f59719..19dab0c89943c 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlGraphFusionTransformer.h +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlGraphFusionTransformer.h @@ -2,32 +2,41 @@ // Licensed under the MIT License. 
#pragma once - +#include +#include #include "core/optimizer/graph_transformer.h" #include "core/framework/execution_providers.h" namespace Dml { - class ExecutionProviderImpl; +class ExecutionProviderImpl; + +class DmlGraphFusionTransformer : public onnxruntime::GraphTransformer +{ +public: + DmlGraphFusionTransformer( + const std::string& name, + const onnxruntime::IExecutionProvider* provider + ); + +public: + static inline const char* const DML_GRAPH_FUSION_NODE_NAME_PREFIX = "DmlFusedNode_"; + static inline const char* const DML_GRAPH_FUSION_NODE_DOMAIN = "DmlFusedNodeDomain"; - class DmlGraphFusionTransformer : public onnxruntime::GraphTransformer - { - public: - DmlGraphFusionTransformer( - const std::string& name, - const onnxruntime::IExecutionProvider* provider - ); +private: + onnxruntime::common::Status ApplyImpl(onnxruntime::Graph& graph, + bool& modified, + int graph_level, + const onnxruntime::logging::Logger& logger) const final; - public: - inline const static char* const DML_GRAPH_FUSION_NODE_NAME_PREFIX = "DmlFusedNode_"; - inline const static char* const DML_GRAPH_FUSION_NODE_DOMAIN = "DmlFusedNodeDomain"; + onnxruntime::common::Status ApplyImplHelper( + onnxruntime::Graph& graph, + bool& modified, + int graph_level, + const onnxruntime::logging::Logger& logger, + const std::unordered_map& implicitInputDefs) const; - private: - onnxruntime::common::Status ApplyImpl(onnxruntime::Graph& graph, - bool& modified, - int graph_level, - const onnxruntime::logging::Logger& logger) const final; - private: - const ExecutionProviderImpl* m_providerImpl = nullptr; - }; +private: + const ExecutionProviderImpl* m_providerImpl = nullptr; +}; } diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/GraphPartitioner.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/GraphPartitioner.cpp index 2c8d4e4459f7f..18943878ccedc 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/GraphPartitioner.cpp +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/GraphPartitioner.cpp @@ -345,13 +345,8 @@ namespace Dml // Whether any operator in the model contains a subgraph. This is true // if the graph being partitioned is itself within a subgraph, or contains // an operator with a subgraph. - bool ModelUsesSubgraph(const onnxruntime::GraphViewer& graph) + bool ContainsSubgraph(const onnxruntime::GraphViewer& graph) { - if (graph.IsSubgraph()) - { - return true; - } - const std::vector& toplogicalOrder = graph.GetNodesInTopologicalOrder(); for (size_t nodeIndex : toplogicalOrder) @@ -384,7 +379,8 @@ namespace Dml uint32_t supportedDeviceDataTypeMask, // Each bit corresponds to each DML_TENSOR_DATA_TYPE. std::unordered_map& graphNodePropertyMap, std::unordered_set& requiredInitializerMap, - gsl::span additionalSplittingNodes) + gsl::span additionalSplittingNodes, + const std::unordered_map& implicitInputs) { // Nodes are uniquely identified by the name of their first output argument std::vector> partitions; @@ -419,7 +415,7 @@ namespace Dml } // Check whether this graph is a subgraph, or contains any node with a subgraph. - bool modelUsesSubgraph = ModelUsesSubgraph(graph); + bool containsSubgraph = ContainsSubgraph(graph); uint32_t splittingNodeIndex = 0; @@ -454,10 +450,10 @@ namespace Dml // Add a unique partition if graph node usage is not supported. // // Partitioning is disabled in models with subgraphs to work around issues with implicit inputs. - // The partitioning algorithm does not currently consider such inputs. 
Transfering shared initializers + // The partitioning algorithm does not currently consider such inputs. Transferring shared initializers // for partitions could also cause problems. Note, operators with subgraphs are currently not efficient // anyhow due to CPU/GPU copies. - if (modelUsesSubgraph || !isDmlGraphNode) + if (containsSubgraph || !isDmlGraphNode) { partitions.push_back(CreatePartitionAndFinalizeInputs(node, isDmlNode, false, nodeNameToPartitionMap)); continue; @@ -505,7 +501,7 @@ namespace Dml firstNonFinalInputPartition->AddInput(arg->Name()); } - if (graphInputs.find(arg->Name()) != graphInputs.end()) + if (graphInputs.find(arg->Name()) != graphInputs.end() || implicitInputs.find(arg->Name()) != implicitInputs.end()) { firstNonFinalInputPartition->AddInput(arg->Name()); } diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/GraphPartitioner.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/GraphPartitioner.h index 990ba00fc4672..37d577f647fb5 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/GraphPartitioner.h +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/GraphPartitioner.h @@ -3,6 +3,8 @@ #pragma once +#include +#include #include "core/providers/dml/DmlExecutionProvider/src/GraphDescBuilder.h" namespace Dml @@ -48,5 +50,6 @@ namespace Dml uint32_t supportedDeviceDataTypeMask, // Each bit corresponds to each DML_TENSOR_DATA_TYPE. std::unordered_map& graphNodePropertyMap, std::unordered_set& requiredInitializerMap, - gsl::span additionalSplittingNodes); + gsl::span additionalSplittingNodes, + const std::unordered_map& implicitInputs); } // namespace Dml diff --git a/onnxruntime/core/providers/dml/OperatorAuthorHelper/MLOperatorAuthorHelper.h b/onnxruntime/core/providers/dml/OperatorAuthorHelper/MLOperatorAuthorHelper.h index e9c63cc72a837..f94270cfadb8b 100644 --- a/onnxruntime/core/providers/dml/OperatorAuthorHelper/MLOperatorAuthorHelper.h +++ b/onnxruntime/core/providers/dml/OperatorAuthorHelper/MLOperatorAuthorHelper.h @@ -5,6 +5,7 @@ #include "core/providers/dml/DmlExecutionProvider/inc/MLOperatorAuthor.h" #include "MLOperatorAuthorPrivate.h" +#include "core/common/gsl.h" #ifdef ORT_NO_EXCEPTIONS #define ML_CHECK_BOOL(x) ORT_THROW_HR_IF(E_INVALIDARG, !(x)) diff --git a/onnxruntime/core/providers/dml/dml_provider_factory.cc b/onnxruntime/core/providers/dml/dml_provider_factory.cc index a46f820c6207f..fde61e73c2124 100644 --- a/onnxruntime/core/providers/dml/dml_provider_factory.cc +++ b/onnxruntime/core/providers/dml/dml_provider_factory.cc @@ -128,21 +128,13 @@ Microsoft::WRL::ComPtr DMLProviderFactoryCreator::CreateD3D12Devic return d3d12_device; } -std::shared_ptr DMLProviderFactoryCreator::Create(int device_id, bool skip_software_device_check) { - ComPtr d3d12_device = CreateD3D12Device(device_id, skip_software_device_check); - - D3D12_COMMAND_QUEUE_DESC cmd_queue_desc = {}; - cmd_queue_desc.Type = D3D12_COMMAND_LIST_TYPE_DIRECT; - cmd_queue_desc.Flags = D3D12_COMMAND_QUEUE_FLAG_DISABLE_GPU_TIMEOUT; - - ComPtr cmd_queue; - ORT_THROW_IF_FAILED(d3d12_device->CreateCommandQueue(&cmd_queue_desc, IID_GRAPHICS_PPV_ARGS(cmd_queue.ReleaseAndGetAddressOf()))); - +Microsoft::WRL::ComPtr DMLProviderFactoryCreator::CreateDMLDevice(ID3D12Device* d3d12_device) +{ DML_CREATE_DEVICE_FLAGS flags = DML_CREATE_DEVICE_FLAG_NONE; // In debug builds, enable the DML debug layer if the D3D12 debug layer is also enabled #if _DEBUG && !_GAMING_XBOX - ComPtr debug_device; + Microsoft::WRL::ComPtr debug_device; 
(void)d3d12_device->QueryInterface(IID_PPV_ARGS(&debug_device)); // ignore failure const bool is_d3d12_debug_layer_enabled = (debug_device != nullptr); @@ -151,12 +143,27 @@ std::shared_ptr DMLProviderFactoryCreator::Create(int } #endif - ComPtr dml_device; - ORT_THROW_IF_FAILED(DMLCreateDevice1(d3d12_device.Get(), - flags, - DML_FEATURE_LEVEL_5_0, - IID_PPV_ARGS(&dml_device))); + Microsoft::WRL::ComPtr dml_device; + ORT_THROW_IF_FAILED(DMLCreateDevice1( + d3d12_device, + flags, + DML_FEATURE_LEVEL_5_0, + IID_PPV_ARGS(&dml_device))); + + return dml_device; +} + +std::shared_ptr DMLProviderFactoryCreator::Create(int device_id, bool skip_software_device_check) { + ComPtr d3d12_device = CreateD3D12Device(device_id, skip_software_device_check); + + D3D12_COMMAND_QUEUE_DESC cmd_queue_desc = {}; + cmd_queue_desc.Type = D3D12_COMMAND_LIST_TYPE_DIRECT; + cmd_queue_desc.Flags = D3D12_COMMAND_QUEUE_FLAG_DISABLE_GPU_TIMEOUT; + + ComPtr cmd_queue; + ORT_THROW_IF_FAILED(d3d12_device->CreateCommandQueue(&cmd_queue_desc, IID_GRAPHICS_PPV_ARGS(cmd_queue.ReleaseAndGetAddressOf()))); + auto dml_device = CreateDMLDevice(d3d12_device.Get()); return CreateExecutionProviderFactory_DML(dml_device.Get(), cmd_queue.Get()); } diff --git a/onnxruntime/core/providers/dml/dml_provider_factory_creator.h b/onnxruntime/core/providers/dml/dml_provider_factory_creator.h index b1c9bb3f6f679..574f4410fe3e3 100644 --- a/onnxruntime/core/providers/dml/dml_provider_factory_creator.h +++ b/onnxruntime/core/providers/dml/dml_provider_factory_creator.h @@ -16,5 +16,6 @@ struct DMLProviderFactoryCreator { static std::shared_ptr Create(int device_id); static std::shared_ptr Create(int device_id, bool skip_software_device_check); static Microsoft::WRL::ComPtr CreateD3D12Device(int device_id, bool skip_software_device_check); + static Microsoft::WRL::ComPtr CreateDMLDevice(ID3D12Device* d3d12_device); }; } // namespace onnxruntime diff --git a/onnxruntime/core/providers/js/js_execution_provider.cc b/onnxruntime/core/providers/js/js_execution_provider.cc index 00998b8559e64..266aba45f6639 100644 --- a/onnxruntime/core/providers/js/js_execution_provider.cc +++ b/onnxruntime/core/providers/js/js_execution_provider.cc @@ -321,6 +321,12 @@ class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 6 class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 12, float, Einsum); +class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 2, 10, Pad); +class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 11, 12, Pad); +class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 13, 17, Pad); +class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 18, 18, Pad); +class ONNX_OPERATOR_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 19, Pad); + std::unique_ptr RegisterKernels() { auto kernel_registry = std::make_unique(); @@ -577,6 +583,12 @@ std::unique_ptr RegisterKernels() { BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + }; for (auto& function_table_entry : function_table) { diff --git a/onnxruntime/core/providers/js/operators/concat.cc b/onnxruntime/core/providers/js/operators/concat.cc index 7d50d78c82851..3a6a7e1cafd7a 100644 --- a/onnxruntime/core/providers/js/operators/concat.cc +++ b/onnxruntime/core/providers/js/operators/concat.cc @@ -12,7 +12,8 @@ ONNX_OPERATOR_VERSIONED_KERNEL_EX( 1, 3, 
kJsExecutionProvider, (*KernelDefBuilder::Create()) - .TypeConstraint("T", DataTypeImpl::GetTensorType()), + .TypeConstraint("T", {DataTypeImpl::GetTensorType(), + DataTypeImpl::GetTensorType()}), Concat); ONNX_OPERATOR_VERSIONED_KERNEL_EX( @@ -21,7 +22,8 @@ ONNX_OPERATOR_VERSIONED_KERNEL_EX( 4, 10, kJsExecutionProvider, (*KernelDefBuilder::Create()) - .TypeConstraint("T", DataTypeImpl::GetTensorType()), + .TypeConstraint("T", {DataTypeImpl::GetTensorType(), + DataTypeImpl::GetTensorType()}), Concat); ONNX_OPERATOR_VERSIONED_KERNEL_EX( @@ -30,7 +32,8 @@ ONNX_OPERATOR_VERSIONED_KERNEL_EX( 11, 12, kJsExecutionProvider, (*KernelDefBuilder::Create()) - .TypeConstraint("T", DataTypeImpl::GetTensorType()), + .TypeConstraint("T", {DataTypeImpl::GetTensorType(), + DataTypeImpl::GetTensorType()}), Concat); ONNX_OPERATOR_KERNEL_EX( @@ -39,7 +42,8 @@ ONNX_OPERATOR_KERNEL_EX( 13, kJsExecutionProvider, (*KernelDefBuilder::Create()) - .TypeConstraint("T", DataTypeImpl::GetTensorType()), + .TypeConstraint("T", {DataTypeImpl::GetTensorType(), + DataTypeImpl::GetTensorType()}), Concat); } // namespace js diff --git a/onnxruntime/core/providers/js/operators/pad.cc b/onnxruntime/core/providers/js/operators/pad.cc new file mode 100644 index 0000000000000..24ba85cbf6e0d --- /dev/null +++ b/onnxruntime/core/providers/js/operators/pad.cc @@ -0,0 +1,72 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#include "core/providers/js/js_kernel.h" + +#include "pad.h" + +namespace onnxruntime { +namespace js { + +ONNX_OPERATOR_VERSIONED_KERNEL_EX( + Pad, + kOnnxDomain, + 2, + 10, + kJsExecutionProvider, + (*KernelDefBuilder::Create()).TypeConstraint("T", DataTypeImpl::GetTensorType()), + Pad); + +ONNX_OPERATOR_VERSIONED_KERNEL_EX( + Pad, + kOnnxDomain, + 11, + 12, + kJsExecutionProvider, + (*KernelDefBuilder::Create()) + .TypeConstraint("T", DataTypeImpl::GetTensorType()) + .InputMemoryType(OrtMemTypeCPU, 1) + .InputMemoryType(OrtMemTypeCPU, 2) + .InputMemoryType(OrtMemTypeCPU, 3), + Pad); + +ONNX_OPERATOR_VERSIONED_KERNEL_EX( + Pad, + kOnnxDomain, + 13, + 17, + kJsExecutionProvider, + (*KernelDefBuilder::Create()) + .TypeConstraint("T", DataTypeImpl::GetTensorType()) + .InputMemoryType(OrtMemTypeCPU, 1) + .InputMemoryType(OrtMemTypeCPU, 2) + .InputMemoryType(OrtMemTypeCPU, 3), + Pad); + +ONNX_OPERATOR_VERSIONED_KERNEL_EX( + Pad, + kOnnxDomain, + 18, + 18, + kJsExecutionProvider, + (*KernelDefBuilder::Create()) + .TypeConstraint("T", DataTypeImpl::GetTensorType()) + .InputMemoryType(OrtMemTypeCPU, 1) + .InputMemoryType(OrtMemTypeCPU, 2) + .InputMemoryType(OrtMemTypeCPU, 3), + Pad); + +ONNX_OPERATOR_KERNEL_EX( + Pad, + kOnnxDomain, + 19, + kJsExecutionProvider, + (*KernelDefBuilder::Create()) + .TypeConstraint("T", DataTypeImpl::GetTensorType()) + .InputMemoryType(OrtMemTypeCPU, 1) + .InputMemoryType(OrtMemTypeCPU, 2) + .InputMemoryType(OrtMemTypeCPU, 3), + Pad); + +} // namespace js +} // namespace onnxruntime diff --git a/onnxruntime/core/providers/js/operators/pad.h b/onnxruntime/core/providers/js/operators/pad.h new file mode 100644 index 0000000000000..19168f40b4722 --- /dev/null +++ b/onnxruntime/core/providers/js/operators/pad.h @@ -0,0 +1,34 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. 
+ +#pragma once + +#include "core/providers/js/js_kernel.h" +#include "core/providers/cpu/tensor/padbase.h" + +namespace onnxruntime { +namespace js { + +class Pad : public JsKernel, public PadBase { + public: + explicit Pad(const OpKernelInfo& info) : JsKernel(info), PadBase(info) { + std::vector pads; + if (!is_dynamic_) { + pads.resize(pads_.size()); + for (size_t i = 0; i < pads_.size(); ++i) { + pads[i] = gsl::narrow_cast(pads_[i]); + } + } + + JSEP_INIT_KERNEL_ATTRIBUTE(Pad, ({"mode" : $1, + "value" : $2, + "pads" : $3 ? Array.from(HEAP32.subarray($4, $4 + $3)) : []}), + static_cast(mode_), + static_cast(value_), + gsl::narrow_cast(pads.size()), + reinterpret_cast((pads.size() > 0) ? pads.data() : nullptr) >> 2); + } +}; + +} // namespace js +} // namespace onnxruntime diff --git a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/helper.h b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/helper.h index 421c55a2c91a8..766034b3decea 100644 --- a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/helper.h +++ b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/helper.h @@ -8,6 +8,7 @@ #include "core/common/inlined_containers.h" #include "core/graph/basic_types.h" #include "core/providers/nnapi/nnapi_builtin/nnapi_lib/NeuralNetworksTypes.h" +#include "core/common/gsl.h" // This is the minimal Android API Level required by ORT NNAPI EP to run // ORT running on any host system with Android API level less than this will fall back to CPU EP diff --git a/onnxruntime/core/providers/qnn/builder/op_builder_factory.cc b/onnxruntime/core/providers/qnn/builder/op_builder_factory.cc index 58ac3ad45a577..fc8c2efc7a80f 100644 --- a/onnxruntime/core/providers/qnn/builder/op_builder_factory.cc +++ b/onnxruntime/core/providers/qnn/builder/op_builder_factory.cc @@ -154,6 +154,10 @@ OpBuilderRegistrations::OpBuilderRegistrations() { { CreateTransposeOpBuilder("Transpose", *this); } + + { + CreatePadOpBuilder("Pad", *this); + } } const IOpBuilder* GetOpBuilder(const std::string& onnx_op_type) { diff --git a/onnxruntime/core/providers/qnn/builder/op_builder_factory.h b/onnxruntime/core/providers/qnn/builder/op_builder_factory.h index 36cf0e7ff5ac0..5d59f4343d773 100644 --- a/onnxruntime/core/providers/qnn/builder/op_builder_factory.h +++ b/onnxruntime/core/providers/qnn/builder/op_builder_factory.h @@ -88,5 +88,7 @@ void CreateLRNOpBuilder(const std::string& op_type, OpBuilderRegistrations& op_r void CreateTransposeOpBuilder(const std::string& op_type, OpBuilderRegistrations& op_registrations); +void CreatePadOpBuilder(const std::string& op_type, OpBuilderRegistrations& op_registrations); + } // namespace qnn } // namespace onnxruntime diff --git a/onnxruntime/core/providers/qnn/builder/opbuilder/base_op_builder.h b/onnxruntime/core/providers/qnn/builder/opbuilder/base_op_builder.h index 14d5e45799b81..0431d605bc843 100644 --- a/onnxruntime/core/providers/qnn/builder/opbuilder/base_op_builder.h +++ b/onnxruntime/core/providers/qnn/builder/opbuilder/base_op_builder.h @@ -162,7 +162,9 @@ class BaseOpBuilder : public IOpBuilder { {"BatchNormalization", QNN_OP_BATCHNORM}, {"LayerNormalization", QNN_OP_LAYER_NORM}, - {"LRN", QNN_OP_LRN}}; + {"LRN", QNN_OP_LRN}, + + {"Pad", QNN_OP_PAD}}; auto it = onnx_op_type_to_qnn_op_type.find(onnx_op_type); ORT_ENFORCE(it != onnx_op_type_to_qnn_op_type.end()); return it->second; diff --git a/onnxruntime/core/providers/qnn/builder/opbuilder/gather_op_builder.cc b/onnxruntime/core/providers/qnn/builder/opbuilder/gather_op_builder.cc index 
bd07c099b3cfe..e203667576447 100644 --- a/onnxruntime/core/providers/qnn/builder/opbuilder/gather_op_builder.cc +++ b/onnxruntime/core/providers/qnn/builder/opbuilder/gather_op_builder.cc @@ -13,7 +13,6 @@ namespace onnxruntime { namespace qnn { -// Operator which only need to hanle node inputs & outputs, no attributes or no need to handle attributes class GatherOpBuilder : public BaseOpBuilder { public: GatherOpBuilder() : BaseOpBuilder("GatherOpBuilder") {} diff --git a/onnxruntime/core/providers/qnn/builder/opbuilder/pad_op_builder.cc b/onnxruntime/core/providers/qnn/builder/opbuilder/pad_op_builder.cc new file mode 100644 index 0000000000000..2dfdfffe5fa54 --- /dev/null +++ b/onnxruntime/core/providers/qnn/builder/opbuilder/pad_op_builder.cc @@ -0,0 +1,247 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#include "core/providers/common.h" +#include "core/providers/shared/utils/utils.h" +#include "core/providers/qnn/builder/qnn_model_wrapper.h" +#include "core/providers/qnn/builder/op_builder_factory.h" +#include "core/providers/cpu/tensor/slice_helper.h" +#include "core/providers/qnn/builder/op_builder_factory.h" +#include "core/common/safeint.h" + +#include "core/providers/qnn/builder/opbuilder/base_op_builder.h" + +namespace onnxruntime { +namespace qnn { +class PadOpBuilder : public BaseOpBuilder { + public: + PadOpBuilder() : BaseOpBuilder("PadOpBuilder") {} + ORT_DISALLOW_COPY_ASSIGNMENT_AND_MOVE(PadOpBuilder); + + protected: + Status ProcessInputs(QnnModelWrapper& qnn_model_wrapper, + const NodeUnit& node_unit, + const logging::Logger& logger, + std::vector& input_names, + bool do_op_validation) const override ORT_MUST_USE_RESULT; + + Status ProcessAttributesAndOutputs(QnnModelWrapper& qnn_model_wrapper, + const NodeUnit& node_unit, + std::vector&& input_names, + const logging::Logger& logger, + bool do_op_validation) const override ORT_MUST_USE_RESULT; +}; + +Status PadOpBuilder::ProcessInputs(QnnModelWrapper& qnn_model_wrapper, + const NodeUnit& node_unit, + const logging::Logger& logger, + std::vector& input_names, + bool do_op_validation) const { + const auto& inputs = node_unit.Inputs(); + // QNN Pad only has 1 input, the pads input & constant_value input need to be initializer and set as Qnn node parameter, axes input is not supported. 
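
The constant_value handling further down recovers a float from a quantized scalar with the usual affine dequantization, real = (quantized - offset) * scale, which is what the DequantizeValue helper defined below computes. A short self-contained sketch with arbitrarily chosen illustration values:

#include <cstdint>
#include <cstdio>

// Affine dequantization: real = (quantized - offset) * scale.
static float Dequantize(int32_t quantized, int32_t offset, float scale) {
  return static_cast<float>(quantized - offset) * scale;
}

int main() {
  // e.g. a stored int8 value of 12 with offset -10 and scale 0.5 dequantizes to 11.0
  std::printf("%f\n", Dequantize(12, -10, 0.5f));
  return 0;
}
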
+ if (do_op_validation) { + ORT_RETURN_IF(inputs.size() > 3, "QNN Pad doesn't support axes."); + ORT_RETURN_IF(inputs.size() < 2, "QNN Pad requires the pads input."); + + std::vector input_shape; + ORT_RETURN_IF_NOT(qnn_model_wrapper.GetOnnxShape(inputs[0].node_arg, input_shape), "Cannot get shape of input 0."); + ORT_RETURN_IF(input_shape.size() > 5, "QNN Pad doesn't support more than 5 dimension"); + + auto& pads_input_name = inputs[1].node_arg.Name(); + ORT_RETURN_IF_NOT(qnn_model_wrapper.IsInitializerInput(pads_input_name), + "Qnn doesn't support dynamic pad input"); + if (node_unit.Inputs().size() > 2) { + auto& constant_value_input_name = inputs[2].node_arg.Name(); + ORT_RETURN_IF_NOT(qnn_model_wrapper.IsInitializerInput(constant_value_input_name), + "Qnn doesn't support dynamic constant_value input"); + } + } + + ORT_RETURN_IF_ERROR(ProcessInput(qnn_model_wrapper, inputs[0], logger, input_names)); + + return Status::OK(); +} + +template +float DequantizeValue(T value, int32_t offset, float scale) { + return static_cast(static_cast(value) - offset) * scale; +} + +Status ProcessConstantValue(QnnModelWrapper& qnn_model_wrapper, + std::vector& param_tensor_names, + const NodeUnit& node_unit, + const NodeUnitIODef& input) { + OnnxInputInfo input_info = {}; + ORT_RETURN_IF_ERROR(qnn_model_wrapper.GetOnnxInputInfo(input, input_info)); + std::vector unpacked_tensor; + // Already confirmed constant_value input is initializer in ProcessInputs() + ORT_RETURN_IF_ERROR(qnn_model_wrapper.UnpackInitializerData(*input_info.initializer_tensor, unpacked_tensor)); + Qnn_Scalar_t constant_value_qnn_scalar = QNN_SCALAR_INIT; + // constant_value is quantized + if (input.quant_param.has_value()) { + // QNN prefers pad_constant_value quantized with quantization params same as in[0], and data stored as 32-bit signed integer + // Onnx doesn't guarantee it has same quantization parameter as in[0], so get back the float32 value and use non-quantized data directly + constant_value_qnn_scalar.dataType = QNN_DATATYPE_FLOAT_32; + float constant_value = 0; + switch (input_info.qnn_data_type) { + case QNN_DATATYPE_SFIXED_POINT_8: { + auto int8_span = ReinterpretAsSpan(gsl::make_span(unpacked_tensor)); + constant_value = DequantizeValue(int8_span.data()[0], + input_info.quant_param.scaleOffsetEncoding.offset, + input_info.quant_param.scaleOffsetEncoding.scale); + break; + } + case QNN_DATATYPE_SFIXED_POINT_16: { + auto int16_span = ReinterpretAsSpan(gsl::make_span(unpacked_tensor)); + constant_value = DequantizeValue(int16_span.data()[0], + input_info.quant_param.scaleOffsetEncoding.offset, + input_info.quant_param.scaleOffsetEncoding.scale); + break; + } + case QNN_DATATYPE_SFIXED_POINT_32: { + auto int32_span = ReinterpretAsSpan(gsl::make_span(unpacked_tensor)); + constant_value = DequantizeValue(int32_span.data()[0], + input_info.quant_param.scaleOffsetEncoding.offset, + input_info.quant_param.scaleOffsetEncoding.scale); + break; + } + case QNN_DATATYPE_UFIXED_POINT_8: { + constant_value = DequantizeValue(unpacked_tensor.data()[0], + input_info.quant_param.scaleOffsetEncoding.offset, + input_info.quant_param.scaleOffsetEncoding.scale); + break; + } + case QNN_DATATYPE_UFIXED_POINT_16: { + auto uint16_span = ReinterpretAsSpan(gsl::make_span(unpacked_tensor)); + constant_value = DequantizeValue(uint16_span.data()[0], + input_info.quant_param.scaleOffsetEncoding.offset, + input_info.quant_param.scaleOffsetEncoding.scale); + break; + } + case QNN_DATATYPE_UFIXED_POINT_32: { + auto uint32_span = 
ReinterpretAsSpan(gsl::make_span(unpacked_tensor)); + constant_value = DequantizeValue(uint32_span.data()[0], + input_info.quant_param.scaleOffsetEncoding.offset, + input_info.quant_param.scaleOffsetEncoding.scale); + break; + } + default: + return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "Type not supported for Pad constant_value."); + } + constant_value_qnn_scalar.floatValue = constant_value; + } else { // constant_value is non-quantized + constant_value_qnn_scalar.dataType = input_info.qnn_data_type; + switch (input_info.qnn_data_type) { + case QNN_DATATYPE_UINT_8: { + constant_value_qnn_scalar.uint8Value = unpacked_tensor.data()[0]; + break; + } + case QNN_DATATYPE_INT_8: { + auto int8_span = ReinterpretAsSpan(gsl::make_span(unpacked_tensor)); + constant_value_qnn_scalar.int8Value = int8_span.data()[0]; + break; + } + case QNN_DATATYPE_INT_16: { + auto int16_span = ReinterpretAsSpan(gsl::make_span(unpacked_tensor)); + constant_value_qnn_scalar.int16Value = int16_span.data()[0]; + break; + } + case QNN_DATATYPE_INT_32: { + auto int32_span = ReinterpretAsSpan(gsl::make_span(unpacked_tensor)); + constant_value_qnn_scalar.int32Value = int32_span.data()[0]; + break; + } + case QNN_DATATYPE_INT_64: { + auto int64_span = ReinterpretAsSpan(gsl::make_span(unpacked_tensor)); + constant_value_qnn_scalar.int64Value = int64_span.data()[0]; + break; + } + case QNN_DATATYPE_FLOAT_32: { + auto float_span = ReinterpretAsSpan(gsl::make_span(unpacked_tensor)); + constant_value_qnn_scalar.floatValue = float_span.data()[0]; + break; + } + default: + return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "Type not supported."); + } // switch + } // if-else + + QnnParamWrapper constant_value_param(node_unit.Index(), + node_unit.Name(), + QNN_OP_PAD_PARAM_PAD_CONSTANT_VALUE, + constant_value_qnn_scalar); + param_tensor_names.push_back(constant_value_param.GetParamTensorName()); + qnn_model_wrapper.AddParamWrapper(std::move(constant_value_param)); + + return Status::OK(); +} + +Status PadOpBuilder::ProcessAttributesAndOutputs(QnnModelWrapper& qnn_model_wrapper, + const NodeUnit& node_unit, + std::vector&& input_names, + const logging::Logger& logger, + bool do_op_validation) const { + std::vector param_tensor_names; + // Process pads input + // Already confirmed pads input is initializer in ProcessInputs() + const auto& inputs = node_unit.Inputs(); + const auto& pads_input_name = inputs[1].node_arg.Name(); + + std::vector unpacked_tensor; + const auto& input_tensor = qnn_model_wrapper.GetInitializerTensors().at(pads_input_name); + ORT_RETURN_IF_ERROR(qnn_model_wrapper.UnpackInitializerData(*input_tensor, unpacked_tensor)); + // Onnx Pads are int64, Qnn use uint32 + const int64_t* tensor_data = reinterpret_cast(unpacked_tensor.data()); + size_t tensor_byte_size = unpacked_tensor.size(); + size_t size = tensor_byte_size / sizeof(int64_t); + + std::vector pad_amount; + std::transform(tensor_data, tensor_data + size, std::back_inserter(pad_amount), + [](int64_t item) { return SafeInt(item); }); + // Onnx format is begin_0, begin_1, ..., end_0, end_1, ... + // Qnn format is begin_0, end_0, begin_1, end_1, ... 
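
The call to ReArranagePads just below converts the ONNX pads layout, [begin_0, begin_1, ..., end_0, end_1, ...], into the per-dimension {begin, end} pairs QNN expects, [begin_0, end_0, begin_1, end_1, ...]. A minimal sketch of that interleave (not the actual helper):

#include <cstdint>
#include <vector>

// Interleave ONNX pads (all begins, then all ends) into per-dimension pairs,
// e.g. {b0, b1, e0, e1} -> {b0, e0, b1, e1}.
std::vector<uint32_t> InterleavePads(const std::vector<uint32_t>& onnx_pads) {
  const size_t rank = onnx_pads.size() / 2;
  std::vector<uint32_t> qnn_pads;
  qnn_pads.reserve(onnx_pads.size());
  for (size_t i = 0; i < rank; ++i) {
    qnn_pads.push_back(onnx_pads[i]);         // begin_i
    qnn_pads.push_back(onnx_pads[rank + i]);  // end_i
  }
  return qnn_pads;
}
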
+ ReArranagePads(pad_amount); + + std::vector pad_amount_dim{static_cast(pad_amount.size() / 2), static_cast(2)}; + QnnParamWrapper multiples_param(node_unit.Index(), node_unit.Name(), QNN_OP_PAD_PARAM_PAD_AMOUNT, std::move(pad_amount_dim), + std::move(pad_amount)); + param_tensor_names.push_back(multiples_param.GetParamTensorName()); + qnn_model_wrapper.AddParamWrapper(std::move(multiples_param)); + + // Process optional input constant_value + if (node_unit.Inputs().size() > 2) { + ORT_RETURN_IF_ERROR(ProcessConstantValue(qnn_model_wrapper, param_tensor_names, node_unit, inputs[2])); + } // constant_value + + NodeAttrHelper node_helper(node_unit); + std::string mode = node_helper.Get("mode", "constant"); + Qnn_Scalar_t mode_qnn_scalar = QNN_SCALAR_INIT; + mode_qnn_scalar.dataType = QNN_DATATYPE_UINT_32; + if ("constant" == mode) { + mode_qnn_scalar.uint32Value = QNN_OP_PAD_SCHEME_CONSTANT; + } else if ("reflect" == mode) { + mode_qnn_scalar.uint32Value = QNN_OP_PAD_SCHEME_MIRROR_REFLECT; + } else if ("edge" == mode) { + mode_qnn_scalar.uint32Value = QNN_OP_PAD_SCHEME_EDGE; + } else { + return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "Pad mode only support constant."); + } + + QnnParamWrapper mode_param(node_unit.Index(), node_unit.Name(), QNN_OP_PAD_PARAM_SCHEME, mode_qnn_scalar); + param_tensor_names.push_back(mode_param.GetParamTensorName()); + qnn_model_wrapper.AddParamWrapper(std::move(mode_param)); + + ORT_RETURN_IF_ERROR(ProcessOutputs(qnn_model_wrapper, node_unit, + std::move(input_names), + std::move(param_tensor_names), + logger, do_op_validation, GetQnnOpType(node_unit.OpType()))); + + return Status::OK(); +} + +void CreatePadOpBuilder(const std::string& op_type, OpBuilderRegistrations& op_registrations) { + op_registrations.AddOpBuilder(op_type, std::make_unique()); +} + +} // namespace qnn +} // namespace onnxruntime diff --git a/onnxruntime/core/providers/qnn/builder/opbuilder/simple_op_builder.cc b/onnxruntime/core/providers/qnn/builder/opbuilder/simple_op_builder.cc index 8abb847b20b46..556a86bb1519b 100644 --- a/onnxruntime/core/providers/qnn/builder/opbuilder/simple_op_builder.cc +++ b/onnxruntime/core/providers/qnn/builder/opbuilder/simple_op_builder.cc @@ -118,9 +118,9 @@ Status ProcessModeAttribute(QnnModelWrapper& qnn_model_wrapper, Qnn_Scalar_t mode_qnn_scalar = QNN_SCALAR_INIT; mode_qnn_scalar.dataType = QNN_DATATYPE_UINT_32; if ("DCR" == mode) { - mode_qnn_scalar.uint32Value = 0; + mode_qnn_scalar.uint32Value = QNN_OP_DEPTH_TO_SPACE_MODE_DCR; } else if ("CRD" == mode) { - mode_qnn_scalar.uint32Value = 1; // CRD mode + mode_qnn_scalar.uint32Value = QNN_OP_DEPTH_TO_SPACE_MODE_CRD; // CRD mode } else { return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "DepthToSpace mode only support DCR & CRD."); } diff --git a/onnxruntime/core/providers/qnn/builder/qnn_model_wrapper.h b/onnxruntime/core/providers/qnn/builder/qnn_model_wrapper.h index 1f54bda9107c7..22f8d3a0eaa64 100644 --- a/onnxruntime/core/providers/qnn/builder/qnn_model_wrapper.h +++ b/onnxruntime/core/providers/qnn/builder/qnn_model_wrapper.h @@ -117,6 +117,7 @@ class QnnModelWrapper { return input_index_map_.find(tensor_name) != input_index_map_.end(); } + // TODO(hecli) rename to GetTensorInfo Status GetOnnxInputInfo(const NodeUnitIODef& input, OnnxInputInfo& input_info) const; Status AddReshapeNode(const std::string& input_name, diff --git a/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.cc b/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.cc index e90417a6d14fc..96893f63b4540 100644 --- 
a/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.cc +++ b/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.cc @@ -2433,6 +2433,11 @@ common::Status TensorrtExecutionProvider::Compile(const std::vector(state); + + // The whole compute_function should be considered the critical section where multiple threads may update kernel function state, access one builder, create/serialize/save engine, + // save profile and serialize/save timing cache. Therefore, those operations should be synchronized across different threads when ORT is using multithreading. + // More details here, https://docs.nvidia.com/deeplearning/tensorrt/developer-guide/index.html#threading + std::lock_guard lock(*(trt_state->tensorrt_mu_ptr)); const std::unordered_map& input_indexes = (trt_state->input_info)[0]; const std::unordered_map& output_indexes = (trt_state->output_info)[0]; const std::unordered_map& output_types = (trt_state->output_info)[1]; @@ -2475,237 +2480,230 @@ common::Status TensorrtExecutionProvider::Compile(const std::vector lock(*(trt_state->tensorrt_mu_ptr)); - - // Load serialized engine - if (trt_state->engine_cache_enable && trt_engine == nullptr) { - std::ifstream engine_file(engine_cache_path, std::ios::binary | std::ios::in); - std::ifstream profile_file(profile_cache_path, std::ios::binary | std::ios::in); - if (engine_file && !trt_state->engine_decryption_enable && profile_file) { - // Deserialize profile - shape_ranges = DeserializeProfileV2(profile_file); - LOGS_DEFAULT(VERBOSE) << "[TensorRT EP] DeSerialized " + profile_cache_path; - - // Prepare buffer - engine_file.seekg(0, std::ios::end); - size_t engine_size = engine_file.tellg(); - engine_file.seekg(0, std::ios::beg); - std::unique_ptr engine_buf{new char[engine_size]}; - engine_file.read((char*)engine_buf.get(), engine_size); - - // Deserialize engine - // Note: Deserializing an engine from a TensorRT runtime is thread safe per TRT doc - // https://docs.nvidia.com/deeplearning/tensorrt/developer-guide/index.html#threading - trt_state->engine->reset(); - *(trt_state->engine) = std::unique_ptr( - trt_state->runtime->deserializeCudaEngine(engine_buf.get(), engine_size, nullptr)); - if (*(trt_state->engine) == nullptr) { - return ORT_MAKE_STATUS(ONNXRUNTIME, EP_FAIL, "TensorRT EP Failed to Build Engine."); - } - LOGS_DEFAULT(VERBOSE) << "[TensorRT EP] DeSerialized " + engine_cache_path; - trt_engine = trt_state->engine->get(); - context_update = true; - } else if (trt_state->engine_decryption_enable && std::filesystem::exists(encrypted_engine_cache_path) && profile_file) { - shape_ranges = DeserializeProfileV2(profile_file); - LOGS_DEFAULT(VERBOSE) << "[TensorRT EP] DeSerialized " + profile_cache_path; - // Decrypt engine - size_t engine_size = 0; - if (!trt_state->engine_decryption(encrypted_engine_cache_path.c_str(), nullptr, &engine_size)) { - return ORT_MAKE_STATUS(ONNXRUNTIME, EP_FAIL, - "TensorRT EP could not get engine buffer size"); - } - std::unique_ptr engine_buf{new char[engine_size]}; - if (!trt_state->engine_decryption(encrypted_engine_cache_path.c_str(), &engine_buf[0], &engine_size)) { - return ORT_MAKE_STATUS(ONNXRUNTIME, EP_FAIL, - "TensorRT EP could not call engine decryption function decrypt"); - } - // Deserialize engine - // Note: Deserializing an engine from a TensorRT runtime is thread safe per TRT doc - // https://docs.nvidia.com/deeplearning/tensorrt/developer-guide/index.html#threading - trt_state->engine->reset(); - *(trt_state->engine) = 
std::unique_ptr(trt_state->runtime->deserializeCudaEngine(engine_buf.get(), engine_size, nullptr)); - if (*(trt_state->engine) == nullptr) { - return ORT_MAKE_STATUS(ONNXRUNTIME, EP_FAIL, - "TensorRT EP could not deserialize engine from encrypted cache: " + encrypted_engine_cache_path); - } - LOGS_DEFAULT(VERBOSE) << "[TensorRT EP] Decrypted and DeSerialized " + encrypted_engine_cache_path; - trt_engine = trt_state->engine->get(); - context_update = true; + // Load serialized engine + if (trt_state->engine_cache_enable && trt_engine == nullptr) { + std::ifstream engine_file(engine_cache_path, std::ios::binary | std::ios::in); + std::ifstream profile_file(profile_cache_path, std::ios::binary | std::ios::in); + if (engine_file && !trt_state->engine_decryption_enable && profile_file) { + // Deserialize profile + shape_ranges = DeserializeProfileV2(profile_file); + LOGS_DEFAULT(VERBOSE) << "[TensorRT EP] DeSerialized " + profile_cache_path; + + // Prepare buffer + engine_file.seekg(0, std::ios::end); + size_t engine_size = engine_file.tellg(); + engine_file.seekg(0, std::ios::beg); + std::unique_ptr engine_buf{new char[engine_size]}; + engine_file.read((char*)engine_buf.get(), engine_size); + + // Deserialize engine + // Note: Deserializing an engine from a TensorRT runtime is thread safe per TRT doc + // https://docs.nvidia.com/deeplearning/tensorrt/developer-guide/index.html#threading + trt_state->engine->reset(); + *(trt_state->engine) = std::unique_ptr( + trt_state->runtime->deserializeCudaEngine(engine_buf.get(), engine_size, nullptr)); + if (*(trt_state->engine) == nullptr) { + return ORT_MAKE_STATUS(ONNXRUNTIME, EP_FAIL, "TensorRT EP Failed to Build Engine."); + } + LOGS_DEFAULT(VERBOSE) << "[TensorRT EP] DeSerialized " + engine_cache_path; + trt_engine = trt_state->engine->get(); + context_update = true; + } else if (trt_state->engine_decryption_enable && std::filesystem::exists(encrypted_engine_cache_path) && profile_file) { + shape_ranges = DeserializeProfileV2(profile_file); + LOGS_DEFAULT(VERBOSE) << "[TensorRT EP] DeSerialized " + profile_cache_path; + // Decrypt engine + size_t engine_size = 0; + if (!trt_state->engine_decryption(encrypted_engine_cache_path.c_str(), nullptr, &engine_size)) { + return ORT_MAKE_STATUS(ONNXRUNTIME, EP_FAIL, + "TensorRT EP could not get engine buffer size"); + } + std::unique_ptr engine_buf{new char[engine_size]}; + if (!trt_state->engine_decryption(encrypted_engine_cache_path.c_str(), &engine_buf[0], &engine_size)) { + return ORT_MAKE_STATUS(ONNXRUNTIME, EP_FAIL, + "TensorRT EP could not call engine decryption function decrypt"); + } + // Deserialize engine + // Note: Deserializing an engine from a TensorRT runtime is thread safe per TRT doc + // https://docs.nvidia.com/deeplearning/tensorrt/developer-guide/index.html#threading + trt_state->engine->reset(); + *(trt_state->engine) = std::unique_ptr(trt_state->runtime->deserializeCudaEngine(engine_buf.get(), engine_size, nullptr)); + if (*(trt_state->engine) == nullptr) { + return ORT_MAKE_STATUS(ONNXRUNTIME, EP_FAIL, + "TensorRT EP could not deserialize engine from encrypted cache: " + encrypted_engine_cache_path); } + LOGS_DEFAULT(VERBOSE) << "[TensorRT EP] Decrypted and DeSerialized " + encrypted_engine_cache_path; + trt_engine = trt_state->engine->get(); + context_update = true; } + } - // Check and update shape ranges for dynamic shape inputs. 
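
The std::lock_guard taken at the top of the compute function above replaces the narrower lock that previously sat just before engine deserialization: every thread that may deserialize, rebuild, or serialize the engine, profiles, or timing cache now holds the same mutex for the whole call, per the TensorRT threading notes referenced in the comment. A generic sketch of the pattern, with std::mutex standing in for the provider's mutex type:

#include <mutex>

struct SharedEngineState {
  std::mutex mu;           // stand-in for trt_state->tensorrt_mu_ptr
  int engine_version = 0;  // stand-in for the engine/profile/timing-cache state
};

// Whole-function critical section: the lock is held for the full compute call,
// not just around the individual deserialize/build steps.
int ComputeOnce(SharedEngineState& state) {
  std::lock_guard<std::mutex> lock(state.mu);
  // ... deserialize or (re)build the engine, update profiles, run inference ...
  return ++state.engine_version;
}
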
- for (int i = 0, end = num_inputs; i < end; ++i) { - auto input = trt_state->network->get()->getInput(i); - const std::string& input_name = input->getName(); - input_names.insert(input_name); + // Check and update shape ranges for dynamic shape inputs. + for (int i = 0, end = num_inputs; i < end; ++i) { + auto input = trt_state->network->get()->getInput(i); + const std::string& input_name = input->getName(); + input_names.insert(input_name); - // If there is any input tensor in shape_ranges, it means this input tensor has dynamic shape and its profile shape values have not yet resolved. - // TRT EP will help determine the min/max/opt profile values based on current input tensor value. - if (shape_ranges.find(input_name) != shape_ranges.end()) { - auto status = ApplyProfileShapesFromInputTensorValue(trt_profiles, ctx, input, shape_ranges, input_indexes, tensor_shape_values, stream, &engine_update); - if (status != Status::OK()) { - return ORT_MAKE_STATUS(ONNXRUNTIME, EP_FAIL, "TensorRT EP failed to parse input tensor and generate optimization profiles."); - } + // If there is any input tensor in shape_ranges, it means this input tensor has dynamic shape and its profile shape values have not yet resolved. + // TRT EP will help determine the min/max/opt profile values based on current input tensor value. + if (shape_ranges.find(input_name) != shape_ranges.end()) { + auto status = ApplyProfileShapesFromInputTensorValue(trt_profiles, ctx, input, shape_ranges, input_indexes, tensor_shape_values, stream, &engine_update); + if (status != Status::OK()) { + return ORT_MAKE_STATUS(ONNXRUNTIME, EP_FAIL, "TensorRT EP failed to parse input tensor and generate optimization profiles."); } } + } - // Regenerate engine - if (engine_update) { - // Destroy the IExecutionContext objects before destroying an engine object, otherwise it will lead to undefined behavior. - if (GetPerThreadContext().IsTensorRTContextInMap(fused_node_name)) { - GetPerThreadContext().ResetTensorRTContext(fused_node_name); - } + // Regenerate engine + if (engine_update) { + // Destroy the IExecutionContext objects before destroying an engine object, otherwise it will lead to undefined behavior. 
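Editor's note: since the whole compute function is now a single critical section, concurrent Run() calls on one session cannot race while an engine is regenerated or a profile is updated. A user-level sketch of that scenario (model path, input name, and shape are placeholders):

```python
import threading
import numpy as np
import onnxruntime as ort

# One InferenceSession shared across threads; Run() may be called concurrently,
# and engine rebuilds inside the TensorRT EP are serialized by its internal mutex.
sess = ort.InferenceSession("model.onnx",
                            providers=["TensorrtExecutionProvider", "CPUExecutionProvider"])
x = np.random.rand(1, 3, 224, 224).astype(np.float32)

def worker():
    for _ in range(10):
        sess.run(None, {"input": x})

threads = [threading.Thread(target=worker) for _ in range(4)]
for t in threads:
    t.start()
for t in threads:
    t.join()
```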
+ if (GetPerThreadContext().IsTensorRTContextInMap(fused_node_name)) { + GetPerThreadContext().ResetTensorRTContext(fused_node_name); + } - trt_state->engine->reset(); - auto trt_config = std::unique_ptr(trt_builder->createBuilderConfig()); - trt_config->setMaxWorkspaceSize(*(trt_state->max_workspace_size_ptr)); - for (auto trt_profile : trt_profiles) { - trt_config->addOptimizationProfile(trt_profile); - } + trt_state->engine->reset(); + auto trt_config = std::unique_ptr(trt_builder->createBuilderConfig()); + trt_config->setMaxWorkspaceSize(*(trt_state->max_workspace_size_ptr)); + for (auto trt_profile : trt_profiles) { + trt_config->addOptimizationProfile(trt_profile); + } - // Set INT8 Per Tensor Dynamic range - if (trt_state->int8_enable && trt_builder->platformHasFastInt8() && trt_state->int8_calibration_cache_available) { - trt_config->setInt8Calibrator(nullptr); - if (!SetDynamicRange(*trt_state->network->get(), trt_state->dynamic_range_map)) { - return ORT_MAKE_STATUS(ONNXRUNTIME, EP_FAIL, "TensorRT EP failed to set INT8 dynamic range."); - } + // Set INT8 Per Tensor Dynamic range + if (trt_state->int8_enable && trt_builder->platformHasFastInt8() && trt_state->int8_calibration_cache_available) { + trt_config->setInt8Calibrator(nullptr); + if (!SetDynamicRange(*trt_state->network->get(), trt_state->dynamic_range_map)) { + return ORT_MAKE_STATUS(ONNXRUNTIME, EP_FAIL, "TensorRT EP failed to set INT8 dynamic range."); } + } - // Set precision - if (trt_state->fp16_enable && trt_state->int8_enable) { - trt_config->setFlags(1U << static_cast(nvinfer1::BuilderFlag::kFP16) | 1U << static_cast(nvinfer1::BuilderFlag::kINT8)); - } else if (trt_state->fp16_enable) { - trt_config->setFlag(nvinfer1::BuilderFlag::kFP16); - } else if (trt_state->int8_enable) { - trt_config->setFlag(nvinfer1::BuilderFlag::kINT8); - } + // Set precision + if (trt_state->fp16_enable && trt_state->int8_enable) { + trt_config->setFlags(1U << static_cast(nvinfer1::BuilderFlag::kFP16) | 1U << static_cast(nvinfer1::BuilderFlag::kINT8)); + } else if (trt_state->fp16_enable) { + trt_config->setFlag(nvinfer1::BuilderFlag::kFP16); + } else if (trt_state->int8_enable) { + trt_config->setFlag(nvinfer1::BuilderFlag::kINT8); + } - // Set DLA (DLA can only run with FP16 or INT8) - if ((trt_state->fp16_enable || trt_state->int8_enable) && trt_state->dla_enable) { - LOGS_DEFAULT(VERBOSE) << "[TensorRT EP] use DLA core " << trt_state->dla_core; - trt_config->setFlag(nvinfer1::BuilderFlag::kGPU_FALLBACK); - trt_config->setDefaultDeviceType(nvinfer1::DeviceType::kDLA); - trt_config->setDLACore(trt_state->dla_core); - } + // Set DLA (DLA can only run with FP16 or INT8) + if ((trt_state->fp16_enable || trt_state->int8_enable) && trt_state->dla_enable) { + LOGS_DEFAULT(VERBOSE) << "[TensorRT EP] use DLA core " << trt_state->dla_core; + trt_config->setFlag(nvinfer1::BuilderFlag::kGPU_FALLBACK); + trt_config->setDefaultDeviceType(nvinfer1::DeviceType::kDLA); + trt_config->setDLACore(trt_state->dla_core); + } - // enable sparse weights - if (trt_state->sparsity_enable) { - trt_config->setFlag(nvinfer1::BuilderFlag::kSPARSE_WEIGHTS); - LOGS_DEFAULT(VERBOSE) << "[TensorRT EP] Sparse weights are allowed"; - } + // enable sparse weights + if (trt_state->sparsity_enable) { + trt_config->setFlag(nvinfer1::BuilderFlag::kSPARSE_WEIGHTS); + LOGS_DEFAULT(VERBOSE) << "[TensorRT EP] Sparse weights are allowed"; + } - // enable builder heuristics - if (trt_state->build_heuristics_enable) { - 
trt_config->setFlag(nvinfer1::BuilderFlag::kENABLE_TACTIC_HEURISTIC); - LOGS_DEFAULT(VERBOSE) << "[TensorRT EP] Builder heuristics are enabled"; - } + // enable builder heuristics + if (trt_state->build_heuristics_enable) { + trt_config->setFlag(nvinfer1::BuilderFlag::kENABLE_TACTIC_HEURISTIC); + LOGS_DEFAULT(VERBOSE) << "[TensorRT EP] Builder heuristics are enabled"; + } #if NV_TENSORRT_MINOR > 5 && NV_TENSORRT_MAJOR >= 8 - // switch optimizaion level - if (trt_state->builder_optimization_level != 3) { - trt_config->setBuilderOptimizationLevel(trt_state->builder_optimization_level); - LOGS_DEFAULT(VERBOSE) << "[TensorRT EP] Builder optimization level is set to " << builder_optimization_level_; - } + // switch optimization level + if (trt_state->builder_optimization_level != 3) { + trt_config->setBuilderOptimizationLevel(trt_state->builder_optimization_level); + LOGS_DEFAULT(VERBOSE) << "[TensorRT EP] Builder optimization level is set to " << builder_optimization_level_; + } - // limit auxiliary streams - if (trt_state->auxiliary_streams >= 0) { - trt_config->setMaxAuxStreams(trt_state->auxiliary_streams); - LOGS_DEFAULT(VERBOSE) << "[TensorRT EP] Auxiliary streams are se to " << trt_state->auxiliary_streams; - } + // limit auxiliary streams + if (trt_state->auxiliary_streams >= 0) { + trt_config->setMaxAuxStreams(trt_state->auxiliary_streams); + LOGS_DEFAULT(VERBOSE) << "[TensorRT EP] Auxiliary streams are set to " << trt_state->auxiliary_streams; + } #else - if (trt_state->builder_optimization_level != 3) { - LOGS_DEFAULT(WARNING) << "[TensorRT EP] Builder optimization level can only be used on TRT 8.6 onwards!"; - } - if (trt_state->auxiliary_streams >= 0) { - LOGS_DEFAULT(WARNING) << "[TensorRT EP] Auxiliary streams can only be set on TRT 8.6 onwards!"; - } + if (trt_state->builder_optimization_level != 3) { + LOGS_DEFAULT(WARNING) << "[TensorRT EP] Builder optimization level can only be used on TRT 8.6 onwards!"; + } + if (trt_state->auxiliary_streams >= 0) { + LOGS_DEFAULT(WARNING) << "[TensorRT EP] Auxiliary streams can only be set on TRT 8.6 onwards!"; + } #endif - // limit used tactic sources - if (trt_state->filter_tactic_sources) { - nvinfer1::TacticSources tactics = trt_config->getTacticSources(); - tactics |= trt_state->tactic_sources; - trt_config->setTacticSources(tactics); - LOGS_DEFAULT(VERBOSE) << "[TensorRT EP] Tactic sources are limited using bitmask " << tactics; + // limit used tactic sources + if (trt_state->filter_tactic_sources) { + nvinfer1::TacticSources tactics = trt_config->getTacticSources(); + tactics |= trt_state->tactic_sources; + trt_config->setTacticSources(tactics); + LOGS_DEFAULT(VERBOSE) << "[TensorRT EP] Tactic sources are limited using bitmask " << tactics; + } + + // Load timing cache from file. Create a fresh cache if the file doesn't exist + std::unique_ptr timing_cache = nullptr; + if (trt_state->timing_cache_enable) { + std::vector loaded_timing_cache = loadTimingCacheFile(timing_cache_path); + timing_cache.reset(trt_config->createTimingCache(static_cast(loaded_timing_cache.data()), loaded_timing_cache.size())); + if (timing_cache == nullptr) { + return ORT_MAKE_STATUS(ONNXRUNTIME, EP_FAIL, + "TensorRT EP could not create timing cache: " + timing_cache_path); } - - // Load timing cache from file. 
Create a fresh cache if the file doesn't exist - std::unique_ptr timing_cache = nullptr; - if (trt_state->timing_cache_enable) { - std::vector loaded_timing_cache = loadTimingCacheFile(timing_cache_path); - timing_cache.reset(trt_config->createTimingCache(static_cast(loaded_timing_cache.data()), loaded_timing_cache.size())); - if (timing_cache == nullptr) { - return ORT_MAKE_STATUS(ONNXRUNTIME, EP_FAIL, - "TensorRT EP could not create timing cache: " + timing_cache_path); - } - trt_config->setTimingCache(*timing_cache, force_timing_cache_match_); - if (detailed_build_log_) { - LOGS_DEFAULT(VERBOSE) << "[TensorRT EP] Deserialized timing cache from " + timing_cache_path; - } + trt_config->setTimingCache(*timing_cache, force_timing_cache_match_); + if (detailed_build_log_) { + LOGS_DEFAULT(VERBOSE) << "[TensorRT EP] Deserialized timing cache from " + timing_cache_path; } + } - // Build engine - { - auto lock = GetApiLock(); - std::chrono::steady_clock::time_point engine_build_start; - if (detailed_build_log_) { - engine_build_start = std::chrono::steady_clock::now(); - } - *(trt_state->engine) = std::unique_ptr( - trt_builder->buildEngineWithConfig(*trt_state->network->get(), *trt_config)); - if (detailed_build_log_) { - auto engine_build_stop = std::chrono::steady_clock::now(); - LOGS_DEFAULT(INFO) << "TensorRT engine build for " << trt_state->trt_node_name_with_precision << " took: " << std::chrono::duration_cast(engine_build_stop - engine_build_start).count() << "ms" << std::endl; - } - } - if (*(trt_state->engine) == nullptr) { - return ORT_MAKE_STATUS(ONNXRUNTIME, EP_FAIL, "TensorRT EP Failed to Build Engine."); + // Build engine + { + auto lock = GetApiLock(); + std::chrono::steady_clock::time_point engine_build_start; + if (detailed_build_log_) { + engine_build_start = std::chrono::steady_clock::now(); } - trt_engine = trt_state->engine->get(); - if (trt_state->engine_cache_enable) { - // Serialize engine profile - SerializeProfileV2(profile_cache_path, shape_ranges); - LOGS_DEFAULT(VERBOSE) << "[TensorRT EP] Serialized " + profile_cache_path; - - // Serialize engine - std::unique_ptr serializedModel(trt_engine->serialize()); - size_t engine_size = serializedModel->size(); - if (trt_state->engine_decryption_enable) { - // Encrypt engine. The library is not always deployed with the encrypt function, so check if it is available first. - if (trt_state->engine_encryption != nullptr) { - if (!trt_state->engine_encryption(encrypted_engine_cache_path.c_str(), reinterpret_cast(serializedModel->data()), engine_size)) { - return ORT_MAKE_STATUS(ONNXRUNTIME, EP_FAIL, - "TensorRT EP could not call engine encryption function encrypt"); - } - LOGS_DEFAULT(VERBOSE) << "[TensorRT EP] Serialized and encrypted engine " + encrypted_engine_cache_path; - } else { - LOGS_DEFAULT(WARNING) << "[TensorRT EP] Engine cache encryption function is not found. 
No cache is written to disk"; + *(trt_state->engine) = std::unique_ptr( + trt_builder->buildEngineWithConfig(*trt_state->network->get(), *trt_config)); + if (detailed_build_log_) { + auto engine_build_stop = std::chrono::steady_clock::now(); + LOGS_DEFAULT(INFO) << "TensorRT engine build for " << trt_state->trt_node_name_with_precision << " took: " << std::chrono::duration_cast(engine_build_stop - engine_build_start).count() << "ms" << std::endl; + } + } + if (*(trt_state->engine) == nullptr) { + return ORT_MAKE_STATUS(ONNXRUNTIME, EP_FAIL, "TensorRT EP Failed to Build Engine."); + } + trt_engine = trt_state->engine->get(); + if (trt_state->engine_cache_enable) { + // Serialize engine profile + SerializeProfileV2(profile_cache_path, shape_ranges); + LOGS_DEFAULT(VERBOSE) << "[TensorRT EP] Serialized " + profile_cache_path; + + // Serialize engine + std::unique_ptr serializedModel(trt_engine->serialize()); + size_t engine_size = serializedModel->size(); + if (trt_state->engine_decryption_enable) { + // Encrypt engine. The library is not always deployed with the encrypt function, so check if it is available first. + if (trt_state->engine_encryption != nullptr) { + if (!trt_state->engine_encryption(encrypted_engine_cache_path.c_str(), reinterpret_cast(serializedModel->data()), engine_size)) { + return ORT_MAKE_STATUS(ONNXRUNTIME, EP_FAIL, + "TensorRT EP could not call engine encryption function encrypt"); } + LOGS_DEFAULT(VERBOSE) << "[TensorRT EP] Serialized and encrypted engine " + encrypted_engine_cache_path; } else { - std::ofstream file(engine_cache_path, std::ios::binary | std::ios::out); - file.write(reinterpret_cast(serializedModel->data()), engine_size); - LOGS_DEFAULT(VERBOSE) << "[TensorRT EP] Serialized " + engine_cache_path; + LOGS_DEFAULT(WARNING) << "[TensorRT EP] Engine cache encryption function is not found. 
No cache is written to disk"; } + } else { + std::ofstream file(engine_cache_path, std::ios::binary | std::ios::out); + file.write(reinterpret_cast(serializedModel->data()), engine_size); + LOGS_DEFAULT(VERBOSE) << "[TensorRT EP] Serialized " + engine_cache_path; } + } - // serialize and save timing cache - if (trt_state->timing_cache_enable) { - auto timing_cache = trt_config->getTimingCache(); - std::unique_ptr timingCacheHostData{timing_cache->serialize()}; - if (timingCacheHostData == nullptr) { - return ORT_MAKE_STATUS(ONNXRUNTIME, EP_FAIL, - "TensorRT EP could not serialize timing cache: " + timing_cache_path); - } - saveTimingCacheFile(timing_cache_path, timingCacheHostData.get()); - if (detailed_build_log_) { - LOGS_DEFAULT(VERBOSE) << "[TensorRT EP] Serialized timing cache " + timing_cache_path; - } + // serialize and save timing cache + if (trt_state->timing_cache_enable) { + auto timing_cache = trt_config->getTimingCache(); + std::unique_ptr timingCacheHostData{timing_cache->serialize()}; + if (timingCacheHostData == nullptr) { + return ORT_MAKE_STATUS(ONNXRUNTIME, EP_FAIL, + "TensorRT EP could not serialize timing cache: " + timing_cache_path); + } + saveTimingCacheFile(timing_cache_path, timingCacheHostData.get()); + if (detailed_build_log_) { + LOGS_DEFAULT(VERBOSE) << "[TensorRT EP] Serialized timing cache " + timing_cache_path; } - context_update = true; } + context_update = true; } // Build execution context if either of the following conditions is true: diff --git a/onnxruntime/core/providers/webnn/builders/impl/binary_op_builder.cc b/onnxruntime/core/providers/webnn/builders/impl/binary_op_builder.cc index 8e588aad15f4c..5adaf80543279 100644 --- a/onnxruntime/core/providers/webnn/builders/impl/binary_op_builder.cc +++ b/onnxruntime/core/providers/webnn/builders/impl/binary_op_builder.cc @@ -18,6 +18,10 @@ class BinaryOpBuilder : public BaseOpBuilder { private: Status AddToModelBuilderImpl(ModelBuilder& model_builder, const Node& node, const logging::Logger& logger) const override ORT_MUST_USE_RESULT; + + // Operator support related. + bool IsOpSupportedImpl(const InitializedTensorSet& initializers, const Node& node, + const WebnnDeviceType device_type, const logging::Logger& logger) const override; }; // Add operator related. @@ -50,6 +54,24 @@ Status BinaryOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, const return Status::OK(); } +bool BinaryOpBuilder::IsOpSupportedImpl(const InitializedTensorSet& initializers, + const Node& node, + const WebnnDeviceType device_type, + const logging::Logger& logger) const { + const auto& input_defs = node.InputDefs(); + const auto& op_type = node.OpType(); + + // XNNPACK prelu operator expects slope to be a static value. + // https://github.com/google/XNNPACK/issues/4692 + // TODO: Remove this check after it is solved. 
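Editor's note: the check below rejects PRelu on the WebNN CPU backend when the slope is fed in as a dynamic input. A sketch of how to author a model whose slope is baked in as an initializer, so backends that require a static slope can accept it (names and shapes are illustrative):

```python
import numpy as np
import onnx
from onnx import TensorProto, helper, numpy_helper

# The slope is stored as an initializer rather than a graph input.
x = helper.make_tensor_value_info("x", TensorProto.FLOAT, [1, 4, 8, 8])
y = helper.make_tensor_value_info("y", TensorProto.FLOAT, [1, 4, 8, 8])
slope = numpy_helper.from_array(np.full((4, 1, 1), 0.1, dtype=np.float32), name="slope")

prelu = helper.make_node("PRelu", inputs=["x", "slope"], outputs=["y"])
graph = helper.make_graph([prelu], "prelu_const_slope", [x], [y], initializer=[slope])
model = helper.make_model(graph, opset_imports=[helper.make_opsetid("", 13)])
onnx.checker.check_model(model)
```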
+ if (op_type == "PRelu" && !Contains(initializers, input_defs[1]->Name()) && device_type == WebnnDeviceType::CPU) { + LOGS(logger, VERBOSE) << "The second input (slope) for PRelu must be a constant initializer for WebNN CPU backend."; + return false; + } + + return true; +} + void CreateBinaryOpBuilder(const std::string& op_type, OpBuilderRegistrations& op_registrations) { if (op_registrations.op_builder_map.count(op_type) > 0) return; diff --git a/onnxruntime/python/onnxruntime_pybind_mlvalue.cc b/onnxruntime/python/onnxruntime_pybind_mlvalue.cc index 10c8a2de7c3df..f470e9f6b6ed1 100644 --- a/onnxruntime/python/onnxruntime_pybind_mlvalue.cc +++ b/onnxruntime/python/onnxruntime_pybind_mlvalue.cc @@ -26,7 +26,18 @@ #include "core/framework/provider_options_utils.h" #ifdef USE_DML -#include "core/providers/dml/DmlExecutionProvider/src/DmlExternalBufferAllocator.h" +using Microsoft::WRL::ComPtr; + +#include +#include "core/providers/dml/DmlExecutionProvider/src/External/D3DX12/d3dx12.h" +#include "core/providers/dml/DmlExecutionProvider/src/ErrorHandling.h" +#include "core/providers/dml/DmlExecutionProvider/src/DescriptorPool.h" +#include "core/providers/dml/DmlExecutionProvider/src/DmlCommittedResourceAllocator.h" +#include "core/providers/dml/DmlExecutionProvider/inc/DmlExecutionProvider.h" +#include "core/providers/dml/DmlExecutionProvider/src/BucketizedBufferAllocator.h" +#include "core/providers/dml/DmlExecutionProvider/src/PooledUploadHeap.h" +#include "core/providers/dml/DmlExecutionProvider/src/ReadbackHeap.h" +#include "core/providers/dml/DmlExecutionProvider/src/AllocationInfo.h" #endif namespace onnxruntime { @@ -186,6 +197,11 @@ std::unique_ptr GetGPUDataTransfer() { #endif #ifdef USE_DML + +constexpr GUID execution_context_guid = {0x50fd773b, 0x4462, 0x4b28, {0x98, 0x9e, 0x8c, 0xa0, 0x54, 0x05, 0xbd, 0x4a}}; +constexpr GUID upload_heap_guid = {0x125235f9, 0xef41, 0x4043, {0xa4, 0x9d, 0xdd, 0xc9, 0x61, 0xe7, 0xdb, 0xee}}; +constexpr GUID dml_readback_heap_guid = {0x00d32df8, 0xea2d, 0x40bf, {0xa4, 0x47, 0x9c, 0xb4, 0xbc, 0xf1, 0x1d, 0x5e}}; + AllocatorPtr GetDmlAllocator(OrtDevice::DeviceId id) { // Current approach is not thread-safe, but there are some bigger infra pieces to put together in order to make // multi-threaded DML allocation work, including maintaining a per-thread DML allocator. 
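Editor's note: GetDmlAllocator is what backs device-resident OrtValues in the Python bindings. A rough sketch of the intended usage, assuming a DirectML build of onnxruntime and assuming the device string exposed by this change is 'dml' (see the GetDeviceName change further below):

```python
import numpy as np
import onnxruntime as ort

# Assumption: the DirectML package is installed and 'dml' is the accepted device name.
x = np.random.rand(1, 3, 224, 224).astype(np.float32)
dml_value = ort.OrtValue.ortvalue_from_numpy(x, "dml", 0)  # allocated through GetDmlAllocator
print(dml_value.device_name(), dml_value.shape())
```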
@@ -196,13 +212,100 @@ AllocatorPtr GetDmlAllocator(OrtDevice::DeviceId id) { auto hit = id_to_allocator_map->find(id); if (hit == id_to_allocator_map->end()) { - auto dml_allocator = std::make_shared(id); + constexpr uint32_t device_id = 0; + auto d3d12_device = onnxruntime::DMLProviderFactoryCreator::CreateD3D12Device(device_id, false); + auto dml_device = onnxruntime::DMLProviderFactoryCreator::CreateDMLDevice(d3d12_device.Get()); + + D3D12_COMMAND_QUEUE_DESC cmd_queue_desc = {}; + cmd_queue_desc.Type = D3D12_COMMAND_LIST_TYPE_DIRECT; + cmd_queue_desc.Flags = D3D12_COMMAND_QUEUE_FLAG_DISABLE_GPU_TIMEOUT; + + ComPtr cmd_queue; + ORT_THROW_IF_FAILED( + d3d12_device->CreateCommandQueue(&cmd_queue_desc, IID_PPV_ARGS(cmd_queue.ReleaseAndGetAddressOf()))); + + auto context = std::make_shared(d3d12_device.Get(), dml_device.Get(), cmd_queue.Get()); + + // We leak the upload and readback heaps to keep them alive, just like the map + auto upload_heap = std::make_unique(d3d12_device.Get(), context).release(); + auto readback_heap = std::make_unique(d3d12_device.Get(), context).release(); + + auto dml_allocator = std::make_shared( + d3d12_device.Get(), + context, + CD3DX12_HEAP_PROPERTIES(D3D12_HEAP_TYPE_DEFAULT), + D3D12_HEAP_FLAG_NONE, + D3D12_RESOURCE_FLAG_ALLOW_UNORDERED_ACCESS, + D3D12_RESOURCE_STATE_UNORDERED_ACCESS, + std::make_unique(d3d12_device.Get())); + dml_allocator->SetDefaultRoundingMode(AllocatorRoundingMode::Enabled); + context->SetAllocator(dml_allocator); + + auto context_ptr = context.get(); + + ORT_THROW_IF_FAILED(d3d12_device->SetPrivateData(execution_context_guid, sizeof(context_ptr), &context_ptr)); + ORT_THROW_IF_FAILED(d3d12_device->SetPrivateData(upload_heap_guid, sizeof(upload_heap), &upload_heap)); + ORT_THROW_IF_FAILED(d3d12_device->SetPrivateData(dml_readback_heap_guid, sizeof(readback_heap), &readback_heap)); + hit = id_to_allocator_map->emplace(id, std::move(dml_allocator)).first; } return hit->second; } +void CpuToDmlMemCpy(void* dst, const void* src, size_t num_bytes) { + const auto* allocInfo = static_cast(dst); + ID3D12Resource* dst_data = allocInfo->GetResource(); + + ComPtr d3d12_device; + ORT_THROW_IF_FAILED(dst_data->GetDevice(IID_PPV_ARGS(d3d12_device.ReleaseAndGetAddressOf()))); + + Dml::ExecutionContext* context = nullptr; + uint32_t context_size = gsl::narrow_cast(sizeof(context)); + ORT_THROW_IF_FAILED(d3d12_device->GetPrivateData(execution_context_guid, &context_size, &context)); + + Dml::PooledUploadHeap* upload_heap = nullptr; + uint32_t upload_heap_size = gsl::narrow_cast(sizeof(upload_heap)); + ORT_THROW_IF_FAILED(d3d12_device->GetPrivateData(upload_heap_guid, &upload_heap_size, &upload_heap)); + + upload_heap->BeginUploadToGpu( + dst_data, 0, D3D12_RESOURCE_STATE_UNORDERED_ACCESS, gsl::make_span(static_cast(src), num_bytes)); + context->Flush(); + + // We don't use the same command queue as the execution provider, so we need to sync to make sure that all data has + // been uploaded to the resource. This function is usually called before inference just to upload initial data to the + // GPU, so it shouldn't be a bottleneck. 
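Editor's note: CpuToDmlMemCpy and DmlToCpuMemCpy are the upload and readback halves of that path: the upload runs when an OrtValue is created from a numpy array on the DML device, and the readback runs when it is converted back to numpy. A sketch under the same assumptions as above (model path and tensor names are placeholders):

```python
import numpy as np
import onnxruntime as ort

x = np.random.rand(1, 3, 224, 224).astype(np.float32)
dml_value = ort.OrtValue.ortvalue_from_numpy(x, "dml", 0)  # CPU -> DML upload (CpuToDmlMemCpy)
assert np.allclose(dml_value.numpy(), x)                   # DML -> CPU readback (DmlToCpuMemCpy)

# The device value can also be bound to a session input to avoid a per-Run() upload.
sess = ort.InferenceSession("model.onnx", providers=["DmlExecutionProvider"])
binding = sess.io_binding()
binding.bind_ortvalue_input("input", dml_value)
binding.bind_output("output")  # let ORT allocate the output on CPU
sess.run_with_iobinding(binding)
```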
+ context->GetCurrentCompletionEvent().WaitForSignal(); +} + +void DmlToCpuMemCpy(void* dst, const void* src, size_t num_bytes) { + const auto* allocInfo = static_cast(src); + ID3D12Resource* src_data = allocInfo->GetResource(); + + ComPtr d3d12_device; + ORT_THROW_IF_FAILED(src_data->GetDevice(IID_PPV_ARGS(d3d12_device.ReleaseAndGetAddressOf()))); + + Dml::ExecutionContext* context = nullptr; + uint32_t context_size = gsl::narrow_cast(sizeof(context)); + ORT_THROW_IF_FAILED(d3d12_device->GetPrivateData(execution_context_guid, &context_size, &context)); + + Dml::ReadbackHeap* readback_heap = nullptr; + uint32_t readback_heap_size = gsl::narrow_cast(sizeof(readback_heap)); + ORT_THROW_IF_FAILED(d3d12_device->GetPrivateData(dml_readback_heap_guid, &readback_heap_size, &readback_heap)); + + // ReadbackFromGpu already syncs with the CPU and waits for the copy to be completed, so we don't need to sync after + // this call + readback_heap->ReadbackFromGpu( + gsl::make_span(static_cast(dst), num_bytes), src_data, 0, D3D12_RESOURCE_STATE_UNORDERED_ACCESS); +} + +const std::unordered_map* GetDmlToHostMemCpyFunction() { + static std::unordered_map map{ + {OrtDevice::GPU, DmlToCpuMemCpy}}; + + return &map; +} + #endif #ifdef USE_CANN diff --git a/onnxruntime/python/onnxruntime_pybind_mlvalue.h b/onnxruntime/python/onnxruntime_pybind_mlvalue.h index 4ac9c70468b19..e3f277bcb9c41 100644 --- a/onnxruntime/python/onnxruntime_pybind_mlvalue.h +++ b/onnxruntime/python/onnxruntime_pybind_mlvalue.h @@ -77,6 +77,12 @@ std::unique_ptr GetGPUDataTransfer(); AllocatorPtr GetDmlAllocator(OrtDevice::DeviceId id); +void CpuToDmlMemCpy(void* dst, const void* src, size_t num_bytes); + +void DmlToCpuMemCpy(void* dst, const void* src, size_t num_bytes); + +const std::unordered_map* GetDmlToHostMemCpyFunction(); + #endif #ifdef USE_CANN diff --git a/onnxruntime/python/onnxruntime_pybind_ortvalue.cc b/onnxruntime/python/onnxruntime_pybind_ortvalue.cc index f9d908e0ac518..dc4a4dcc13b7f 100644 --- a/onnxruntime/python/onnxruntime_pybind_ortvalue.cc +++ b/onnxruntime/python/onnxruntime_pybind_ortvalue.cc @@ -63,7 +63,12 @@ void addOrtValueMethods(pybind11::module& m) { // Likewise, there is no need to specify the name (as the name was previously used to lookup the def list) // TODO: Add check to ensure that string arrays are not passed - we currently don't support string tensors in CUDA CreateGenericMLValue(nullptr, GetRocmAllocator(device.Id()), "", array_on_cpu, ml_value.get(), true, false, CpuToRocmMemCpy); - +#elif USE_DML + // InputDeflist is null because OrtValue creation is not tied to a specific model + // Likewise, there is no need to specify the name (as the name was previously used to lookup the def list) + // TODO: Add check to ensure that string arrays are not passed - we currently don't support string tensors in DML + CreateGenericMLValue( + nullptr, GetDmlAllocator(device.Id()), "", array_on_cpu, ml_value.get(), true, false, CpuToDmlMemCpy); #else throw std::runtime_error( "Can't allocate memory on the CUDA device using this package of OnnxRuntime. 
" @@ -126,6 +131,12 @@ void addOrtValueMethods(pybind11::module& m) { values_type, *(ml_value->GetMutable()), CpuToRocmMemCpy); +#elif USE_DML + onnxruntime::python::CopyDataToTensor( + py_values, + values_type, + *(ml_value->GetMutable()), + CpuToDmlMemCpy); #else throw std::runtime_error( "Unsupported GPU device: Cannot find the supported GPU device."); @@ -158,12 +169,18 @@ void addOrtValueMethods(pybind11::module& m) { throw std::runtime_error("The provided device id doesn't match any available GPUs on the machine."); } allocator = GetCudaAllocator(device.Id()); -#elif USE_DML - allocator = GetDmlAllocator(device.Id()); #else throw std::runtime_error( "Can't allocate memory on the CUDA device using this package of OnnxRuntime. " "Please use the CUDA package of OnnxRuntime to use this feature."); +#endif + } else if (strcmp(GetDeviceName(device), DML) == 0) { +#if USE_DML + allocator = GetDmlAllocator(device.Id()); +#else + throw std::runtime_error( + "Can't allocate memory on the DirectML device using this package of OnnxRuntime. " + "Please use the DirectML package of OnnxRuntime to use this feature."); #endif } else { throw std::runtime_error("Unsupported device: Cannot place the OrtValue on this device"); @@ -290,11 +307,13 @@ void addOrtValueMethods(pybind11::module& m) { #ifdef USE_CUDA GetPyObjFromTensor(ml_value->Get(), obj, nullptr, GetCudaToHostMemCpyFunction()); #elif USE_ROCM - GetPyObjFromTensor(ml_value->Get(), obj, nullptr, GetRocmToHostMemCpyFunction()); + GetPyObjFromTensor(ml_value->Get(), obj, nullptr, GetRocmToHostMemCpyFunction()); #elif USE_CANN - GetPyObjFromTensor(ml_value->Get(), obj, nullptr, GetCannToHostMemCpyFunction()); + GetPyObjFromTensor(ml_value->Get(), obj, nullptr, GetCannToHostMemCpyFunction()); +#elif USE_DML + GetPyObjFromTensor(ml_value->Get(), obj, nullptr, GetDmlToHostMemCpyFunction()); #else - GetPyObjFromTensor(ml_value->Get(), obj, nullptr, nullptr); + GetPyObjFromTensor(ml_value->Get(), obj, nullptr, nullptr); #endif return obj; }) diff --git a/onnxruntime/python/onnxruntime_pybind_state.cc b/onnxruntime/python/onnxruntime_pybind_state.cc index 82d119894a5d8..907ea0ec41e23 100644 --- a/onnxruntime/python/onnxruntime_pybind_state.cc +++ b/onnxruntime/python/onnxruntime_pybind_state.cc @@ -237,7 +237,11 @@ const char* GetDeviceName(const OrtDevice& device) { case OrtDevice::CPU: return CPU; case OrtDevice::GPU: +#ifdef USE_DML + return DML; +#else return CUDA; +#endif case OrtDevice::FPGA: return "FPGA"; case OrtDevice::NPU: diff --git a/onnxruntime/python/tools/transformers/notebooks/PyTorch_Bert-Squad_OnnxRuntime_GPU.ipynb b/onnxruntime/python/tools/transformers/notebooks/PyTorch_Bert-Squad_OnnxRuntime_GPU.ipynb index 74b81fc7c867f..43c31e1ea45ac 100644 --- a/onnxruntime/python/tools/transformers/notebooks/PyTorch_Bert-Squad_OnnxRuntime_GPU.ipynb +++ b/onnxruntime/python/tools/transformers/notebooks/PyTorch_Bert-Squad_OnnxRuntime_GPU.ipynb @@ -33,19 +33,20 @@ "\n", "#### GPU Environment Setup using AnaConda\n", "\n", - "First, we install [AnaConda](https://www.anaconda.com/distribution/) in a target machine and open an AnaConda prompt window when it is done. Then run the following commands to create a conda environment. This notebook is tested with PyTorch 1.5.0 and OnnxRuntime 1.3.0.\n", + "First, we install [AnaConda](https://www.anaconda.com/distribution/) in a target machine and open an AnaConda prompt window when it is done. Then run the following commands to create a conda environment. 
This notebook is tested with PyTorch 2.0.1 and OnnxRuntime 1.16.0.\n", "\n", "```console\n", - "conda create -n gpu_env python=3.6\n", + "conda create -n gpu_env python=3.10\n", "conda activate gpu_env\n", - "conda install -c anaconda ipykernel\n", + "pip install jupyterlab\n", + "conda install ipykernel\n", "conda install -c conda-forge ipywidgets\n", - "python -m ipykernel install --user --name=gpu_env\n", - "jupyter notebook\n", + "ipython kernel install --user --name gpu_env\n", + "jupyter-lab\n", "```\n", "Finally, launch Jupyter Notebook and you can choose gpu_env as kernel to run this notebook.\n", "\n", - "Onnxruntime-gpu need specified version of CUDA and cuDNN. You can find the Requirements [here](https://onnxruntime.ai/docs/install/). Remember to add the directories to PATH environment variable (See [CUDA and cuDNN Path](#CUDA-and-cuDNN-Path) below)." + "Onnxruntime-gpu needs specific versions of CUDA and cuDNN. You can find the requirements [here](https://onnxruntime.ai/docs/execution-providers/CUDA-ExecutionProvider.html#requirements). Remember to add the directories to the PATH environment variable (see [CUDA and cuDNN Path](#CUDA-and-cuDNN-Path) below)." ] }, { @@ -56,18 +57,19 @@ "source": [ "import sys\n", "\n", - "run_install = False # Only need install once\n", - "if run_install:\n", - " if sys.platform in ['linux', 'win32']: # Linux or Windows\n", - " !{sys.executable} -m pip install torch==1.9.0+cu111 torchvision==0.10.0+cu111 torchaudio===0.9.0 -f https://download.pytorch.org/whl/torch_stable.html\n", - " else: # Mac\n", - " print(\"PyTorch 1.9 MacOS Binaries do not support CUDA, install from source instead\")\n", - "\n", - " !{sys.executable} -m pip install onnxruntime-gpu==1.8.1 onnx==1.9.0 onnxconverter_common==1.8.1\n", - "\n", - " # Install other packages used in this notebook.\n", - " !{sys.executable} -m pip install transformers==4.8.2\n", - " !{sys.executable} -m pip install psutil pytz pandas py-cpuinfo py3nvml coloredlogs wget netron sympy" + "if sys.platform in ['linux', 'win32']: # Linux or Windows\n", + " !{sys.executable} -m pip install torch --index-url https://download.pytorch.org/whl/cu118 -q\n", + " !{sys.executable} -m pip install onnxruntime-gpu onnx transformers psutil pandas py-cpuinfo py3nvml coloredlogs wget netron sympy protobuf==3.20.3 -q\n", + "else: # Mac\n", + " print(\"CUDA is not available on MacOS\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### CUDA and cuDNN Path\n", + "onnxruntime-gpu has a dependency on [CUDA](https://developer.nvidia.com/cuda-downloads) and [cuDNN](https://developer.nvidia.com/cudnn). The required CUDA version can be found [here](https://onnxruntime.ai/docs/execution-providers/CUDA-ExecutionProvider.html#requirements). If you import torch before onnxruntime, onnxruntime might use the CUDA and cuDNN DLLs that were loaded by PyTorch."
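Editor's note: to confirm that the environment above is wired up correctly (and that onnxruntime-gpu actually sees CUDA), a quick sanity check can be run in the first cell; all of these are public torch/onnxruntime APIs:

```python
import torch            # importing torch first also makes its CUDA/cuDNN DLLs visible to onnxruntime
import onnxruntime as ort

print("torch:", torch.__version__, "| cuda available:", torch.cuda.is_available())
print("onnxruntime:", ort.__version__, "| device:", ort.get_device())
print("providers:", ort.get_available_providers())  # expect CUDAExecutionProvider for onnxruntime-gpu
```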
] }, { @@ -79,10 +81,10 @@ "name": "stdout", "output_type": "stream", "text": [ - "pytorch: 1.9.0+cu111\n", - "onnxruntime: 1.8.1\n", - "onnx: 1.9.0\n", - "transformers: 4.8.2\n" + "pytorch: 2.0.1+cu118\n", + "onnxruntime: 1.16.0\n", + "onnx: 1.14.1\n", + "transformers: 4.33.1\n" ] } ], @@ -191,9 +193,12 @@ "name": "stderr", "output_type": "stream", "text": [ - "100%|██████████| 48/48 [00:03<00:00, 14.24it/s]\n", - "convert squad examples to features: 100%|██████████| 1000/1000 [00:08<00:00, 112.67it/s]\n", - "add example index and unique id: 100%|██████████| 1000/1000 [00:00<00:00, 836518.55it/s]\n" + "Some weights of the model checkpoint at bert-large-uncased-whole-word-masking-finetuned-squad were not used when initializing BertForQuestionAnswering: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']\n", + "- This IS expected if you are initializing BertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n", + "- This IS NOT expected if you are initializing BertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n", + "100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 48/48 [00:02<00:00, 16.27it/s]\n", + "convert squad examples to features: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1000/1000 [00:03<00:00, 256.11it/s]\n", + "add example index and unique id: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1000/1000 [00:00\n", " \n", " 0\n", - " 23.72\n", - " 23.72\n", - " 23.87\n", - " 23.99\n", - " 24.11\n", - " 24.37\n", - " 42.15\n", - " 4\n", + " 3.19\n", + " 3.16\n", + " 3.21\n", + " 3.27\n", + " 3.35\n", + " 3.52\n", + " 313.24\n", + " 1\n", " \n", " \n", " 1\n", - " 24.24\n", - " 24.24\n", - " 24.42\n", - " 24.60\n", - " 24.76\n", - " 25.23\n", - " 41.25\n", - " 3\n", + " 3.20\n", + " 3.17\n", + " 3.22\n", + " 3.25\n", + " 3.34\n", + " 3.50\n", + " 312.80\n", + " 8\n", " \n", " \n", " 2\n", - " 24.36\n", - " 24.36\n", - " 24.47\n", - " 24.69\n", - " 25.01\n", - " 26.52\n", - " 41.05\n", - " 2\n", + " 3.20\n", + " 3.15\n", + " 3.25\n", + " 3.29\n", + " 3.36\n", + " 3.58\n", + " 312.51\n", + " 15\n", " \n", " \n", " 3\n", - " 24.39\n", - " 24.37\n", - " 24.47\n", - " 24.65\n", - " 24.73\n", - " 25.12\n", - " 41.01\n", - " 1\n", + " 3.20\n", + " 3.18\n", + " 3.21\n", + " 3.26\n", + " 3.35\n", + " 3.53\n", + " 312.49\n", + " 14\n", + " \n", + " \n", + " 4\n", + " 3.20\n", + " 3.16\n", + " 3.25\n", + " 3.29\n", + " 3.40\n", + " 3.56\n", + " 312.24\n", + " 13\n", + " \n", + " \n", + " 5\n", + " 3.20\n", + " 3.19\n", + " 3.22\n", + " 3.27\n", + " 3.35\n", + " 3.48\n", + " 312.20\n", + " 12\n", + " \n", + " \n", + " 6\n", + " 3.21\n", + " 3.18\n", + " 3.23\n", + " 3.28\n", + " 3.37\n", + " 3.51\n", + " 311.73\n", + " 24\n", + " \n", + " \n", + " 7\n", + " 3.21\n", + " 3.19\n", + " 3.23\n", + " 3.27\n", + " 3.34\n", + " 3.52\n", + " 311.57\n", + " 9\n", + " \n", + " \n", + " 8\n", + " 3.21\n", + " 3.18\n", + " 3.26\n", + " 3.31\n", + " 3.36\n", + " 3.54\n", + " 311.15\n", + " 32\n", + " \n", 
+ " \n", + " 9\n", + " 3.21\n", + " 3.17\n", + " 3.24\n", + " 3.28\n", + " 3.34\n", + " 3.52\n", + " 311.10\n", + " 5\n", + " \n", + " \n", + " 10\n", + " 3.21\n", + " 3.19\n", + " 3.25\n", + " 3.29\n", + " 3.33\n", + " 3.54\n", + " 311.10\n", + " 2\n", + " \n", + " \n", + " 11\n", + " 3.22\n", + " 3.19\n", + " 3.25\n", + " 3.29\n", + " 3.36\n", + " 3.51\n", + " 310.93\n", + " 10\n", + " \n", + " \n", + " 12\n", + " 3.22\n", + " 3.19\n", + " 3.26\n", + " 3.29\n", + " 3.40\n", + " 3.55\n", + " 310.30\n", + " 3\n", + " \n", + " \n", + " 13\n", + " 3.23\n", + " 3.19\n", + " 3.26\n", + " 3.32\n", + " 3.42\n", + " 3.58\n", + " 310.02\n", + " 11\n", + " \n", + " \n", + " 14\n", + " 3.23\n", + " 3.19\n", + " 3.26\n", + " 3.30\n", + " 3.36\n", + " 3.54\n", + " 310.02\n", + " 4\n", + " \n", + " \n", + " 15\n", + " 3.23\n", + " 3.20\n", + " 3.23\n", + " 3.27\n", + " 3.35\n", + " 3.60\n", + " 309.53\n", + " 7\n", + " \n", + " \n", + " 16\n", + " 3.23\n", + " 3.19\n", + " 3.22\n", + " 3.26\n", + " 3.33\n", + " 3.68\n", + " 309.27\n", + " 6\n", " \n", " \n", "\n", "" ], "text/plain": [ - " Latency(ms) Latency_P50 Latency_P75 Latency_P90 Latency_P95 \\\n", - "0 23.72 23.72 23.87 23.99 24.11 \n", - "1 24.24 24.24 24.42 24.60 24.76 \n", - "2 24.36 24.36 24.47 24.69 25.01 \n", - "3 24.39 24.37 24.47 24.65 24.73 \n", + " Latency(ms) Latency_P50 Latency_P75 Latency_P90 Latency_P95 \\\n", + "0 3.19 3.16 3.21 3.27 3.35 \n", + "1 3.20 3.17 3.22 3.25 3.34 \n", + "2 3.20 3.15 3.25 3.29 3.36 \n", + "3 3.20 3.18 3.21 3.26 3.35 \n", + "4 3.20 3.16 3.25 3.29 3.40 \n", + "5 3.20 3.19 3.22 3.27 3.35 \n", + "6 3.21 3.18 3.23 3.28 3.37 \n", + "7 3.21 3.19 3.23 3.27 3.34 \n", + "8 3.21 3.18 3.26 3.31 3.36 \n", + "9 3.21 3.17 3.24 3.28 3.34 \n", + "10 3.21 3.19 3.25 3.29 3.33 \n", + "11 3.22 3.19 3.25 3.29 3.36 \n", + "12 3.22 3.19 3.26 3.29 3.40 \n", + "13 3.23 3.19 3.26 3.32 3.42 \n", + "14 3.23 3.19 3.26 3.30 3.36 \n", + "15 3.23 3.20 3.23 3.27 3.35 \n", + "16 3.23 3.19 3.22 3.26 3.33 \n", "\n", - " Latency_P99 Throughput(QPS) intra_op_num_threads \n", - "0 24.37 42.15 4 \n", - "1 25.23 41.25 3 \n", - "2 26.52 41.05 2 \n", - "3 25.12 41.01 1 " + " Latency_P99 Throughput(QPS) intra_op_num_threads \n", + "0 3.52 313.24 1 \n", + "1 3.50 312.80 8 \n", + "2 3.58 312.51 15 \n", + "3 3.53 312.49 14 \n", + "4 3.56 312.24 13 \n", + "5 3.48 312.20 12 \n", + "6 3.51 311.73 24 \n", + "7 3.52 311.57 9 \n", + "8 3.54 311.15 32 \n", + "9 3.52 311.10 5 \n", + "10 3.54 311.10 2 \n", + "11 3.51 310.93 10 \n", + "12 3.55 310.30 3 \n", + "13 3.58 310.02 11 \n", + "14 3.54 310.02 4 \n", + "15 3.60 309.53 7 \n", + "16 3.68 309.27 6 " ] }, - "execution_count": 18, + "execution_count": 17, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "import os\n", - "import glob \n", - "import pandas\n", - "latest_result_file = max(glob.glob(\"./onnx/perf_results_GPU_B1_S128_*.txt\"), key=os.path.getmtime)\n", - "result_data = pandas.read_table(latest_result_file)\n", - "print(\"Float32 model perf results from\", latest_result_file)\n", - "# Remove some columns that have same values for all rows.\n", - "columns_to_remove = ['model', 'graph_optimization_level', 'batch_size', 'sequence_length', 'test_cases', 'test_times', 'use_gpu']\n", - "result_data.drop(columns_to_remove, axis=1, inplace=True)\n", - "result_data" + "def load_last_perf_test_result():\n", + " import os\n", + " import glob \n", + " import pandas\n", + " latest_result_file = max(glob.glob(\"./onnx/perf_results_*.txt\"), key=os.path.getmtime)\n", + " result_data = 
pandas.read_table(latest_result_file)\n", + " print(\"Perf results from\", latest_result_file)\n", + " # Do not show columns that have same values for all rows.\n", + " columns_to_remove = ['model', 'graph_optimization_level', 'batch_size', 'sequence_length', 'test_cases', 'test_times', 'use_gpu', 'use_io_binding', 'average_sequence_length', 'random_sequence_length']\n", + " result_data.drop(columns_to_remove, axis=1, inplace=True)\n", + " return result_data\n", + " \n", + "thread_results = load_last_perf_test_result()\n", + "thread_results" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "From above result, we can see that latency is very close for different settings. The default setting (intra_op_num_threads=0, OMP_NUM_THREADS and OMP_WAIT_POLICY does not exist) performs the best. \n", + "From above result, we can see that latency is very close for different settings of intra_op_num_threads.\n", "\n", "### Model Results Comparison Tool\n", "\n", @@ -891,21 +1063,21 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 18, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "100% passed for 100 random inputs given thresholds (rtol=0.01, atol=0.01).\r\n", - "maximum absolute difference=5.316734313964844e-05\r\n", - "maximum relative difference=0.00012461667938623577\r\n" + "100% passed for 100 random inputs given thresholds (rtol=0.01, atol=0.01).\n", + "maximum absolute difference=0.05149984359741211\n" ] } ], "source": [ - "!{sys.executable} -m onnxruntime.transformers.compare_bert_results --baseline_model $export_model_path --optimized_model $optimized_fp32_model_path --batch_size 1 --sequence_length 128 --samples 100 --rtol 0.01 --atol 0.01 $GPU_OPTION" + "USE_GPU = '--use_gpu' if use_gpu else ''\n", + "!{sys.executable} -m onnxruntime.transformers.compare_bert_results --baseline_model $export_model_path --optimized_model $optimized_fp32_model_path --batch_size 1 --sequence_length 128 --samples 100 --rtol 0.01 --atol 0.01 $USE_GPU" ] }, { @@ -916,80 +1088,106 @@ "\n", "The optimizer.py script have an option **--float16** to convert model to use float16 to store weights. After the conversion, it could be faster to run in GPU with tensor cores like V100 or T4.\n", "\n", - "Let's run tools to measure the performance on V100. The results show significant performance improvement: latency is about 3.4 ms for float32 model, and 1.8 ms for float16 model." + "Let's run tools to measure the performance on Nvidia RTX 4090. The results show significant performance improvement: latency is about 3.2 ms for float32 model, and about 1.8 ms for float16 model." 
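Editor's note: the same float16 conversion can also be done through the Python API of onnxruntime.transformers rather than the CLI; a sketch, assuming the notebook's export path and the standard BERT-base head and hidden sizes:

```python
from onnxruntime.transformers import optimizer

# Python-API counterpart of `optimizer.py --float16`; the input path mirrors the
# notebook's $export_model_path and is an assumption here.
opt_model = optimizer.optimize_model(
    "./onnx/bert-base-cased-squad.onnx",
    model_type="bert",
    num_heads=12,
    hidden_size=768,
    use_gpu=True,
)
opt_model.convert_float_to_float16()
opt_model.save_model_to_file("./onnx/bert-base-cased-squad_opt_gpu_fp16.onnx")
```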
] }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 19, "metadata": {}, "outputs": [ { - "name": "stdout", + "name": "stderr", "output_type": "stream", "text": [ - " apply: Fused LayerNormalization count: 49\n", - " apply: Fused Gelu count: 24\n", - "adjust_reshape_and_expand: Removed Reshape and Expand count: 0\n", - " apply: Fused SkipLayerNormalization count: 48\n", - " apply: Fused Attention count: 24\n", - " prune_graph: Graph pruned: 0 inputs, 0 outputs and 5 nodes are removed\n", - " apply: Fused EmbedLayerNormalization(with mask) count: 1\n", - " prune_graph: Graph pruned: 0 inputs, 0 outputs and 3 nodes are removed\n", - " prune_graph: Graph pruned: 0 inputs, 0 outputs and 0 nodes are removed\n", - " apply: Fused BiasGelu count: 24\n", - " apply: Fused SkipLayerNormalization(add bias) count: 48\n", + "\u001b\u0000[\u00000\u0000;\u00009\u00003\u0000m\u00002\u00000\u00002\u00003\u0000-\u00000\u00009\u0000-\u00001\u00002\u0000 \u00001\u00002\u0000:\u00005\u00007\u0000:\u00005\u00004\u0000.\u00005\u00005\u00000\u00008\u00002\u00002\u00008\u0000 \u0000[\u0000W\u0000:\u0000o\u0000n\u0000n\u0000x\u0000r\u0000u\u0000n\u0000t\u0000i\u0000m\u0000e\u0000:\u0000,\u0000 \u0000s\u0000e\u0000s\u0000s\u0000i\u0000o\u0000n\u0000_\u0000s\u0000t\u0000a\u0000t\u0000e\u0000.\u0000c\u0000c\u0000:\u00001\u00001\u00006\u00002\u0000 \u0000o\u0000n\u0000n\u0000x\u0000r\u0000u\u0000n\u0000t\u0000i\u0000m\u0000e\u0000:\u0000:\u0000V\u0000e\u0000r\u0000i\u0000f\u0000y\u0000E\u0000a\u0000c\u0000h\u0000N\u0000o\u0000d\u0000e\u0000I\u0000s\u0000A\u0000s\u0000s\u0000i\u0000g\u0000n\u0000e\u0000d\u0000T\u0000o\u0000A\u0000n\u0000E\u0000p\u0000]\u0000 \u0000S\u0000o\u0000m\u0000e\u0000 \u0000n\u0000o\u0000d\u0000e\u0000s\u0000 \u0000w\u0000e\u0000r\u0000e\u0000 \u0000n\u0000o\u0000t\u0000 \u0000a\u0000s\u0000s\u0000i\u0000g\u0000n\u0000e\u0000d\u0000 \u0000t\u0000o\u0000 \u0000t\u0000h\u0000e\u0000 \u0000p\u0000r\u0000e\u0000f\u0000e\u0000r\u0000r\u0000e\u0000d\u0000 \u0000e\u0000x\u0000e\u0000c\u0000u\u0000t\u0000i\u0000o\u0000n\u0000 \u0000p\u0000r\u0000o\u0000v\u0000i\u0000d\u0000e\u0000r\u0000s\u0000 \u0000w\u0000h\u0000i\u0000c\u0000h\u0000 \u0000m\u0000a\u0000y\u0000 \u0000o\u0000r\u0000 \u0000m\u0000a\u0000y\u0000 \u0000n\u0000o\u0000t\u0000 \u0000h\u0000a\u0000v\u0000e\u0000 \u0000a\u0000n\u0000 \u0000n\u0000e\u0000g\u0000a\u0000t\u0000i\u0000v\u0000e\u0000 \u0000i\u0000m\u0000p\u0000a\u0000c\u0000t\u0000 \u0000o\u0000n\u0000 \u0000p\u0000e\u0000r\u0000f\u0000o\u0000r\u0000m\u0000a\u0000n\u0000c\u0000e\u0000.\u0000 \u0000e\u0000.\u0000g\u0000.\u0000 \u0000O\u0000R\u0000T\u0000 \u0000e\u0000x\u0000p\u0000l\u0000i\u0000c\u0000i\u0000t\u0000l\u0000y\u0000 \u0000a\u0000s\u0000s\u0000i\u0000g\u0000n\u0000s\u0000 \u0000s\u0000h\u0000a\u0000p\u0000e\u0000 \u0000r\u0000e\u0000l\u0000a\u0000t\u0000e\u0000d\u0000 \u0000o\u0000p\u0000s\u0000 \u0000t\u0000o\u0000 \u0000C\u0000P\u0000U\u0000 \u0000t\u0000o\u0000 \u0000i\u0000m\u0000p\u0000r\u0000o\u0000v\u0000e\u0000 \u0000p\u0000e\u0000r\u0000f\u0000.\u0000\u001b\u0000[\u0000m\u0000\n", + "\u0000\u001b\u0000[\u00000\u0000;\u00009\u00003\u0000m\u00002\u00000\u00002\u00003\u0000-\u00000\u00009\u0000-\u00001\u00002\u0000 \u00001\u00002\u0000:\u00005\u00007\u0000:\u00005\u00004\u0000.\u00005\u00005\u00001\u00001\u00000\u00000\u00008\u0000 \u0000[\u0000W\u0000:\u0000o\u0000n\u0000n\u0000x\u0000r\u0000u\u0000n\u0000t\u0000i\u0000m\u0000e\u0000:\u0000,\u0000 
\u0000s\u0000e\u0000s\u0000s\u0000i\u0000o\u0000n\u0000_\u0000s\u0000t\u0000a\u0000t\u0000e\u0000.\u0000c\u0000c\u0000:\u00001\u00001\u00006\u00004\u0000 \u0000o\u0000n\u0000n\u0000x\u0000r\u0000u\u0000n\u0000t\u0000i\u0000m\u0000e\u0000:\u0000:\u0000V\u0000e\u0000r\u0000i\u0000f\u0000y\u0000E\u0000a\u0000c\u0000h\u0000N\u0000o\u0000d\u0000e\u0000I\u0000s\u0000A\u0000s\u0000s\u0000i\u0000g\u0000n\u0000e\u0000d\u0000T\u0000o\u0000A\u0000n\u0000E\u0000p\u0000]\u0000 \u0000R\u0000e\u0000r\u0000u\u0000n\u0000n\u0000i\u0000n\u0000g\u0000 \u0000w\u0000i\u0000t\u0000h\u0000 \u0000v\u0000e\u0000r\u0000b\u0000o\u0000s\u0000e\u0000 \u0000o\u0000u\u0000t\u0000p\u0000u\u0000t\u0000 \u0000o\u0000n\u0000 \u0000a\u0000 \u0000n\u0000o\u0000n\u0000-\u0000m\u0000i\u0000n\u0000i\u0000m\u0000a\u0000l\u0000 \u0000b\u0000u\u0000i\u0000l\u0000d\u0000 \u0000w\u0000i\u0000l\u0000l\u0000 \u0000s\u0000h\u0000o\u0000w\u0000 \u0000n\u0000o\u0000d\u0000e\u0000 \u0000a\u0000s\u0000s\u0000i\u0000g\u0000n\u0000m\u0000e\u0000n\u0000t\u0000s\u0000.\u0000\u001b\u0000[\u0000m\u0000\n", + "\u0000 apply: Fused LayerNormalization: 49\n", + " apply: Fused Gelu: 24\n", + " apply: Fused SkipLayerNormalization: 48\n", + " apply: Fused Attention: 24\n", + " prune_graph: Removed 5 nodes\n", + " apply: Fused EmbedLayerNormalization(with mask): 1\n", + " prune_graph: Removed 10 nodes\n", + " apply: Fused BiasGelu: 24\n", + " apply: Fused SkipLayerNormalization(add bias): 48\n", " optimize: opset version: 11\n", + "get_fused_operator_statistics: Optimized operators:{'EmbedLayerNormalization': 1, 'Attention': 24, 'MultiHeadAttention': 0, 'Gelu': 0, 'FastGelu': 0, 'BiasGelu': 24, 'GemmFastGelu': 0, 'LayerNormalization': 0, 'SkipLayerNormalization': 48, 'QOrderedAttention': 0, 'QOrderedGelu': 0, 'QOrderedLayerNormalization': 0, 'QOrderedMatMul': 0}\n", + " main: The model has been fully optimized.\n", " save_model_to_file: Sort graphs in topological order\n", - " save_model_to_file: Output model to ./onnx/bert-base-cased-squad_opt_gpu_fp16.onnx\n", - "get_fused_operator_statistics: Optimized operators:{'EmbedLayerNormalization': 1, 'Attention': 24, 'Gelu': 0, 'FastGelu': 0, 'BiasGelu': 24, 'LayerNormalization': 0, 'SkipLayerNormalization': 48}\n", - " main: The model has been fully optimized.\n" + " save_model_to_file: Model saved to ./onnx/bert-base-cased-squad_opt_gpu_fp16.onnx\n" ] } ], "source": [ "optimized_fp16_model_path = './onnx/bert-base-cased-squad_opt_{}_fp16.onnx'.format('gpu' if use_gpu else 'cpu')\n", - "!{sys.executable} -m onnxruntime.transformers.optimizer --input $export_model_path --output $optimized_fp16_model_path --float16" + "!{sys.executable} -m onnxruntime.transformers.optimizer --input $export_model_path --output $optimized_fp16_model_path --float16 $USE_GPU" ] }, { "cell_type": "code", - "execution_count": 21, + "execution_count": 20, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "test setting TestSetting(batch_size=1, sequence_length=128, test_cases=1000, test_times=1, use_gpu=True, intra_op_num_threads=None, seed=3, verbose=False)\n", + "Running test: model=bert-base-cased-squad_opt_gpu_fp16.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=32,batch_size=1,sequence_length=128,test_cases=1000,test_times=1,use_gpu=True,use_io_binding=True,average_sequence_length=128,random_sequence_length=False\n", + "Average latency = 1.77 ms, Throughput = 566.45 QPS\n", + "Running test: 
model=bert-base-cased-squad_opt_gpu_fp16.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=24,batch_size=1,sequence_length=128,test_cases=1000,test_times=1,use_gpu=True,use_io_binding=True,average_sequence_length=128,random_sequence_length=False\n", + "Average latency = 1.74 ms, Throughput = 574.96 QPS\n", + "Running test: model=bert-base-cased-squad_opt_gpu_fp16.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=15,batch_size=1,sequence_length=128,test_cases=1000,test_times=1,use_gpu=True,use_io_binding=True,average_sequence_length=128,random_sequence_length=False\n", + "Average latency = 1.74 ms, Throughput = 574.28 QPS\n", + "Running test: model=bert-base-cased-squad_opt_gpu_fp16.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=14,batch_size=1,sequence_length=128,test_cases=1000,test_times=1,use_gpu=True,use_io_binding=True,average_sequence_length=128,random_sequence_length=False\n", + "Average latency = 1.74 ms, Throughput = 575.17 QPS\n", + "Running test: model=bert-base-cased-squad_opt_gpu_fp16.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=13,batch_size=1,sequence_length=128,test_cases=1000,test_times=1,use_gpu=True,use_io_binding=True,average_sequence_length=128,random_sequence_length=False\n", + "Average latency = 1.76 ms, Throughput = 569.77 QPS\n", + "Running test: model=bert-base-cased-squad_opt_gpu_fp16.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=12,batch_size=1,sequence_length=128,test_cases=1000,test_times=1,use_gpu=True,use_io_binding=True,average_sequence_length=128,random_sequence_length=False\n", + "Average latency = 1.79 ms, Throughput = 559.84 QPS\n", + "Running test: model=bert-base-cased-squad_opt_gpu_fp16.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=11,batch_size=1,sequence_length=128,test_cases=1000,test_times=1,use_gpu=True,use_io_binding=True,average_sequence_length=128,random_sequence_length=False\n", + "Average latency = 1.77 ms, Throughput = 566.09 QPS\n", + "Running test: model=bert-base-cased-squad_opt_gpu_fp16.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=10,batch_size=1,sequence_length=128,test_cases=1000,test_times=1,use_gpu=True,use_io_binding=True,average_sequence_length=128,random_sequence_length=False\n", + "Average latency = 1.77 ms, Throughput = 563.97 QPS\n", + "Running test: model=bert-base-cased-squad_opt_gpu_fp16.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=9,batch_size=1,sequence_length=128,test_cases=1000,test_times=1,use_gpu=True,use_io_binding=True,average_sequence_length=128,random_sequence_length=False\n", + "Average latency = 1.77 ms, Throughput = 565.70 QPS\n", + "Running test: model=bert-base-cased-squad_opt_gpu_fp16.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=8,batch_size=1,sequence_length=128,test_cases=1000,test_times=1,use_gpu=True,use_io_binding=True,average_sequence_length=128,random_sequence_length=False\n", + "Average latency = 1.77 ms, Throughput = 565.50 QPS\n", + "Running test: model=bert-base-cased-squad_opt_gpu_fp16.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=7,batch_size=1,sequence_length=128,test_cases=1000,test_times=1,use_gpu=True,use_io_binding=True,average_sequence_length=128,random_sequence_length=False\n", + "Average latency = 1.77 ms, Throughput = 566.38 QPS\n", + "Running test: 
model=bert-base-cased-squad_opt_gpu_fp16.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=6,batch_size=1,sequence_length=128,test_cases=1000,test_times=1,use_gpu=True,use_io_binding=True,average_sequence_length=128,random_sequence_length=False\n", + "Average latency = 1.75 ms, Throughput = 572.89 QPS\n", + "Running test: model=bert-base-cased-squad_opt_gpu_fp16.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=5,batch_size=1,sequence_length=128,test_cases=1000,test_times=1,use_gpu=True,use_io_binding=True,average_sequence_length=128,random_sequence_length=False\n", + "Average latency = 1.76 ms, Throughput = 568.67 QPS\n", + "Running test: model=bert-base-cased-squad_opt_gpu_fp16.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=4,batch_size=1,sequence_length=128,test_cases=1000,test_times=1,use_gpu=True,use_io_binding=True,average_sequence_length=128,random_sequence_length=False\n", + "Average latency = 1.78 ms, Throughput = 561.98 QPS\n", + "Running test: model=bert-base-cased-squad_opt_gpu_fp16.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=3,batch_size=1,sequence_length=128,test_cases=1000,test_times=1,use_gpu=True,use_io_binding=True,average_sequence_length=128,random_sequence_length=False\n", + "Average latency = 1.77 ms, Throughput = 566.14 QPS\n", + "Running test: model=bert-base-cased-squad_opt_gpu_fp16.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=2,batch_size=1,sequence_length=128,test_cases=1000,test_times=1,use_gpu=True,use_io_binding=True,average_sequence_length=128,random_sequence_length=False\n", + "Average latency = 1.78 ms, Throughput = 563.25 QPS\n", + "Running test: model=bert-base-cased-squad_opt_gpu_fp16.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=1,batch_size=1,sequence_length=128,test_cases=1000,test_times=1,use_gpu=True,use_io_binding=True,average_sequence_length=128,random_sequence_length=False\n", + "Average latency = 1.77 ms, Throughput = 565.09 QPS\n", + "test setting TestSetting(batch_size=1, sequence_length=128, test_cases=1000, test_times=1, use_gpu=True, use_io_binding=True, provider=None, intra_op_num_threads=None, seed=3, verbose=False, log_severity=2, average_sequence_length=128, random_sequence_length=False)\n", "Generating 1000 samples for batch_size=1 sequence_length=128\n", - "Running test: model=bert-base-cased-squad_opt_gpu_fp16.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=4,batch_size=1,sequence_length=128,test_cases=1000,test_times=1,use_gpu=True\n", - "Average latency = 6.78 ms, Throughput = 147.54 QPS\n", - "Running test: model=bert-base-cased-squad_opt_gpu_fp16.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=3,batch_size=1,sequence_length=128,test_cases=1000,test_times=1,use_gpu=True\n", - "Average latency = 6.76 ms, Throughput = 147.85 QPS\n", - "Running test: model=bert-base-cased-squad_opt_gpu_fp16.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=2,batch_size=1,sequence_length=128,test_cases=1000,test_times=1,use_gpu=True\n", - "Average latency = 6.79 ms, Throughput = 147.30 QPS\n", - "Running test: model=bert-base-cased-squad_opt_gpu_fp16.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=1,batch_size=1,sequence_length=128,test_cases=1000,test_times=1,use_gpu=True\n", - "Average latency = 6.81 ms, Throughput = 146.75 QPS\n", - "Test summary is saved to onnx/perf_results_GPU_B1_S128_20210714-002224.txt\n" + "Test summary is saved to onnx\\perf_results_GPU_B1_S128_20230912-130021.txt\n" ] } ], "source": 
[ - "GPU_OPTION = '--use_gpu' if use_gpu else ''\n", + "GPU_OPTION = '--use_gpu --use_io_binding' if use_gpu else ''\n", "!python -m onnxruntime.transformers.bert_perf_test --model $optimized_fp16_model_path --batch_size 1 --sequence_length 128 --samples 1000 --test_times 1 $GPU_OPTION" ] }, { "cell_type": "code", - "execution_count": 22, + "execution_count": 21, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "Float32 model perf results from ./onnx/perf_results_GPU_B1_S128_20210714-002224.txt\n" + "Perf results from ./onnx\\perf_results_GPU_B1_S128_20230912-130021.txt\n" ] }, { @@ -1026,82 +1224,243 @@ " \n", " \n", " 0\n", - " 6.76\n", - " 6.79\n", - " 6.81\n", - " 6.90\n", - " 6.91\n", - " 7.00\n", - " 147.85\n", - " 3\n", + " 1.74\n", + " 1.72\n", + " 1.72\n", + " 1.75\n", + " 1.80\n", + " 2.17\n", + " 575.17\n", + " 14\n", " \n", " \n", " 1\n", - " 6.78\n", - " 6.70\n", - " 6.79\n", - " 6.87\n", - " 6.90\n", - " 7.63\n", - " 147.54\n", - " 4\n", + " 1.74\n", + " 1.73\n", + " 1.73\n", + " 1.75\n", + " 1.76\n", + " 2.14\n", + " 574.96\n", + " 24\n", " \n", " \n", " 2\n", - " 6.79\n", - " 6.79\n", - " 6.81\n", - " 6.89\n", - " 6.91\n", - " 7.19\n", - " 147.30\n", - " 2\n", + " 1.74\n", + " 1.72\n", + " 1.73\n", + " 1.76\n", + " 1.79\n", + " 2.16\n", + " 574.28\n", + " 15\n", " \n", " \n", " 3\n", - " 6.81\n", - " 6.80\n", - " 6.89\n", - " 6.91\n", - " 6.97\n", - " 7.20\n", - " 146.75\n", + " 1.75\n", + " 1.72\n", + " 1.72\n", + " 1.76\n", + " 2.02\n", + " 2.15\n", + " 572.89\n", + " 6\n", + " \n", + " \n", + " 4\n", + " 1.76\n", + " 1.74\n", + " 1.74\n", + " 1.76\n", + " 1.81\n", + " 2.14\n", + " 569.77\n", + " 13\n", + " \n", + " \n", + " 5\n", + " 1.76\n", + " 1.72\n", + " 1.73\n", + " 1.80\n", + " 2.08\n", + " 2.15\n", + " 568.67\n", + " 5\n", + " \n", + " \n", + " 6\n", + " 1.77\n", + " 1.73\n", + " 1.74\n", + " 1.81\n", + " 2.12\n", + " 2.19\n", + " 566.45\n", + " 32\n", + " \n", + " \n", + " 7\n", + " 1.77\n", + " 1.74\n", + " 1.74\n", + " 1.77\n", + " 2.06\n", + " 2.17\n", + " 566.38\n", + " 7\n", + " \n", + " \n", + " 8\n", + " 1.77\n", + " 1.73\n", + " 1.74\n", + " 1.81\n", + " 2.10\n", + " 2.18\n", + " 566.14\n", + " 3\n", + " \n", + " \n", + " 9\n", + " 1.77\n", + " 1.73\n", + " 1.74\n", + " 1.82\n", + " 2.07\n", + " 2.17\n", + " 566.09\n", + " 11\n", + " \n", + " \n", + " 10\n", + " 1.77\n", + " 1.74\n", + " 1.75\n", + " 1.78\n", + " 2.02\n", + " 2.13\n", + " 565.70\n", + " 9\n", + " \n", + " \n", + " 11\n", + " 1.77\n", + " 1.73\n", + " 1.74\n", + " 1.93\n", + " 2.06\n", + " 2.16\n", + " 565.50\n", + " 8\n", + " \n", + " \n", + " 12\n", + " 1.77\n", + " 1.73\n", + " 1.74\n", + " 1.81\n", + " 2.11\n", + " 2.20\n", + " 565.09\n", " 1\n", " \n", + " \n", + " 13\n", + " 1.77\n", + " 1.74\n", + " 1.75\n", + " 1.85\n", + " 2.06\n", + " 2.15\n", + " 563.97\n", + " 10\n", + " \n", + " \n", + " 14\n", + " 1.78\n", + " 1.73\n", + " 1.74\n", + " 1.93\n", + " 2.13\n", + " 2.19\n", + " 563.25\n", + " 2\n", + " \n", + " \n", + " 15\n", + " 1.78\n", + " 1.74\n", + " 1.75\n", + " 1.88\n", + " 2.10\n", + " 2.19\n", + " 561.98\n", + " 4\n", + " \n", + " \n", + " 16\n", + " 1.79\n", + " 1.75\n", + " 1.76\n", + " 1.99\n", + " 2.08\n", + " 2.16\n", + " 559.84\n", + " 12\n", + " \n", " \n", "\n", "" ], "text/plain": [ - " Latency(ms) Latency_P50 Latency_P75 Latency_P90 Latency_P95 \\\n", - "0 6.76 6.79 6.81 6.90 6.91 \n", - "1 6.78 6.70 6.79 6.87 6.90 \n", - "2 6.79 6.79 6.81 6.89 6.91 \n", - "3 6.81 6.80 6.89 6.91 6.97 \n", + " Latency(ms) Latency_P50 
Latency_P75 Latency_P90 Latency_P95 \\\n", + "0 1.74 1.72 1.72 1.75 1.80 \n", + "1 1.74 1.73 1.73 1.75 1.76 \n", + "2 1.74 1.72 1.73 1.76 1.79 \n", + "3 1.75 1.72 1.72 1.76 2.02 \n", + "4 1.76 1.74 1.74 1.76 1.81 \n", + "5 1.76 1.72 1.73 1.80 2.08 \n", + "6 1.77 1.73 1.74 1.81 2.12 \n", + "7 1.77 1.74 1.74 1.77 2.06 \n", + "8 1.77 1.73 1.74 1.81 2.10 \n", + "9 1.77 1.73 1.74 1.82 2.07 \n", + "10 1.77 1.74 1.75 1.78 2.02 \n", + "11 1.77 1.73 1.74 1.93 2.06 \n", + "12 1.77 1.73 1.74 1.81 2.11 \n", + "13 1.77 1.74 1.75 1.85 2.06 \n", + "14 1.78 1.73 1.74 1.93 2.13 \n", + "15 1.78 1.74 1.75 1.88 2.10 \n", + "16 1.79 1.75 1.76 1.99 2.08 \n", "\n", - " Latency_P99 Throughput(QPS) intra_op_num_threads \n", - "0 7.00 147.85 3 \n", - "1 7.63 147.54 4 \n", - "2 7.19 147.30 2 \n", - "3 7.20 146.75 1 " + " Latency_P99 Throughput(QPS) intra_op_num_threads \n", + "0 2.17 575.17 14 \n", + "1 2.14 574.96 24 \n", + "2 2.16 574.28 15 \n", + "3 2.15 572.89 6 \n", + "4 2.14 569.77 13 \n", + "5 2.15 568.67 5 \n", + "6 2.19 566.45 32 \n", + "7 2.17 566.38 7 \n", + "8 2.18 566.14 3 \n", + "9 2.17 566.09 11 \n", + "10 2.13 565.70 9 \n", + "11 2.16 565.50 8 \n", + "12 2.20 565.09 1 \n", + "13 2.15 563.97 10 \n", + "14 2.19 563.25 2 \n", + "15 2.19 561.98 4 \n", + "16 2.16 559.84 12 " ] }, - "execution_count": 22, + "execution_count": 21, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "import os\n", - "import glob \n", - "import pandas\n", - "latest_result_file = max(glob.glob(\"./onnx/perf_results_GPU_B1_S128_*.txt\"), key=os.path.getmtime)\n", - "result_data = pandas.read_table(latest_result_file)\n", - "print(\"Float32 model perf results from\", latest_result_file)\n", - "# Remove some columns that have same values for all rows.\n", - "columns_to_remove = ['model', 'graph_optimization_level', 'batch_size', 'sequence_length', 'test_cases', 'test_times', 'use_gpu']\n", - "result_data.drop(columns_to_remove, axis=1, inplace=True)\n", - "result_data" + "fp32_result = load_last_perf_test_result()\n", + "fp32_result" ] }, { @@ -1117,59 +1476,265 @@ }, { "cell_type": "code", - "execution_count": 23, + "execution_count": 22, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "test setting TestSetting(batch_size=32, sequence_length=128, test_cases=1000, test_times=1, use_gpu=True, intra_op_num_threads=3, seed=3, verbose=False)\n", + "Running test: model=bert-base-cased-squad_opt_gpu_fp16.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=8,batch_size=32,sequence_length=128,test_cases=1000,test_times=1,use_gpu=True,use_io_binding=True,average_sequence_length=128,random_sequence_length=False\n", + "Average latency = 20.41 ms, Throughput = 1567.65 QPS\n", + "Running test: model=bert-base-cased-squad_opt_gpu_fp16.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=8,batch_size=1,sequence_length=128,test_cases=1000,test_times=1,use_gpu=True,use_io_binding=True,average_sequence_length=128,random_sequence_length=False\n", + "Average latency = 1.73 ms, Throughput = 576.74 QPS\n", + "Running test: model=bert-base-cased-squad_opt_gpu_fp16.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=8,batch_size=2,sequence_length=128,test_cases=1000,test_times=1,use_gpu=True,use_io_binding=True,average_sequence_length=128,random_sequence_length=False\n", + "Average latency = 2.18 ms, Throughput = 917.92 QPS\n", + "Running test: 
model=bert-base-cased-squad_opt_gpu_fp16.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=8,batch_size=4,sequence_length=128,test_cases=1000,test_times=1,use_gpu=True,use_io_binding=True,average_sequence_length=128,random_sequence_length=False\n", + "Average latency = 3.25 ms, Throughput = 1229.91 QPS\n", + "Running test: model=bert-base-cased-squad_opt_gpu_fp16.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=8,batch_size=8,sequence_length=128,test_cases=1000,test_times=1,use_gpu=True,use_io_binding=True,average_sequence_length=128,random_sequence_length=False\n", + "Average latency = 5.38 ms, Throughput = 1486.89 QPS\n", + "Running test: model=bert-base-cased-squad_opt_gpu_fp16.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=8,batch_size=16,sequence_length=128,test_cases=1000,test_times=1,use_gpu=True,use_io_binding=True,average_sequence_length=128,random_sequence_length=False\n", + "Average latency = 9.90 ms, Throughput = 1616.79 QPS\n", + "test setting TestSetting(batch_size=32, sequence_length=128, test_cases=1000, test_times=1, use_gpu=True, use_io_binding=True, provider=None, intra_op_num_threads=8, seed=3, verbose=False, log_severity=2, average_sequence_length=128, random_sequence_length=False)\n", "Generating 1000 samples for batch_size=32 sequence_length=128\n", - "Running test: model=bert-base-cased-squad_opt_gpu_fp16.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=3,batch_size=32,sequence_length=128,test_cases=1000,test_times=1,use_gpu=True\n", - "Average latency = 168.40 ms, Throughput = 190.02 QPS\n", - "test setting TestSetting(batch_size=1, sequence_length=128, test_cases=1000, test_times=1, use_gpu=True, intra_op_num_threads=3, seed=3, verbose=False)\n", + "test setting TestSetting(batch_size=1, sequence_length=128, test_cases=1000, test_times=1, use_gpu=True, use_io_binding=True, provider=None, intra_op_num_threads=8, seed=3, verbose=False, log_severity=2, average_sequence_length=128, random_sequence_length=False)\n", "Generating 1000 samples for batch_size=1 sequence_length=128\n", - "Running test: model=bert-base-cased-squad_opt_gpu_fp16.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=3,batch_size=1,sequence_length=128,test_cases=1000,test_times=1,use_gpu=True\n", - "Average latency = 7.14 ms, Throughput = 140.00 QPS\n", - "test setting TestSetting(batch_size=2, sequence_length=128, test_cases=1000, test_times=1, use_gpu=True, intra_op_num_threads=3, seed=3, verbose=False)\n", + "test setting TestSetting(batch_size=2, sequence_length=128, test_cases=1000, test_times=1, use_gpu=True, use_io_binding=True, provider=None, intra_op_num_threads=8, seed=3, verbose=False, log_severity=2, average_sequence_length=128, random_sequence_length=False)\n", "Generating 1000 samples for batch_size=2 sequence_length=128\n", - "Running test: model=bert-base-cased-squad_opt_gpu_fp16.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=3,batch_size=2,sequence_length=128,test_cases=1000,test_times=1,use_gpu=True\n", - "Average latency = 11.27 ms, Throughput = 177.41 QPS\n", - "test setting TestSetting(batch_size=4, sequence_length=128, test_cases=1000, test_times=1, use_gpu=True, intra_op_num_threads=3, seed=3, verbose=False)\n", + "test setting TestSetting(batch_size=4, sequence_length=128, test_cases=1000, test_times=1, use_gpu=True, use_io_binding=True, provider=None, intra_op_num_threads=8, seed=3, verbose=False, log_severity=2, average_sequence_length=128, random_sequence_length=False)\n", "Generating 1000 
samples for batch_size=4 sequence_length=128\n", - "Running test: model=bert-base-cased-squad_opt_gpu_fp16.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=3,batch_size=4,sequence_length=128,test_cases=1000,test_times=1,use_gpu=True\n", - "Average latency = 21.15 ms, Throughput = 189.09 QPS\n", - "test setting TestSetting(batch_size=8, sequence_length=128, test_cases=1000, test_times=1, use_gpu=True, intra_op_num_threads=3, seed=3, verbose=False)\n", + "test setting TestSetting(batch_size=8, sequence_length=128, test_cases=1000, test_times=1, use_gpu=True, use_io_binding=True, provider=None, intra_op_num_threads=8, seed=3, verbose=False, log_severity=2, average_sequence_length=128, random_sequence_length=False)\n", "Generating 1000 samples for batch_size=8 sequence_length=128\n", - "Running test: model=bert-base-cased-squad_opt_gpu_fp16.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=3,batch_size=8,sequence_length=128,test_cases=1000,test_times=1,use_gpu=True\n", - "Average latency = 42.27 ms, Throughput = 189.27 QPS\n", - "test setting TestSetting(batch_size=16, sequence_length=128, test_cases=1000, test_times=1, use_gpu=True, intra_op_num_threads=3, seed=3, verbose=False)\n", + "test setting TestSetting(batch_size=16, sequence_length=128, test_cases=1000, test_times=1, use_gpu=True, use_io_binding=True, provider=None, intra_op_num_threads=8, seed=3, verbose=False, log_severity=2, average_sequence_length=128, random_sequence_length=False)\n", "Generating 1000 samples for batch_size=16 sequence_length=128\n", - "Running test: model=bert-base-cased-squad_opt_gpu_fp16.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=3,batch_size=16,sequence_length=128,test_cases=1000,test_times=1,use_gpu=True\n", - "Average latency = 83.77 ms, Throughput = 191.01 QPS\n", - "Test summary is saved to onnx/perf_results_GPU_B1-2-4-8-16-32_S128_20210714-002816.txt\n" + "Test summary is saved to onnx\\perf_results_GPU_B1-2-4-8-16-32_S128_20230912-130248.txt\n" ] } ], "source": [ - "GPU_OPTION = '--use_gpu' if use_gpu else ''\n", - "THREAD_SETTING = '--intra_op_num_threads 3'\n", + "THREAD_SETTING = '--intra_op_num_threads 8'\n", "!{sys.executable} -m onnxruntime.transformers.bert_perf_test --model $optimized_fp16_model_path --batch_size 1 2 4 8 16 32 --sequence_length 128 --samples 1000 --test_times 1 $THREAD_SETTING $GPU_OPTION" ] }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Perf results from ./onnx\\perf_results_GPU_B1-2-4-8-16-32_S128_20230912-130248.txt\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Latency(ms)Latency_P50Latency_P75Latency_P90Latency_P95Latency_P99Throughput(QPS)intra_op_num_threads
01.731.721.731.731.792.04576.748
12.182.162.162.182.292.76917.928
23.253.253.263.283.293.431229.918
35.385.385.395.425.445.601486.898
49.909.899.949.9710.0010.061616.798
520.4120.4120.4720.5220.5520.681567.658
\n", + "
" + ], + "text/plain": [ + " Latency(ms) Latency_P50 Latency_P75 Latency_P90 Latency_P95 \\\n", + "0 1.73 1.72 1.73 1.73 1.79 \n", + "1 2.18 2.16 2.16 2.18 2.29 \n", + "2 3.25 3.25 3.26 3.28 3.29 \n", + "3 5.38 5.38 5.39 5.42 5.44 \n", + "4 9.90 9.89 9.94 9.97 10.00 \n", + "5 20.41 20.41 20.47 20.52 20.55 \n", + "\n", + " Latency_P99 Throughput(QPS) intra_op_num_threads \n", + "0 2.04 576.74 8 \n", + "1 2.76 917.92 8 \n", + "2 3.43 1229.91 8 \n", + "3 5.60 1486.89 8 \n", + "4 10.06 1616.79 8 \n", + "5 20.68 1567.65 8 " + ] + }, + "execution_count": 23, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "fp16_result = load_last_perf_test_result()\n", + "fp16_result" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Packing Mode (Effective Transformer)\n", + "\n", + "When padding ratio is high, it is helpful to use packing mode, also known as [effective transformer](https://github.com/bytedance/effective_transformer).\n", + "This feature requires onnxruntime-gpu verison 1.16 or later. \n", + "\n", + "In below example, average sequence length after removing paddings is 32, the sequence length with paddings is 128. We can see 3x throughput with packing mode (QPS increased from 1617 to 5652)." + ] + }, { "cell_type": "code", "execution_count": 24, - "metadata": { - "scrolled": false - }, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "_replace_attention_with_packing_attention: Converted 24 Attention nodes to PackedAttention.\n", + " save_model_to_file: Sort graphs in topological order\n", + " save: Delete the existing onnx file: ./onnx/bert-base-cased-squad_opt_gpu_fp16_packed.onnx\n", + " save: Delete the existing external data file: ./onnx/bert-base-cased-squad_opt_gpu_fp16_packed.onnx.data\n", + " save_model_to_file: Model saved to ./onnx/bert-base-cased-squad_opt_gpu_fp16_packed.onnx\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Running test: model=bert-base-cased-squad_opt_gpu_fp16_packed.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=8,batch_size=32,sequence_length=128,test_cases=1000,test_times=1,use_gpu=True,use_io_binding=True,average_sequence_length=32,random_sequence_length=False\n", + "Average latency = 5.66 ms, Throughput = 5652.40 QPS\n", + "Running test: model=bert-base-cased-squad_opt_gpu_fp16_packed.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=8,batch_size=1,sequence_length=128,test_cases=1000,test_times=1,use_gpu=True,use_io_binding=True,average_sequence_length=32,random_sequence_length=False\n", + "Average latency = 1.70 ms, Throughput = 586.97 QPS\n", + "Running test: model=bert-base-cased-squad_opt_gpu_fp16_packed.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=8,batch_size=2,sequence_length=128,test_cases=1000,test_times=1,use_gpu=True,use_io_binding=True,average_sequence_length=32,random_sequence_length=False\n", + "Average latency = 1.79 ms, Throughput = 1114.37 QPS\n", + "Running test: model=bert-base-cased-squad_opt_gpu_fp16_packed.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=8,batch_size=4,sequence_length=128,test_cases=1000,test_times=1,use_gpu=True,use_io_binding=True,average_sequence_length=32,random_sequence_length=False\n", + "Average latency = 1.77 ms, Throughput = 2262.31 QPS\n", + "Running test: 
model=bert-base-cased-squad_opt_gpu_fp16_packed.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=8,batch_size=8,sequence_length=128,test_cases=1000,test_times=1,use_gpu=True,use_io_binding=True,average_sequence_length=32,random_sequence_length=False\n", + "Average latency = 2.18 ms, Throughput = 3666.45 QPS\n", + "Running test: model=bert-base-cased-squad_opt_gpu_fp16_packed.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=8,batch_size=16,sequence_length=128,test_cases=1000,test_times=1,use_gpu=True,use_io_binding=True,average_sequence_length=32,random_sequence_length=False\n", + "Average latency = 3.31 ms, Throughput = 4829.58 QPS\n", + "test setting TestSetting(batch_size=32, sequence_length=128, test_cases=1000, test_times=1, use_gpu=True, use_io_binding=True, provider=None, intra_op_num_threads=8, seed=3, verbose=False, log_severity=2, average_sequence_length=32, random_sequence_length=False)\n", + "Generating 1000 samples for batch_size=32 sequence_length=128\n", + "test setting TestSetting(batch_size=1, sequence_length=128, test_cases=1000, test_times=1, use_gpu=True, use_io_binding=True, provider=None, intra_op_num_threads=8, seed=3, verbose=False, log_severity=2, average_sequence_length=32, random_sequence_length=False)\n", + "Generating 1000 samples for batch_size=1 sequence_length=128\n", + "test setting TestSetting(batch_size=2, sequence_length=128, test_cases=1000, test_times=1, use_gpu=True, use_io_binding=True, provider=None, intra_op_num_threads=8, seed=3, verbose=False, log_severity=2, average_sequence_length=32, random_sequence_length=False)\n", + "Generating 1000 samples for batch_size=2 sequence_length=128\n", + "test setting TestSetting(batch_size=4, sequence_length=128, test_cases=1000, test_times=1, use_gpu=True, use_io_binding=True, provider=None, intra_op_num_threads=8, seed=3, verbose=False, log_severity=2, average_sequence_length=32, random_sequence_length=False)\n", + "Generating 1000 samples for batch_size=4 sequence_length=128\n", + "test setting TestSetting(batch_size=8, sequence_length=128, test_cases=1000, test_times=1, use_gpu=True, use_io_binding=True, provider=None, intra_op_num_threads=8, seed=3, verbose=False, log_severity=2, average_sequence_length=32, random_sequence_length=False)\n", + "Generating 1000 samples for batch_size=8 sequence_length=128\n", + "test setting TestSetting(batch_size=16, sequence_length=128, test_cases=1000, test_times=1, use_gpu=True, use_io_binding=True, provider=None, intra_op_num_threads=8, seed=3, verbose=False, log_severity=2, average_sequence_length=32, random_sequence_length=False)\n", + "Generating 1000 samples for batch_size=16 sequence_length=128\n", + "Test summary is saved to onnx\\perf_results_GPU_B1-2-4-8-16-32_S128_20230912-130354.txt\n" + ] + } + ], + "source": [ + "assert use_gpu, \"Require GPU for packing mode\"\n", + "packed_fp16_model_path = './onnx/bert-base-cased-squad_opt_gpu_fp16_packed.onnx'\n", + "!{sys.executable} -m onnxruntime.transformers.convert_to_packing_mode --input $optimized_fp16_model_path --output $packed_fp16_model_path --use_external_data_format\n", + "!{sys.executable} -m onnxruntime.transformers.bert_perf_test --model $packed_fp16_model_path --batch_size 1 2 4 8 16 32 --sequence_length 128 --average_sequence_length 32 --samples 1000 --test_times 1 $THREAD_SETTING $GPU_OPTION " + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "Float16 model summary from 
./onnx/perf_results_GPU_B1-2-4-8-16-32_S128_20210714-002816.txt\n" + "Perf results from ./onnx\\perf_results_GPU_B1-2-4-8-16-32_S128_20230912-130354.txt\n" ] }, { @@ -1200,75 +1765,75 @@ " Latency_P95\n", " Latency_P99\n", " Throughput(QPS)\n", - " batch_size\n", + " intra_op_num_threads\n", " \n", " \n", " \n", " \n", " 0\n", - " 7.14\n", - " 7.10\n", - " 7.13\n", - " 7.25\n", - " 7.35\n", - " 10.99\n", - " 140.00\n", - " 1\n", + " 1.70\n", + " 1.63\n", + " 1.65\n", + " 2.13\n", + " 2.20\n", + " 2.32\n", + " 586.97\n", + " 8\n", " \n", " \n", " 1\n", - " 11.27\n", - " 11.23\n", - " 11.28\n", - " 11.53\n", - " 11.57\n", - " 12.05\n", - " 177.41\n", - " 2\n", + " 1.77\n", + " 1.74\n", + " 1.76\n", + " 1.82\n", + " 1.93\n", + " 2.17\n", + " 2262.31\n", + " 8\n", " \n", " \n", " 2\n", - " 21.15\n", - " 21.13\n", - " 21.25\n", - " 21.44\n", - " 21.59\n", - " 22.07\n", - " 189.09\n", - " 4\n", + " 1.79\n", + " 1.73\n", + " 1.74\n", + " 2.12\n", + " 2.18\n", + " 2.32\n", + " 1114.37\n", + " 8\n", " \n", " \n", " 3\n", - " 42.27\n", - " 42.26\n", - " 42.68\n", - " 42.95\n", - " 43.11\n", - " 45.11\n", - " 189.27\n", + " 2.18\n", + " 2.16\n", + " 2.17\n", + " 2.22\n", + " 2.30\n", + " 2.64\n", + " 3666.45\n", " 8\n", " \n", " \n", " 4\n", - " 83.77\n", - " 83.84\n", - " 84.29\n", - " 84.94\n", - " 85.35\n", - " 86.34\n", - " 191.01\n", - " 16\n", + " 3.31\n", + " 3.31\n", + " 3.32\n", + " 3.35\n", + " 3.39\n", + " 3.51\n", + " 4829.58\n", + " 8\n", " \n", " \n", " 5\n", - " 168.40\n", - " 169.62\n", - " 170.78\n", - " 171.94\n", - " 172.82\n", - " 174.28\n", - " 190.02\n", - " 32\n", + " 5.66\n", + " 5.66\n", + " 5.68\n", + " 5.71\n", + " 5.74\n", + " 5.91\n", + " 5652.40\n", + " 8\n", " \n", " \n", "\n", @@ -1276,38 +1841,30 @@ ], "text/plain": [ " Latency(ms) Latency_P50 Latency_P75 Latency_P90 Latency_P95 \\\n", - "0 7.14 7.10 7.13 7.25 7.35 \n", - "1 11.27 11.23 11.28 11.53 11.57 \n", - "2 21.15 21.13 21.25 21.44 21.59 \n", - "3 42.27 42.26 42.68 42.95 43.11 \n", - "4 83.77 83.84 84.29 84.94 85.35 \n", - "5 168.40 169.62 170.78 171.94 172.82 \n", + "0 1.70 1.63 1.65 2.13 2.20 \n", + "1 1.77 1.74 1.76 1.82 1.93 \n", + "2 1.79 1.73 1.74 2.12 2.18 \n", + "3 2.18 2.16 2.17 2.22 2.30 \n", + "4 3.31 3.31 3.32 3.35 3.39 \n", + "5 5.66 5.66 5.68 5.71 5.74 \n", "\n", - " Latency_P99 Throughput(QPS) batch_size \n", - "0 10.99 140.00 1 \n", - "1 12.05 177.41 2 \n", - "2 22.07 189.09 4 \n", - "3 45.11 189.27 8 \n", - "4 86.34 191.01 16 \n", - "5 174.28 190.02 32 " + " Latency_P99 Throughput(QPS) intra_op_num_threads \n", + "0 2.32 586.97 8 \n", + "1 2.17 2262.31 8 \n", + "2 2.32 1114.37 8 \n", + "3 2.64 3666.45 8 \n", + "4 3.51 4829.58 8 \n", + "5 5.91 5652.40 8 " ] }, - "execution_count": 24, + "execution_count": 25, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "import os\n", - "import glob \n", - "import pandas\n", - "latest_result_file = max(glob.glob(\"./onnx/perf_results_*.txt\"), key=os.path.getmtime)\n", - "result_data = pandas.read_table(latest_result_file)\n", - "print(\"Float16 model summary from\", latest_result_file)\n", - "columns_to_remove = ['model', 'graph_optimization_level', 'test_cases', 'test_times', 'use_gpu', 'sequence_length']\n", - "columns_to_remove.extend(['intra_op_num_threads'])\n", - "result_data.drop(columns_to_remove, axis=1, inplace=True)\n", - "result_data" + "packing_result = load_last_perf_test_result()\n", + "packing_result" ] }, { @@ -1327,7 +1884,7 @@ }, { "cell_type": "code", - "execution_count": 25, + "execution_count": 26, "metadata": { 
"scrolled": true }, @@ -1336,131 +1893,53 @@ "name": "stdout", "output_type": "stream", "text": [ - "{\r\n", - " \"gpu\": {\r\n", - " \"driver_version\": \"450.51.05\",\r\n", - " \"devices\": [\r\n", - " {\r\n", - " \"memory_total\": 15843721216,\r\n", - " \"memory_available\": 9313189888,\r\n", - " \"name\": \"Tesla T4\"\r\n", - " }\r\n", - " ]\r\n", - " },\r\n", - " \"cpu\": {\r\n", - " \"brand\": \"AMD EPYC 7V12 64-Core Processor\",\r\n", - " \"cores\": 4,\r\n", - " \"logical_cores\": 4,\r\n", - " \"hz\": [\r\n", - " 2445417000,\r\n", - " 0\r\n", - " ],\r\n", - " \"l2_cache\": 524288,\r\n", - " \"flags\": [\r\n", - " \"3dnowext\",\r\n", - " \"3dnowprefetch\",\r\n", - " \"abm\",\r\n", - " \"adx\",\r\n", - " \"aes\",\r\n", - " \"apic\",\r\n", - " \"arat\",\r\n", - " \"avx\",\r\n", - " \"avx2\",\r\n", - " \"bmi1\",\r\n", - " \"bmi2\",\r\n", - " \"clflush\",\r\n", - " \"clflushopt\",\r\n", - " \"clwb\",\r\n", - " \"cmov\",\r\n", - " \"cmp_legacy\",\r\n", - " \"cpuid\",\r\n", - " \"cr8_legacy\",\r\n", - " \"cx16\",\r\n", - " \"cx8\",\r\n", - " \"de\",\r\n", - " \"extd_apicid\",\r\n", - " \"f16c\",\r\n", - " \"fma\",\r\n", - " \"fpu\",\r\n", - " \"fsgsbase\",\r\n", - " \"fxsr\",\r\n", - " \"fxsr_opt\",\r\n", - " \"ht\",\r\n", - " \"hypervisor\",\r\n", - " \"lahf_lm\",\r\n", - " \"lm\",\r\n", - " \"mca\",\r\n", - " \"mce\",\r\n", - " \"misalignsse\",\r\n", - " \"mmx\",\r\n", - " \"mmxext\",\r\n", - " \"movbe\",\r\n", - " \"msr\",\r\n", - " \"mtrr\",\r\n", - " \"nopl\",\r\n", - " \"nx\",\r\n", - " \"osvw\",\r\n", - " \"osxsave\",\r\n", - " \"pae\",\r\n", - " \"pat\",\r\n", - " \"pclmulqdq\",\r\n", - " \"pdpe1gb\",\r\n", - " \"pge\",\r\n", - " \"pni\",\r\n", - " \"popcnt\",\r\n", - " \"pse\",\r\n", - " \"pse36\",\r\n", - " \"rdpid\",\r\n", - " \"rdrand\",\r\n", - " \"rdrnd\",\r\n", - " \"rdseed\",\r\n", - " \"rdtscp\",\r\n", - " \"rep_good\",\r\n", - " \"sep\",\r\n", - " \"sha\",\r\n", - " \"sha_ni\",\r\n", - " \"smap\",\r\n", - " \"smep\",\r\n", - " \"ssbd\",\r\n", - " \"sse\",\r\n", - " \"sse2\",\r\n", - " \"sse4_1\",\r\n", - " \"sse4_2\",\r\n", - " \"sse4a\",\r\n", - " \"ssse3\",\r\n", - " \"syscall\",\r\n", - " \"topoext\",\r\n", - " \"tsc\",\r\n", - " \"umip\",\r\n", - " \"vme\",\r\n", - " \"vmmcall\",\r\n", - " \"xgetbv1\",\r\n", - " \"xsave\",\r\n", - " \"xsavec\",\r\n", - " \"xsaveerptr\",\r\n", - " \"xsaveopt\",\r\n", - " \"xsaves\"\r\n", - " ],\r\n", - " \"processor\": \"x86_64\"\r\n", - " },\r\n", - " \"memory\": {\r\n", - " \"total\": 29450223616,\r\n", - " \"available\": 22402334720\r\n", - " },\r\n", - " \"python\": \"3.6.13.final.0 (64 bit)\",\r\n", - " \"os\": \"Linux-5.4.0-1046-azure-x86_64-with-debian-buster-sid\",\r\n", - " \"onnxruntime\": {\r\n", - " \"version\": \"1.8.1\",\r\n", - " \"support_gpu\": true\r\n", - " },\r\n", - " \"onnxruntime_tools\": null,\r\n", - " \"pytorch\": {\r\n", - " \"version\": \"1.9.0+cu111\",\r\n", - " \"support_gpu\": true,\r\n", - " \"cuda\": \"11.1\"\r\n", - " },\r\n", - " \"tensorflow\": null\r\n", - "}\r\n" + "{\n", + " \"gpu\": {\n", + " \"driver_version\": \"537.13\",\n", + " \"devices\": [\n", + " {\n", + " \"memory_total\": 25757220864,\n", + " \"memory_available\": 18009264128,\n", + " \"name\": \"NVIDIA GeForce RTX 4090\"\n", + " }\n", + " ]\n", + " },\n", + " \"cpu\": {\n", + " \"brand\": \"13th Gen Intel(R) Core(TM) i9-13900\",\n", + " \"cores\": 24,\n", + " \"logical_cores\": 32,\n", + " \"hz\": \"2000000000,0\",\n", + " \"l2_cache\": 33554432,\n", + " \"flags\": 
\"3dnow,3dnowprefetch,abm,acpi,adx,aes,apic,avx,avx2,bmi1,bmi2,clflush,clflushopt,clwb,cmov,cx16,cx8,de,dts,erms,est,f16c,fma,fpu,fxsr,gfni,ht,hypervisor,ia64,intel_pt,invpcid,lahf_lm,mca,mce,mmx,monitor,movbe,msr,mtrr,osxsave,pae,pat,pbe,pcid,pclmulqdq,pdcm,pge,pni,popcnt,pse,pse36,rdpid,rdrnd,rdseed,sep,serial,sha,smap,smep,ss,sse,sse2,sse4_1,sse4_2,ssse3,tm,tm2,tsc,tscdeadline,umip,vaes,vme,vpclmulqdq,x2apic,xsave,xtpr\",\n", + " \"processor\": \"Intel64 Family 6 Model 183 Stepping 1, GenuineIntel\"\n", + " },\n", + " \"memory\": {\n", + " \"total\": 33992912896,\n", + " \"available\": 17272422400\n", + " },\n", + " \"os\": \"Windows-10-10.0.22621-SP0\",\n", + " \"python\": \"3.10.13.final.0 (64 bit)\",\n", + " \"packages\": {\n", + " \"flatbuffers\": \"23.5.26\",\n", + " \"numpy\": \"1.25.2\",\n", + " \"onnx\": \"1.14.1\",\n", + " \"onnxruntime-gpu\": \"1.16.0\",\n", + " \"protobuf\": \"3.20.3\",\n", + " \"sympy\": \"1.12\",\n", + " \"torch\": \"2.0.1+cu118\",\n", + " \"transformers\": \"4.33.1\"\n", + " },\n", + " \"onnxruntime\": {\n", + " \"version\": \"1.16.0\",\n", + " \"support_gpu\": true\n", + " },\n", + " \"pytorch\": {\n", + " \"version\": \"2.0.1+cu118\",\n", + " \"support_gpu\": true,\n", + " \"cuda\": \"11.8\"\n", + " },\n", + " \"tensorflow\": null\n", + "}\n" ] } ], @@ -1485,9 +1964,9 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.6.13" + "version": "3.10.13" } }, "nbformat": 4, - "nbformat_minor": 2 + "nbformat_minor": 4 } diff --git a/onnxruntime/python/tools/transformers/onnx_model.py b/onnxruntime/python/tools/transformers/onnx_model.py index 8c836db7b9ef6..60be2d84b2bc8 100644 --- a/onnxruntime/python/tools/transformers/onnx_model.py +++ b/onnxruntime/python/tools/transformers/onnx_model.py @@ -816,51 +816,77 @@ def prune_graph(self, outputs=None, allow_remove_graph_inputs=True): """ if len(self.graphs()) > 1: + # TODO(tianleiwu): handle subgraph logger.debug("Skip prune_graph since graph has subgraph") return - if outputs is None: - outputs = [output.name for output in self.model.graph.output] + keep_outputs = [output.name for output in self.model.graph.output] if outputs is None else outputs output_name_to_node = self.output_name_to_node() - all_nodes = [] - for output in outputs: - if output in output_name_to_node: - last_node = output_name_to_node[output] - if last_node in all_nodes: - continue - nodes = self.get_parent_subgraph_nodes(last_node, []) - all_nodes.append(last_node) - all_nodes.extend(nodes) - nodes_to_remove = [node for node in self.model.graph.node if node not in all_nodes] + def get_first_output(node): + if node.output[0]: + return node.output[0] + return next(iter([o for o in node.output if o]), None) - self.remove_nodes(nodes_to_remove) + # Keep track of nodes to keep. The key is first output of node, and the value is the node. + output_to_node = {} - # remove outputs not in list - output_to_remove = [] - for output in self.model.graph.output: - if output.name not in outputs: - output_to_remove.append(output) - for output in output_to_remove: - self.model.graph.output.remove(output) + # Start from graph outputs, and find parent nodes recurisvely, and add nodes to the output_to_node dictionary. 
+ dq = deque() + for output in keep_outputs: + if output in output_name_to_node: + dq.append(output_name_to_node[output]) + while len(dq) > 0: + node = dq.pop() + first_output = get_first_output(node) + if first_output and (first_output not in output_to_node): + output_to_node[first_output] = node + for name in node.input: + if len(name) > 0 and (name in output_name_to_node) and (name not in output_to_node): + dq.appendleft(output_name_to_node[name]) + + # Keep only those nodes in the output_to_node dictionary. + nodes_to_keep = [] + num_nodes_removed = 0 + for node in self.model.graph.node: + first_output = get_first_output(node) + kept_node = output_to_node[first_output] if first_output in output_to_node else None - # remove inputs not used by any node. + # Need double check the node since fused node might reuse output name of some nodes to be removed. + # It is slow to compare whole node, so we compare op_type first to avoid comparing node in most cases. + if kept_node and kept_node.op_type == node.op_type and kept_node == node: + nodes_to_keep.append(node) + else: + num_nodes_removed += 1 + self.model.graph.ClearField("node") + self.model.graph.node.extend(nodes_to_keep) + + # Remove graph outputs not in list + output_to_remove = [] + if outputs is not None: + for output in self.model.graph.output: + if output.name not in outputs: + output_to_remove.append(output) + for output in output_to_remove: + self.model.graph.output.remove(output) + + # Remove graph inputs not used by any node. input_to_remove = [] if allow_remove_graph_inputs: input_name_to_nodes = self.input_name_to_nodes() input_to_remove = [input for input in self.model.graph.input if input.name not in input_name_to_nodes] - for input in input_to_remove: - self.model.graph.input.remove(input) + for name in input_to_remove: + self.model.graph.input.remove(name) - if input_to_remove or output_to_remove or nodes_to_remove: + if input_to_remove or output_to_remove or num_nodes_removed > 0: removed = [] if input_to_remove: removed.append(f"{len(input_to_remove)} inputs") if output_to_remove: removed.append(f"{len(output_to_remove)} outputs") - if nodes_to_remove: - removed.append(f"{len(nodes_to_remove)} nodes") + if num_nodes_removed > 0: + removed.append(f"{num_nodes_removed} nodes") logger.info("Removed %s", ", ".join(removed)) self.update_graph() diff --git a/onnxruntime/test/providers/qnn/pad_op_test.cpp b/onnxruntime/test/providers/qnn/pad_op_test.cpp new file mode 100644 index 0000000000000..95961e423833a --- /dev/null +++ b/onnxruntime/test/providers/qnn/pad_op_test.cpp @@ -0,0 +1,346 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#if !defined(ORT_MINIMAL_BUILD) + +#include +#include + +#include "core/graph/node_attr_utils.h" +#include "test/optimizer/qdq_test_utils.h" +#include "test/providers/qnn/qnn_test_utils.h" + +#include "onnx/onnx_pb.h" + +#include "gtest/gtest.h" + +namespace onnxruntime { +namespace test { + +// Returns a function that creates a graph with a single Pad operator. 
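As a point of reference for the graph these test helpers construct, a plain ONNX model with a single Pad node can be sketched in Python as follows; the file name, shapes, and opset here are illustrative and not taken from the test suite:

```python
import numpy as np
import onnx
from onnx import TensorProto, helper, numpy_helper

# 3x2 float input; pads = [0, 2, 0, 0] prepends two elements along axis 1,
# so the padded result has shape [3, 4].
data = helper.make_tensor_value_info("data", TensorProto.FLOAT, [3, 2])
output = helper.make_tensor_value_info("output", TensorProto.FLOAT, [3, 4])

pads = numpy_helper.from_array(np.array([0, 2, 0, 0], dtype=np.int64), name="pads")
constant_value = numpy_helper.from_array(np.array(0.0, dtype=np.float32), name="constant_value")

pad_node = helper.make_node("Pad", ["data", "pads", "constant_value"], ["output"], mode="constant")
graph = helper.make_graph([pad_node], "pad_example", [data], [output],
                          initializer=[pads, constant_value])
model = helper.make_model(graph, opset_imports=[helper.make_operatorsetid("", 18)])
onnx.checker.check_model(model)
onnx.save(model, "pad_example.onnx")
```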
+static GetTestModelFn BuildPadTestCase(const TestInputDef& data_def, + const TestInputDef& pads_def, + const TestInputDef& constant_value_def, + const std::vector& attrs, + bool has_constant_value = true) { + return [data_def, pads_def, constant_value_def, attrs, has_constant_value](ModelTestBuilder& builder) { + NodeArg* data = MakeTestInput(builder, data_def); + NodeArg* pads = MakeTestInput(builder, pads_def); + std::vector inputs{data, pads}; + if (has_constant_value) { + NodeArg* constant_value = MakeTestInput(builder, constant_value_def); + inputs.push_back(constant_value); + } + NodeArg* output = builder.MakeOutput(); + Node& pad_node = builder.AddNode("Pad", inputs, {output}); + + for (const auto& attr : attrs) { + pad_node.AddAttributeProto(attr); + } + }; +} + +// Returns a function that creates a graph with a QDQ Pad operator. +template +GetTestQDQModelFn BuildPadQDQTestCase(const TestInputDef& data_def, + const TestInputDef& pads_def, + const TestInputDef& constant_value_def, + const std::vector& attrs, + bool has_constant_value, + bool constant_value_quantized) { + return [data_def, pads_def, constant_value_def, attrs, has_constant_value, constant_value_quantized](ModelTestBuilder& builder, + std::vector>& output_qparams) { + std::vector inputs; + // data -> Q -> DQ -> + NodeArg* data = MakeTestInput(builder, data_def); + QuantParams data_qparams = GetTestInputQuantParams(data_def); + NodeArg* data_qdq = AddQDQNodePair(builder, data, data_qparams.scale, data_qparams.zero_point); + inputs.push_back(data_qdq); + + // pads + NodeArg* pads = MakeTestInput(builder, pads_def); + inputs.push_back(pads); + + // constant_value -- QNN support both quantized and non-quantized + if (has_constant_value) { + if (constant_value_quantized) { + // constant_value -> Q -> DQ -> + NodeArg* constant_value = MakeTestInput(builder, constant_value_def); + QuantParams constant_value_qparams = GetTestInputQuantParams(constant_value_def); + NodeArg* constant_value_qdq = AddQDQNodePair(builder, constant_value, + constant_value_qparams.scale, + constant_value_qparams.zero_point); + inputs.push_back(constant_value_qdq); + } else { + NodeArg* constant_value = MakeTestInput(builder, constant_value_def); + inputs.push_back(constant_value); + } + } + + NodeArg* output = builder.MakeIntermediate(); + Node& pad_node = builder.AddNode("Pad", inputs, {output}); + + for (const auto& attr : attrs) { + pad_node.AddAttributeProto(attr); + } + + // op_output -> Q -> DQ -> output + AddQDQNodePairWithOutputAsGraphOutput(builder, output, output_qparams[0].scale, + output_qparams[0].zero_point); + }; +} + +// Runs an Pad model on the QNN CPU backend. Checks the graph node assignment, and that inference +// outputs for QNN and CPU match. +static void RunPadOpTest(const TestInputDef& data_def, + const TestInputDef& pads_def, + const TestInputDef& constant_value_def, + const std::vector& attrs, + ExpectedEPNodeAssignment expected_ep_assignment, + bool has_constant_value = true, + int opset = 18) { + ProviderOptions provider_options; +#if defined(_WIN32) + provider_options["backend_path"] = "QnnCpu.dll"; +#else + provider_options["backend_path"] = "libQnnCpu.so"; +#endif + + RunQnnModelTest(BuildPadTestCase(data_def, pads_def, constant_value_def, attrs, has_constant_value), + provider_options, + opset, + expected_ep_assignment); +} + +// Runs a QDQ Pad model on the QNN HTP backend. Checks the graph node assignment, and that inference +// outputs for QNN and CPU match. 
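The backend is selected through `provider_options["backend_path"]`, exactly as in the helpers above. For orientation, the equivalent selection from the Python API is roughly the following sketch; `model.onnx` is a placeholder, and the library name depends on platform and backend (QnnCpu.dll / libQnnCpu.so for the CPU backend, QnnHtp.dll / libQnnHtp.so for HTP):

```python
import onnxruntime as ort

providers = [
    ("QNNExecutionProvider", {"backend_path": "QnnHtp.dll"}),
    "CPUExecutionProvider",  # fallback for nodes the QNN EP does not take
]
session = ort.InferenceSession("model.onnx", providers=providers)
print(session.get_providers())
```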
+template +static void RunQDQPadOpTest(const TestInputDef& data_def, + const TestInputDef& pads_def, + const TestInputDef& constant_value_def, + const std::vector& attrs, + ExpectedEPNodeAssignment expected_ep_assignment, + bool has_constant_value = true, + bool constant_value_quantized = true, + int opset = 18) { + ProviderOptions provider_options; +#if defined(_WIN32) + provider_options["backend_path"] = "QnnHtp.dll"; +#else + provider_options["backend_path"] = "libQnnHtp.so"; +#endif + + TestQDQModelAccuracy(BuildPadTestCase(data_def, pads_def, constant_value_def, attrs), + BuildPadQDQTestCase(data_def, pads_def, constant_value_def, attrs, + has_constant_value, constant_value_quantized), + provider_options, + opset, + expected_ep_assignment, + 1e-5f); +} + +// +// CPU tests: +// + +// Pad 2d +TEST_F(QnnCPUBackendTests, Pad2d) { + RunPadOpTest(TestInputDef({3, 2}, false, {1.0f, 1.2f, 2.3f, 3.4f, 4.5f, 5.6f}), + TestInputDef({4}, true, {0, 2, 0, 0}), + TestInputDef({1}, true, {0.0f}), + {utils::MakeAttribute("mode", "constant")}, + ExpectedEPNodeAssignment::All); +} + +// Pad 2d, pads input not initializer +TEST_F(QnnCPUBackendTests, Pad2dPadsNotIni) { + RunPadOpTest(TestInputDef({3, 2}, false, {1.0f, 1.2f, 2.3f, 3.4f, 4.5f, 5.6f}), + TestInputDef({4}, false, {0, 2, 0, 0}), + TestInputDef({1}, true, {0.0f}), + {utils::MakeAttribute("mode", "constant")}, + ExpectedEPNodeAssignment::None); +} + +// Pad reflect mode +// Expected: contains 12 values, where each value and its corresponding value in 16-byte object <0C-00 00-00 00-00 00-00 40-01 23-05 EC-01 00-00> are an almost-equal pair +// Actual: 16-byte object <0C-00 00-00 00-00 00-00 40-01 12-05 EC-01 00-00>, where the value pair (1.2, 0) at index #1 don't match, which is -1.2 from 1.2 +TEST_F(QnnCPUBackendTests, DISABLED_PadModeReflect) { + bool has_constant_value = false; + RunPadOpTest(TestInputDef({3, 2}, false, {1.0f, 1.2f, 2.3f, 3.4f, 4.5f, 5.6f}), + TestInputDef({4}, true, {0, 2, 0, 0}), + TestInputDef({1}, true, {0.0f}), + {utils::MakeAttribute("mode", "reflect")}, + ExpectedEPNodeAssignment::All, + has_constant_value); +} + +// Pad edge mode +TEST_F(QnnCPUBackendTests, PadModeEdge) { + bool has_constant_value = false; + RunPadOpTest(TestInputDef({3, 2}, false, {1.0f, 1.2f, 2.3f, 3.4f, 4.5f, 5.6f}), + TestInputDef({4}, true, {0, 2, 0, 0}), + TestInputDef({1}, true, {0.0f}), + {utils::MakeAttribute("mode", "edge")}, + ExpectedEPNodeAssignment::All, + has_constant_value); +} + +// Pad wrap mode not supported +TEST_F(QnnCPUBackendTests, PadModeWrap) { + bool has_constant_value = false; + RunPadOpTest(TestInputDef({3, 2}, false, {1.0f, 1.2f, 2.3f, 3.4f, 4.5f, 5.6f}), + TestInputDef({4}, true, {0, 2, 0, 0}), + TestInputDef({1}, true, {0.0f}), + {utils::MakeAttribute("mode", "wrap")}, + ExpectedEPNodeAssignment::None, // not supported + has_constant_value); +} + +// Pad 4d +TEST_F(QnnCPUBackendTests, Pad4d) { + RunPadOpTest(TestInputDef({1, 2, 2, 2}, false, + {1.0f, 1.0f, + 1.0f, 1.0f, + 1.0f, 1.0f, + 1.0f, 1.0f}), + TestInputDef({8}, true, {0, 0, 0, 1, 0, 0, 0, 1}), + TestInputDef({1}, true, {0.0f}), + {utils::MakeAttribute("mode", "constant")}, + ExpectedEPNodeAssignment::All); +} + +// Pad 5d supported +TEST_F(QnnCPUBackendTests, Pad5d) { + RunPadOpTest(TestInputDef({1, 2, 2, 2, 2}, false, GetFloatDataInRange(1.0f, 10.0f, 16)), + TestInputDef({10}, true, {0, 0, 0, 1, 0, 0, 0, 1, 0, 0}), + TestInputDef({1}, true, {5.0f}), + {utils::MakeAttribute("mode", "constant")}, + ExpectedEPNodeAssignment::All); +} + +// Pad 6d supported 
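The `pads` initializers in these tests follow the ONNX layout `[begin_axis0, ..., begin_axisN-1, end_axis0, ..., end_axisN-1]`. A quick NumPy check of what that means for the 4-D pads used in the Pad4d test above (illustrative only):

```python
import numpy as np

data = np.ones((1, 2, 2, 2), dtype=np.float32)
onnx_pads = [0, 0, 0, 1, 0, 0, 0, 1]  # begins for the 4 axes, then ends

rank = data.ndim
np_pad = [(onnx_pads[i], onnx_pads[i + rank]) for i in range(rank)]
padded = np.pad(data, np_pad, mode="constant", constant_values=0.0)
print(padded.shape)  # (1, 2, 2, 4): axis 3 is padded by one element on each side
```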
+TEST_F(QnnCPUBackendTests, Pad6d) { + RunPadOpTest(TestInputDef({1, 2, 2, 2, 2, 2}, false, GetFloatDataInRange(1.0f, 10.0f, 32)), + TestInputDef({12}, true, {0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0}), + TestInputDef({1}, true, {0.0f}), + {utils::MakeAttribute("mode", "constant")}, + ExpectedEPNodeAssignment::None); +} + +#if defined(__aarch64__) || defined(_M_ARM64) || defined(__linux__) +// +// HTP tests: +// +// QDQ Pad +TEST_F(QnnHTPBackendTests, PadNoConstantValue) { + bool has_constant_value_input = false; + RunQDQPadOpTest(TestInputDef({3, 2}, false, {1.0f, 1.2f, 2.3f, 3.4f, 4.5f, 5.6f}), + TestInputDef({4}, true, {0, 2, 0, 0}), + TestInputDef({1}, true, {0.0f}), + {utils::MakeAttribute("mode", "constant")}, + ExpectedEPNodeAssignment::All, + has_constant_value_input); +} + +TEST_F(QnnHTPBackendTests, PadHasConstantValueNonQuantized) { + bool has_constant_value_input = true; + bool constant_value_quantized = false; + RunQDQPadOpTest(TestInputDef({3, 2}, false, {1.0f, 1.2f, 2.3f, 3.4f, 4.5f, 5.6f}), + TestInputDef({4}, true, {0, 2, 0, 0}), + TestInputDef({1}, true, {0.0f}), + {utils::MakeAttribute("mode", "constant")}, + ExpectedEPNodeAssignment::All, + has_constant_value_input, + constant_value_quantized); +} + +TEST_F(QnnHTPBackendTests, PadHasConstantValueQuantized) { + bool has_constant_value_input = true; + bool constant_value_quantized = true; + RunQDQPadOpTest(TestInputDef({3, 2}, false, {1.0f, 1.2f, 2.3f, 3.4f, 4.5f, 5.6f}), + TestInputDef({4}, true, {0, 2, 0, 0}), + TestInputDef({1}, true, {0.0f}), + {utils::MakeAttribute("mode", "constant")}, + ExpectedEPNodeAssignment::All, + has_constant_value_input, + constant_value_quantized); +} + +// QNN graph execute error. Error code: 6031 +TEST_F(QnnHTPBackendTests, DISABLED_PadReflectMode) { + bool has_constant_value_input = false; + RunQDQPadOpTest(TestInputDef({3, 2}, false, {1.0f, 1.2f, 2.3f, 3.4f, 4.5f, 5.6f}), + TestInputDef({4}, true, {0, 2, 0, 0}), + TestInputDef({1}, true, {0.0f}), + {utils::MakeAttribute("mode", "reflect")}, + ExpectedEPNodeAssignment::All, + has_constant_value_input); +} + +TEST_F(QnnHTPBackendTests, PadEdgeMode) { + bool has_constant_value_input = false; + RunQDQPadOpTest(TestInputDef({3, 2}, false, {1.0f, 1.2f, 2.3f, 3.4f, 4.5f, 5.6f}), + TestInputDef({4}, true, {0, 2, 0, 0}), + TestInputDef({1}, true, {0.0f}), + {utils::MakeAttribute("mode", "edge")}, + ExpectedEPNodeAssignment::All, + has_constant_value_input); +} + +// wrap mode not supported +TEST_F(QnnHTPBackendTests, PadWrapMode) { + bool has_constant_value_input = false; + RunQDQPadOpTest(TestInputDef({3, 2}, false, {1.0f, 1.2f, 2.3f, 3.4f, 4.5f, 5.6f}), + TestInputDef({4}, true, {0, 2, 0, 0}), + TestInputDef({1}, true, {0.0f}), + {utils::MakeAttribute("mode", "wrap")}, + ExpectedEPNodeAssignment::None, + has_constant_value_input); +} + +TEST_F(QnnHTPBackendTests, Pad4d) { + RunQDQPadOpTest(TestInputDef({1, 2, 2, 2}, false, + {1.0f, 2.0f, + 3.0f, 4.0f, + 5.0f, 6.0f, + 7.0f, 8.0f}), + TestInputDef({8}, true, {0, 0, 0, 1, 0, 0, 0, 1}), + TestInputDef({1}, true, {5.0f}), + {utils::MakeAttribute("mode", "constant")}, + ExpectedEPNodeAssignment::All); +} + +// Inaccuracy detected for output 'output', element 0. +// Output quant params: scale=0.035294119268655777, zero_point=0. +// Expected val: 9 +// QNN QDQ val: 8.0117654800415039 (err 0.98823451995849609) +// CPU QDQ val: 9 (err 0) +// QNN limitation? pad_constant_value has to be within the range of input[0]. 
+// Here pad_constant_value = 9.0 > max(input[0]) = 8.0 +TEST_F(QnnHTPBackendTests, DISABLED_Pad4dOutOfRangePadConstantValue) { + RunQDQPadOpTest(TestInputDef({1, 2, 2, 2}, false, + {1.0f, 2.0f, + 3.0f, 4.0f, + 5.0f, 6.0f, + 7.0f, 8.0f}), + TestInputDef({8}, true, {0, 0, 0, 1, 0, 0, 0, 1}), + TestInputDef({1}, true, {9.0f}), // pad_constant_value out of input[0] range + {utils::MakeAttribute("mode", "constant")}, + ExpectedEPNodeAssignment::All); +} + +// Pad 5d supported, but Quantize & Dequantize doesn't support 5d +TEST_F(QnnHTPBackendTests, DISABLED_Pad5d) { + RunQDQPadOpTest(TestInputDef({1, 2, 2, 2, 2}, false, GetFloatDataInRange(1.0f, 10.0f, 16)), + TestInputDef({10}, true, {0, 0, 0, 1, 0, 0, 0, 1, 0, 0}), + TestInputDef({1}, true, {2.0f}), + {utils::MakeAttribute("mode", "constant")}, + ExpectedEPNodeAssignment::All); +} + +#endif // defined(__aarch64__) || defined(_M_ARM64) || defined(__linux__) + +} // namespace test +} // namespace onnxruntime + +#endif // !defined(ORT_MINIMAL_BUILD) \ No newline at end of file diff --git a/onnxruntime/test/python/onnxruntime_test_python_iobinding.py b/onnxruntime/test/python/onnxruntime_test_python_iobinding.py index 8009d97ba34ce..56417f13fbea4 100644 --- a/onnxruntime/test/python/onnxruntime_test_python_iobinding.py +++ b/onnxruntime/test/python/onnxruntime_test_python_iobinding.py @@ -16,40 +16,43 @@ from onnxruntime.capi._pybind_state import OrtValue as C_OrtValue from onnxruntime.capi._pybind_state import OrtValueVector, SessionIOBinding +test_params = [ + ("cuda", "CUDAExecutionProvider", C_OrtDevice.cuda), + ("dml", "DmlExecutionProvider", C_OrtDevice.dml), +] + class TestIOBinding(unittest.TestCase): - def create_ortvalue_input_on_gpu(self): + def _create_ortvalue_input_on_gpu(self, device): return onnxrt.OrtValue.ortvalue_from_numpy( - np.array([[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]], dtype=np.float32), "cuda", 0 + np.array([[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]], dtype=np.float32), device, 0 ) - def create_ortvalue_alternate_input_on_gpu(self): + def _create_ortvalue_alternate_input_on_gpu(self, device): return onnxrt.OrtValue.ortvalue_from_numpy( np.array([[2.0, 4.0], [6.0, 8.0], [10.0, 12.0]], dtype=np.float32), - "cuda", + device, 0, ) - def create_uninitialized_ortvalue_input_on_gpu(self): - return onnxrt.OrtValue.ortvalue_from_shape_and_type([3, 2], np.float32, "cuda", 0) + def _create_uninitialized_ortvalue_input_on_gpu(self, device): + return onnxrt.OrtValue.ortvalue_from_shape_and_type([3, 2], np.float32, device, 0) - def create_numpy_input(self): + def _create_numpy_input(self): return np.array([[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]], dtype=np.float32) - def create_expected_output(self): + def _create_expected_output(self): return np.array([[1.0, 4.0], [9.0, 16.0], [25.0, 36.0]], dtype=np.float32) - def create_expected_output_alternate(self): + def _create_expected_output_alternate(self): return np.array([[2.0, 8.0], [18.0, 32.0], [50.0, 72.0]], dtype=np.float32) def test_bind_input_to_cpu_arr(self): - self.create_numpy_input() - session = onnxrt.InferenceSession(get_name("mul_1.onnx"), providers=onnxrt.get_available_providers()) io_binding = session.io_binding() # Bind Numpy object (input) that's on CPU to wherever the model needs it - io_binding.bind_cpu_input("X", self.create_numpy_input()) + io_binding.bind_cpu_input("X", self._create_numpy_input()) # Bind output to CPU io_binding.bind_output("Y") @@ -57,254 +60,280 @@ def test_bind_input_to_cpu_arr(self): # Invoke Run session.run_with_iobinding(io_binding) - # Sync if different 
CUDA streams + # Sync if different streams io_binding.synchronize_outputs() - # Get outputs over to CPU (the outputs which were bound to CUDA will get copied over to the host here) + # Get outputs over to CPU (the outputs which were bound to the GPU will get copied over to the host here) ort_output = io_binding.copy_outputs_to_cpu()[0] # Validate results - self.assertTrue(np.array_equal(self.create_expected_output(), ort_output)) + self.assertTrue(np.array_equal(self._create_expected_output(), ort_output)) - @unittest.skip("Could not find an implementation for Identity(19) node with name ''") def test_bind_input_types(self): - opset = onnx_opset_version() - devices = [ - ( - C_OrtDevice(C_OrtDevice.cpu(), C_OrtDevice.default_memory(), 0), - ["CPUExecutionProvider"], - ) - ] - if "CUDAExecutionProvider" in onnxrt.get_all_providers(): - devices.append( - ( - C_OrtDevice(C_OrtDevice.cuda(), C_OrtDevice.default_memory(), 0), - ["CUDAExecutionProvider"], - ) - ) - - for device, provider in devices: - for dtype in [ - np.float32, - np.float64, - np.int32, - np.uint32, - np.int64, - np.uint64, - np.int16, - np.uint16, - np.int8, - np.uint8, - np.float16, - np.bool_, - ]: - with self.subTest(dtype=dtype, device=str(device)): - x = np.arange(8).reshape((-1, 2)).astype(dtype) - proto_dtype = NP_TYPE_TO_TENSOR_TYPE[x.dtype] - - X = helper.make_tensor_value_info("X", proto_dtype, [None, x.shape[1]]) # noqa: N806 - Y = helper.make_tensor_value_info("Y", proto_dtype, [None, x.shape[1]]) # noqa: N806 - - # inference - node_add = helper.make_node("Identity", ["X"], ["Y"]) - - # graph - graph_def = helper.make_graph([node_add], "lr", [X], [Y], []) - model_def = helper.make_model( - graph_def, - producer_name="dummy", - ir_version=7, - producer_version="0", - opset_imports=[helper.make_operatorsetid("", opset)], - ) - - sess = onnxrt.InferenceSession(model_def.SerializeToString(), providers=provider) - - bind = SessionIOBinding(sess._sess) - ort_value = C_OrtValue.ortvalue_from_numpy(x, device) - bind.bind_ortvalue_input("X", ort_value) - bind.bind_output("Y", device) - sess._sess.run_with_iobinding(bind, None) - ortvaluevector = bind.get_outputs() - self.assertIsInstance(ortvaluevector, OrtValueVector) - ortvalue = bind.get_outputs()[0] - y = ortvalue.numpy() - assert_almost_equal(x, y) - - bind = SessionIOBinding(sess._sess) - bind.bind_input("X", device, dtype, x.shape, ort_value.data_ptr()) - bind.bind_output("Y", device) - sess._sess.run_with_iobinding(bind, None) - ortvalue = bind.get_outputs()[0] - y = ortvalue.numpy() - assert_almost_equal(x, y) + for device, execution_provider, generate_device in test_params: + with self.subTest(execution_provider): + if execution_provider not in onnxrt.get_available_providers(): + self.skipTest(f"Skipping on {device.upper()}.") + + opset = onnx_opset_version() + devices = [ + ( + C_OrtDevice(C_OrtDevice.cpu(), C_OrtDevice.default_memory(), 0), + ["CPUExecutionProvider"], + ), + ( + C_OrtDevice(generate_device(), C_OrtDevice.default_memory(), 0), + [execution_provider], + ), + ] + + for inner_device, provider in devices: + for dtype in [ + np.float32, + np.float64, + np.int32, + np.uint32, + np.int64, + np.uint64, + np.int16, + np.uint16, + np.int8, + np.uint8, + np.float16, + np.bool_, + ]: + with self.subTest(dtype=dtype, inner_device=str(inner_device)): + x = np.arange(8).reshape((-1, 2)).astype(dtype) + proto_dtype = NP_TYPE_TO_TENSOR_TYPE[x.dtype] + + X = helper.make_tensor_value_info("X", proto_dtype, [None, x.shape[1]]) # noqa: N806 + Y = 
helper.make_tensor_value_info("Y", proto_dtype, [None, x.shape[1]]) # noqa: N806 + + # inference + node_add = helper.make_node("Identity", ["X"], ["Y"]) + + # graph + graph_def = helper.make_graph([node_add], "lr", [X], [Y], []) + model_def = helper.make_model( + graph_def, + producer_name="dummy", + ir_version=7, + producer_version="0", + opset_imports=[helper.make_operatorsetid("", opset)], + ) + + sess = onnxrt.InferenceSession(model_def.SerializeToString(), providers=provider) + + bind = SessionIOBinding(sess._sess) + ort_value = C_OrtValue.ortvalue_from_numpy(x, inner_device) + bind.bind_ortvalue_input("X", ort_value) + bind.bind_output("Y", inner_device) + sess._sess.run_with_iobinding(bind, None) + ortvaluevector = bind.get_outputs() + self.assertIsInstance(ortvaluevector, OrtValueVector) + ortvalue = bind.get_outputs()[0] + y = ortvalue.numpy() + assert_almost_equal(x, y) + + bind = SessionIOBinding(sess._sess) + bind.bind_input("X", inner_device, dtype, x.shape, ort_value.data_ptr()) + bind.bind_output("Y", inner_device) + sess._sess.run_with_iobinding(bind, None) + ortvalue = bind.get_outputs()[0] + y = ortvalue.numpy() + assert_almost_equal(x, y) def test_bind_input_only(self): - input = self.create_ortvalue_input_on_gpu() + for device, execution_provider, _ in test_params: + with self.subTest(execution_provider): + if execution_provider not in onnxrt.get_available_providers(): + self.skipTest(f"Skipping on {device.upper()}.") + input = self._create_ortvalue_input_on_gpu(device) - session = onnxrt.InferenceSession(get_name("mul_1.onnx"), providers=onnxrt.get_available_providers()) - io_binding = session.io_binding() + session = onnxrt.InferenceSession(get_name("mul_1.onnx"), providers=onnxrt.get_available_providers()) + io_binding = session.io_binding() - # Bind input to CUDA - io_binding.bind_input("X", "cuda", 0, np.float32, [3, 2], input.data_ptr()) + # Bind input to the GPU + io_binding.bind_input("X", device, 0, np.float32, [3, 2], input.data_ptr()) - # Sync if different CUDA streams - io_binding.synchronize_inputs() + # Sync if different streams + io_binding.synchronize_inputs() - # Bind output to CPU - io_binding.bind_output("Y") + # Bind output to CPU + io_binding.bind_output("Y") - # Invoke Run - session.run_with_iobinding(io_binding) + # Invoke Run + session.run_with_iobinding(io_binding) - # Sync if different CUDA streams - io_binding.synchronize_outputs() + # Sync if different streams + io_binding.synchronize_outputs() - # Get outputs over to CPU (the outputs which were bound to CUDA will get copied over to the host here) - ort_output = io_binding.copy_outputs_to_cpu()[0] + # Get outputs over to CPU (the outputs which were bound to the GPU will get copied over to the host + # here) + ort_output = io_binding.copy_outputs_to_cpu()[0] - # Validate results - self.assertTrue(np.array_equal(self.create_expected_output(), ort_output)) + # Validate results + self.assertTrue(np.array_equal(self._create_expected_output(), ort_output)) def test_bind_input_and_preallocated_output(self): - input = self.create_ortvalue_input_on_gpu() + for device, execution_provider, _ in test_params: + with self.subTest(execution_provider): + if execution_provider not in onnxrt.get_available_providers(): + self.skipTest(f"Skipping on {device.upper()}.") - session = onnxrt.InferenceSession(get_name("mul_1.onnx"), providers=onnxrt.get_available_providers()) - io_binding = session.io_binding() - - # Bind input to CUDA - io_binding.bind_input("X", "cuda", 0, np.float32, [3, 2], input.data_ptr()) - - 
# Bind output to CUDA - output = self.create_uninitialized_ortvalue_input_on_gpu() - io_binding.bind_output("Y", "cuda", 0, np.float32, [3, 2], output.data_ptr()) - - # Sync if different CUDA streams - io_binding.synchronize_inputs() - - # Invoke Run - session.run_with_iobinding(io_binding) + input = self._create_ortvalue_input_on_gpu(device) - # Sync if different CUDA streams - io_binding.synchronize_outputs() + session = onnxrt.InferenceSession(get_name("mul_1.onnx"), providers=onnxrt.get_available_providers()) + io_binding = session.io_binding() - # Get outputs over to CPU (the outputs which were bound to CUDA will get copied over to the host here) - ort_output_vals = io_binding.copy_outputs_to_cpu()[0] - # Validate results - self.assertTrue(np.array_equal(self.create_expected_output(), ort_output_vals)) + # Bind input to the GPU + io_binding.bind_input("X", device, 0, np.float32, [3, 2], input.data_ptr()) - # Validate if ORT actually wrote to pre-allocated buffer by copying the Torch allocated buffer - # to the host and validating its contents - ort_output_vals_in_cpu = output.numpy() - # Validate results - self.assertTrue(np.array_equal(self.create_expected_output(), ort_output_vals_in_cpu)) + # Bind output to the GPU + output = self._create_uninitialized_ortvalue_input_on_gpu(device) + io_binding.bind_output("Y", device, 0, np.float32, [3, 2], output.data_ptr()) - def test_bind_input_and_non_preallocated_output(self): - session = onnxrt.InferenceSession(get_name("mul_1.onnx"), providers=onnxrt.get_available_providers()) - io_binding = session.io_binding() + # Sync if different streams + io_binding.synchronize_inputs() - # Bind input to CUDA - io_binding.bind_input( - "X", - "cuda", - 0, - np.float32, - [3, 2], - self.create_ortvalue_input_on_gpu().data_ptr(), - ) + # Invoke Run + session.run_with_iobinding(io_binding) - # Bind output to CUDA - io_binding.bind_output("Y", "cuda") + # Sync if different streams + io_binding.synchronize_outputs() - # Sync if different CUDA streams - io_binding.synchronize_inputs() + # Get outputs over to CPU (the outputs which were bound to the GPU will get copied over to the host + # here) + ort_output_vals = io_binding.copy_outputs_to_cpu()[0] + # Validate results + self.assertTrue(np.array_equal(self._create_expected_output(), ort_output_vals)) - # Invoke Run - session.run_with_iobinding(io_binding) + # Validate if ORT actually wrote to pre-allocated buffer by copying the allocated buffer + # to the host and validating its contents + ort_output_vals_in_cpu = output.numpy() + # Validate results + self.assertTrue(np.array_equal(self._create_expected_output(), ort_output_vals_in_cpu)) - # Sync if different CUDA streams - io_binding.synchronize_outputs() + def test_bind_input_and_non_preallocated_output(self): + for device, execution_provider, _ in test_params: + with self.subTest(execution_provider): + if execution_provider not in onnxrt.get_available_providers(): + self.skipTest(f"Skipping on {device.upper()}.") + + session = onnxrt.InferenceSession(get_name("mul_1.onnx"), providers=onnxrt.get_available_providers()) + io_binding = session.io_binding() + + input = self._create_ortvalue_input_on_gpu(device) + + # Bind input to the GPU + io_binding.bind_input( + "X", + device, + 0, + np.float32, + [3, 2], + input.data_ptr(), + ) - # This call returns an OrtValue which has data allocated by ORT on CUDA - ort_outputs = io_binding.get_outputs() - self.assertEqual(len(ort_outputs), 1) - self.assertEqual(ort_outputs[0].device_name(), "cuda") - # Validate 
results (by copying results to CPU by creating a Numpy object) - self.assertTrue(np.array_equal(self.create_expected_output(), ort_outputs[0].numpy())) - - # We should be able to repeat the above process as many times as we want - try once more - ort_outputs = io_binding.get_outputs() - self.assertEqual(len(ort_outputs), 1) - self.assertEqual(ort_outputs[0].device_name(), "cuda") - # Validate results (by copying results to CPU by creating a Numpy object) - self.assertTrue(np.array_equal(self.create_expected_output(), ort_outputs[0].numpy())) - - # Change the bound input and validate the results in the same bound OrtValue - # Bind alternate input to CUDA - io_binding.bind_input( - "X", - "cuda", - 0, - np.float32, - [3, 2], - self.create_ortvalue_alternate_input_on_gpu().data_ptr(), - ) + # Bind output to the GPU + io_binding.bind_output("Y", device) + + # Sync if different streams + io_binding.synchronize_inputs() + + # Invoke Run + session.run_with_iobinding(io_binding) + + # Sync if different streams + io_binding.synchronize_outputs() + + # This call returns an OrtValue which has data allocated by ORT on the GPU + ort_outputs = io_binding.get_outputs() + self.assertEqual(len(ort_outputs), 1) + self.assertEqual(ort_outputs[0].device_name(), device) + # Validate results (by copying results to CPU by creating a Numpy object) + self.assertTrue(np.array_equal(self._create_expected_output(), ort_outputs[0].numpy())) + + # We should be able to repeat the above process as many times as we want - try once more + ort_outputs = io_binding.get_outputs() + self.assertEqual(len(ort_outputs), 1) + self.assertEqual(ort_outputs[0].device_name(), device) + # Validate results (by copying results to CPU by creating a Numpy object) + self.assertTrue(np.array_equal(self._create_expected_output(), ort_outputs[0].numpy())) + + input = self._create_ortvalue_alternate_input_on_gpu(device) + + # Change the bound input and validate the results in the same bound OrtValue + # Bind alternate input to the GPU + io_binding.bind_input( + "X", + device, + 0, + np.float32, + [3, 2], + input.data_ptr(), + ) - # Sync if different CUDA streams - io_binding.synchronize_inputs() + # Sync if different streams + io_binding.synchronize_inputs() - # Invoke Run - session.run_with_iobinding(io_binding) + # Invoke Run + session.run_with_iobinding(io_binding) - # Sync if different CUDA streams - io_binding.synchronize_outputs() + # Sync if different streams + io_binding.synchronize_outputs() - # This call returns an OrtValue which has data allocated by ORT on CUDA - ort_outputs = io_binding.get_outputs() - self.assertEqual(len(ort_outputs), 1) - self.assertEqual(ort_outputs[0].device_name(), "cuda") - # Validate results (by copying results to CPU by creating a Numpy object) - self.assertTrue(np.array_equal(self.create_expected_output_alternate(), ort_outputs[0].numpy())) + # This call returns an OrtValue which has data allocated by ORT on the GPU + ort_outputs = io_binding.get_outputs() + self.assertEqual(len(ort_outputs), 1) + self.assertEqual(ort_outputs[0].device_name(), device) + # Validate results (by copying results to CPU by creating a Numpy object) + self.assertTrue(np.array_equal(self._create_expected_output_alternate(), ort_outputs[0].numpy())) def test_bind_input_and_bind_output_with_ortvalues(self): - session = onnxrt.InferenceSession(get_name("mul_1.onnx"), providers=onnxrt.get_available_providers()) - io_binding = session.io_binding() + for device, execution_provider, _ in test_params: + with 
self.subTest(execution_provider): + if execution_provider not in onnxrt.get_available_providers(): + self.skipTest(f"Skipping on {device.upper()}.") - # Bind ortvalue as input - input_ortvalue = self.create_ortvalue_input_on_gpu() - io_binding.bind_ortvalue_input("X", input_ortvalue) + session = onnxrt.InferenceSession(get_name("mul_1.onnx"), providers=onnxrt.get_available_providers()) + io_binding = session.io_binding() - # Bind ortvalue as output - output_ortvalue = self.create_uninitialized_ortvalue_input_on_gpu() - io_binding.bind_ortvalue_output("Y", output_ortvalue) + # Bind ortvalue as input + input_ortvalue = self._create_ortvalue_input_on_gpu(device) + io_binding.bind_ortvalue_input("X", input_ortvalue) - # Sync if different CUDA streams - io_binding.synchronize_inputs() + # Bind ortvalue as output + output_ortvalue = self._create_uninitialized_ortvalue_input_on_gpu(device) + io_binding.bind_ortvalue_output("Y", output_ortvalue) - # Invoke Run - session.run_with_iobinding(io_binding) + # Sync if different streams + io_binding.synchronize_inputs() - # Sync if different CUDA streams - io_binding.synchronize_outputs() + # Invoke Run + session.run_with_iobinding(io_binding) - # Inspect contents of output_ortvalue and make sure that it has the right contents - self.assertTrue(np.array_equal(self.create_expected_output(), output_ortvalue.numpy())) + # Sync if different streams + io_binding.synchronize_outputs() - # Bind another ortvalue as input - input_ortvalue_2 = self.create_ortvalue_alternate_input_on_gpu() - io_binding.bind_ortvalue_input("X", input_ortvalue_2) + # Inspect contents of output_ortvalue and make sure that it has the right contents + self.assertTrue(np.array_equal(self._create_expected_output(), output_ortvalue.numpy())) - # Sync if different CUDA streams - io_binding.synchronize_inputs() + # Bind another ortvalue as input + input_ortvalue_2 = self._create_ortvalue_alternate_input_on_gpu(device) + io_binding.bind_ortvalue_input("X", input_ortvalue_2) - # Invoke Run - session.run_with_iobinding(io_binding) + # Sync if different streams + io_binding.synchronize_inputs() - # Sync if different CUDA streams - io_binding.synchronize_outputs() + # Invoke Run + session.run_with_iobinding(io_binding) + + # Sync if different streams + io_binding.synchronize_outputs() - # Inspect contents of output_ortvalue and make sure that it has the right contents - self.assertTrue(np.array_equal(self.create_expected_output_alternate(), output_ortvalue.numpy())) + # Inspect contents of output_ortvalue and make sure that it has the right contents + self.assertTrue(np.array_equal(self._create_expected_output_alternate(), output_ortvalue.numpy())) if __name__ == "__main__": diff --git a/onnxruntime/wasm/api.cc b/onnxruntime/wasm/api.cc index 937d505015d3c..174edabbc91fe 100644 --- a/onnxruntime/wasm/api.cc +++ b/onnxruntime/wasm/api.cc @@ -155,6 +155,12 @@ int OrtAppendExecutionProvider(ort_session_options_handle_t session_options, con return CHECK_STATUS(SessionOptionsAppendExecutionProvider, session_options, name, nullptr, nullptr, 0); } +int OrtAddFreeDimensionOverride(ort_session_options_handle_t session_options, + const char* dim_param_name, + int dim_value) { + return CHECK_STATUS(AddFreeDimensionOverrideByName, session_options, dim_param_name, dim_value); +} + int OrtAddSessionConfigEntry(OrtSessionOptions* session_options, const char* config_key, const char* config_value) { diff --git a/onnxruntime/wasm/api.h b/onnxruntime/wasm/api.h index b9103414aae67..398c901e0e5ed 100644 --- 
a/onnxruntime/wasm/api.h +++ b/onnxruntime/wasm/api.h @@ -84,6 +84,13 @@ ort_session_options_handle_t EMSCRIPTEN_KEEPALIVE OrtCreateSessionOptions(size_t int EMSCRIPTEN_KEEPALIVE OrtAppendExecutionProvider(ort_session_options_handle_t session_options, const char* name); +/** + * add a free dimension override for one dimension of a session's input. + */ +int EMSCRIPTEN_KEEPALIVE OrtAddFreeDimensionOverride(ort_session_options_handle_t session_options, + const char* dim_param_name, + int dim_value); + /** * store configurations for a session. * @param session_options a handle to session options created by OrtCreateSessionOptions diff --git a/setup.py b/setup.py index 13731eb4e76bb..7e6ab93194b0d 100644 --- a/setup.py +++ b/setup.py @@ -214,7 +214,7 @@ def run(self): "libhsa-runtime64.so.1", ] - tensorrt_dependencies = ["libnvinfer.so.8.6", "libnvinfer_plugin.so.8.6", "libnvonnxparser.so.8.6"] + tensorrt_dependencies = ["libnvinfer.so.8", "libnvinfer_plugin.so.8", "libnvonnxparser.so.8"] dest = "onnxruntime/capi/libonnxruntime_providers_openvino.so" if path.isfile(dest): diff --git a/tools/ci_build/build.py b/tools/ci_build/build.py index 48129e15934dc..c4fb5499983cb 100644 --- a/tools/ci_build/build.py +++ b/tools/ci_build/build.py @@ -1840,13 +1840,12 @@ def run_onnxruntime_tests(args, source_dir, ctest_path, build_dir, configs): [sys.executable, "onnxruntime_test_python_symbolic_shape_infer.py"], cwd=cwd, dll_path=dll_path ) - # For CUDA enabled builds test IOBinding feature - if args.use_cuda: - # We need to have Torch installed to test the IOBinding feature - # which currently uses Torch's allocator to allocate GPU memory for testing + # For CUDA or DML enabled builds test IOBinding feature + if args.use_cuda or args.use_dml: log.info("Testing IOBinding feature") run_subprocess([sys.executable, "onnxruntime_test_python_iobinding.py"], cwd=cwd, dll_path=dll_path) + if args.use_cuda: log.info("Testing CUDA Graph feature") run_subprocess([sys.executable, "onnxruntime_test_python_cudagraph.py"], cwd=cwd, dll_path=dll_path) diff --git a/tools/ci_build/github/azure-pipelines/c-api-noopenmp-packaging-pipelines.yml b/tools/ci_build/github/azure-pipelines/c-api-noopenmp-packaging-pipelines.yml index 09b2a0697447e..fdd8c09333737 100644 --- a/tools/ci_build/github/azure-pipelines/c-api-noopenmp-packaging-pipelines.yml +++ b/tools/ci_build/github/azure-pipelines/c-api-noopenmp-packaging-pipelines.yml @@ -424,19 +424,32 @@ stages: - checkout: self submodules: false - template: templates/set-version-number-variables-step.yml - - task: DownloadPipelineArtifact@2 - displayName: 'Download Final Jar' - inputs: - buildType: 'current' - artifactName: 'onnxruntime-java-gpu' - targetPath: '$(Build.BinariesDirectory)/final-jar' - - task: Bash@3 + - template: templates/flex-downloadPipelineArtifact.yml + parameters: + StepName: 'Download Final Jar' + ArtifactName: onnxruntime-java-gpu + TargetPath: '$(Build.BinariesDirectory)/final-jar' + SpecificArtifact: ${{ parameters.specificArtifact }} + BuildId: ${{ parameters.BuildId }} + + - template: templates/get-docker-image-steps.yml + parameters: + Dockerfile: tools/ci_build/github/linux/docker/Dockerfile.package_ubi8_cuda11_8_tensorrt8_6 + Context: tools/ci_build/github/linux/docker/ + DockerBuildArgs: "--build-arg BUILD_UID=$( id -u )" + Repository: onnxruntimeubi8packagestest + UpdateDepsTxt: false + + - bash: | + docker run --rm \ + --gpus all \ + --volume $(Build.SourcesDirectory):/onnxruntime_src \ + --volume $(Build.BinariesDirectory):/build \ + --volume 
/data/models:/build/models:ro \ + onnxruntimeubi8packagestest \ + /bin/bash /onnxruntime_src/tools/ci_build/github/linux/java_linux_final_test.sh -r /build -v $(OnnxRuntimeVersion) displayName: 'Test' - inputs: - targetType: filePath - filePath: 'tools/ci_build/github/linux/java_linux_final_test.sh' - arguments: '-r $(Build.BinariesDirectory) -v $(OnnxRuntimeVersion)' - template: templates/component-governance-component-detection-steps.yml parameters: diff --git a/tools/ci_build/github/azure-pipelines/linux-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/linux-ci-pipeline.yml index eb6b274f87d6b..21bc1c481b3e6 100644 --- a/tools/ci_build/github/azure-pipelines/linux-ci-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/linux-ci-pipeline.yml @@ -141,77 +141,39 @@ stages: " displayName: 'Dotnet build C# sln and Test' - - task: CmdLine@2 - displayName: 'Install python deps' - inputs: - script: | - set -e -x - python3 -m pip uninstall -y ort-nightly-gpu ort-nightly onnxruntime onnxruntime-gpu onnxruntime-training onnxruntime-directml ort-nightly-directml onnx -qq - cp $(Build.SourcesDirectory)/tools/ci_build/github/linux/docker/scripts/manylinux/requirements.txt $(Build.BinariesDirectory)/requirements.txt - # Test ORT with the latest ONNX release. - sed -i "s/git+http:\/\/github\.com\/onnx\/onnx.*/onnx/" $(Build.BinariesDirectory)/requirements.txt - python3 -m pip install -r $(Build.BinariesDirectory)/requirements.txt - mkdir $(Build.BinariesDirectory)/requirements_torch_cpu/ - cp $(Build.SourcesDirectory)/tools/ci_build/github/linux/docker/scripts/training/ortmodule/stage1/requirements_torch_cpu/requirements.txt $(Build.BinariesDirectory)/requirements_torch_cpu/requirements.txt - python3 -m pip install -r $(Build.BinariesDirectory)/requirements_torch_cpu/requirements.txt - - - task: CmdLine@2 - displayName: 'Install Release python package' - inputs: - script: | - rm -rf $(Build.BinariesDirectory)/Release/onnxruntime $(Build.BinariesDirectory)/Release/pybind11 - python3 -m pip install $(Build.BinariesDirectory)/Release/dist/*.whl - - - task: PythonScript@0 - displayName: 'Run Release unit tests' - inputs: - scriptPath: $(Build.SourcesDirectory)/tools/ci_build/build.py - workingDirectory: $(Build.BinariesDirectory)/Release - arguments: >- - --build_dir $(Build.BinariesDirectory) - --cmake_generator Ninja - --config Release - --test - --skip_submodule_sync - --build_shared_lib - --parallel - --build_wheel - --enable_onnx_tests - --enable_transformers_tool_test - --ctest_path "" - - - task: CmdLine@2 - displayName: 'Install Debug python package' - inputs: - script: | - set -e -x - rm -rf $(Build.BinariesDirectory)/Debug/onnxruntime $(Build.BinariesDirectory)/Debug/pybind11 - python3 -m pip uninstall -y ort-nightly-gpu ort-nightly onnxruntime onnxruntime-gpu onnxruntime-training onnxruntime-directml ort-nightly-directml -qq - python3 -m pip install $(Build.BinariesDirectory)/Debug/dist/*.whl - - - task: PythonScript@0 - displayName: 'Run Debug unit tests' - inputs: - scriptPath: $(Build.SourcesDirectory)/tools/ci_build/build.py - workingDirectory: $(Build.BinariesDirectory)/Debug - arguments: >- - --build_dir $(Build.BinariesDirectory) - --cmake_generator Ninja - --config Debug - --test - --skip_submodule_sync - --build_shared_lib - --parallel - --build_wheel - --enable_onnx_tests - --enable_transformers_tool_test - --ctest_path "" + - bash: | + mkdir -p $HOME/.onnx + docker run --rm \ + --volume /data/onnx:/data/onnx:ro \ + --volume $(Build.SourcesDirectory):/onnxruntime_src \ + 
--volume $(Build.BinariesDirectory):/build \ + --volume /data/models:/build/models:ro \ + --volume $HOME/.onnx:/home/onnxruntimedev/.onnx \ + -e ALLOW_RELEASED_ONNX_OPSET_ONLY=0 \ + -e NIGHTLY_BUILD \ + -e BUILD_BUILDNUMBER \ + onnxruntimecpubuild \ + /bin/bash -c " + set -ex; \ + /bin/bash /onnxruntime_src/tools/scripts/python_test.sh /onnxruntime_src /build Release && \ + /bin/bash /onnxruntime_src/tools/scripts/symbolic_shape_infer_test.sh /build + " + displayName: 'Run Release tests and symbolic shape infer test' - - task: PythonScript@0 - displayName: 'Symbolic shape infer' - inputs: - scriptPath: $(Build.BinariesDirectory)/Release/onnxruntime_test_python_symbolic_shape_infer.py - workingDirectory: $(Build.BinariesDirectory)/Release + - bash: | + mkdir -p $HOME/.onnx + docker run --rm \ + --volume /data/onnx:/data/onnx:ro \ + --volume $(Build.SourcesDirectory):/onnxruntime_src \ + --volume $(Build.BinariesDirectory):/build \ + --volume /data/models:/build/models:ro \ + --volume $HOME/.onnx:/home/onnxruntimedev/.onnx \ + -e ALLOW_RELEASED_ONNX_OPSET_ONLY=0 \ + -e NIGHTLY_BUILD \ + -e BUILD_BUILDNUMBER \ + onnxruntimecpubuild \ + /bin/bash /onnxruntime_src/tools/scripts/python_test.sh /onnxruntime_src /build Debug + displayName: 'Run Debug tests' - task: PublishTestResults@2 displayName: 'Publish unit test results' @@ -221,7 +183,6 @@ stages: testRunTitle: 'Unit Test Run' condition: succeededOrFailed() - - stage: arm64_build dependsOn: [] jobs: diff --git a/tools/ci_build/github/azure-pipelines/orttraining-linux-nightly-ortmodule-test-pipeline.yml b/tools/ci_build/github/azure-pipelines/orttraining-linux-nightly-ortmodule-test-pipeline.yml index a41ca5f02467d..7824bf2203efe 100644 --- a/tools/ci_build/github/azure-pipelines/orttraining-linux-nightly-ortmodule-test-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/orttraining-linux-nightly-ortmodule-test-pipeline.yml @@ -23,7 +23,7 @@ jobs: --rm \ --volume $(Build.SourcesDirectory)/orttraining/orttraining/test/python:/onnxruntime_src \ --volume $(Build.SourcesDirectory)/tools/ci_build/github/linux/docker/scripts/training/ortmodule/stage1/requirements_torch_nightly:/requirements_torch_nightly \ - ptebic.azurecr.io/internal/azureml/aifx/nightly-ubuntu2004-cu118-py38-torch210dev \ + ptebic.azurecr.io/internal/aifx/acpt/nightly-ubuntu-cuda-torch-dev \ bash -c "python3 -m pip install -r /requirements_torch_nightly/requirements.txt && python3 -m pytest -sv /onnxruntime_src/orttraining_test_ortmodule_api.py" displayName: 'Run ORTModule Tests' condition: succeededOrFailed() diff --git a/tools/ci_build/github/azure-pipelines/post-merge-jobs.yml b/tools/ci_build/github/azure-pipelines/post-merge-jobs.yml index 113b24f7579ac..61f9b37d4ce78 100644 --- a/tools/ci_build/github/azure-pipelines/post-merge-jobs.yml +++ b/tools/ci_build/github/azure-pipelines/post-merge-jobs.yml @@ -74,7 +74,6 @@ stages: isX86: false job_name_suffix: x64_RelWithDebInfo RunOnnxRuntimeTests: true - RunStaticCodeAnalysis: false ORT_EP_NAME: CUDA WITH_CACHE: true MachinePool: onnxruntime-Win2022-GPU-MultiA10 @@ -95,7 +94,6 @@ stages: isX86: false job_name_suffix: x64_RelWithDebInfo RunOnnxRuntimeTests: true - RunStaticCodeAnalysis: false ORT_EP_NAME: TRT WITH_CACHE: true MachinePool: onnxruntime-Win2022-GPU-MultiA10 @@ -114,7 +112,6 @@ stages: isX86: false job_name_suffix: x64_mimalloc RunOnnxRuntimeTests: true - RunStaticCodeAnalysis: false isTraining: false ORT_EP_NAME: CPU GenerateDocumentation: false @@ -134,7 +131,6 @@ stages: isX86: false job_name_suffix: 
x64_no_memory_profiling RunOnnxRuntimeTests: false - RunStaticCodeAnalysis: false isTraining: false ORT_EP_NAME: CPU GenerateDocumentation: false @@ -154,7 +150,6 @@ stages: isX86: false job_name_suffix: x64_minimal_no_exception RunOnnxRuntimeTests: true - RunStaticCodeAnalysis: false isTraining: false ORT_EP_NAME: CPU GenerateDocumentation: false @@ -174,7 +169,6 @@ stages: isX86: false job_name_suffix: x64_debug_node_input_output RunOnnxRuntimeTests: true - RunStaticCodeAnalysis: false isTraining: false ORT_EP_NAME: CPU GenerateDocumentation: false diff --git a/tools/ci_build/github/azure-pipelines/templates/compliance.yml b/tools/ci_build/github/azure-pipelines/templates/compliance.yml index f4bce8c53605b..cc451425be42a 100644 --- a/tools/ci_build/github/azure-pipelines/templates/compliance.yml +++ b/tools/ci_build/github/azure-pipelines/templates/compliance.yml @@ -18,27 +18,6 @@ steps: AnalyzeTargetGlob: '+:file|$(Build.ArtifactStagingDirectory)\**\*.dll;-:file|$(Build.ArtifactStagingDirectory)\**\DirectML.dll' continueOnError: true -- task: DeleteFiles@1 - displayName: 'Delete files from $(Build.BinariesDirectory)\RelWithDebInfo' - inputs: - SourceFolder: '$(Build.BinariesDirectory)\RelWithDebInfo' - Contents: | - **/*.obj - **/*.pdb - **/*.dll - -# Manually set msBuildCommandline so that we can also set CAExcludePath -- task: SDLNativeRules@3 - displayName: 'Run the PREfast SDL Native Rules for MSBuild' - inputs: - userProvideBuildInfo: msBuildInfo - msBuildArchitecture: x64 - msBuildVersion: 17.0 - msBuildCommandline: '"C:\Program Files\Microsoft Visual Studio\2022\Enterprise\MSBuild\Current\Bin\amd64\msbuild.exe" "$(Build.BinariesDirectory)\RelWithDebInfo\onnxruntime.sln" /p:platform="${{parameters.msbuildPlatform}}" /p:configuration="RelWithDebInfo" /p:CAExcludePath="$(Build.BinariesDirectory);$(Build.SourcesDirectory)\cmake;C:\program files (x86)" /p:VisualStudioVersion="17.0" /m /p:PreferredToolArchitecture=x64' - excludedPaths: '$(Build.SourcesDirectory)\b#$(Build.SourcesDirectory)\cmake#C:\program files#C:\program files (x86)#C:\program files' - rulesetName: Custom - customRuleset: $(Build.SourcesDirectory)\cmake\Sdl.ruleset - - task: SdtReport@2 displayName: 'Create Security Analysis Report' inputs: diff --git a/tools/ci_build/github/azure-pipelines/templates/flex-downloadPipelineArtifact.yml b/tools/ci_build/github/azure-pipelines/templates/flex-downloadPipelineArtifact.yml index 0f4e0553d05bf..a83451a1b33d9 100644 --- a/tools/ci_build/github/azure-pipelines/templates/flex-downloadPipelineArtifact.yml +++ b/tools/ci_build/github/azure-pipelines/templates/flex-downloadPipelineArtifact.yml @@ -18,7 +18,7 @@ parameters: steps: - task: DownloadPipelineArtifact@2 - displayName: ${{ parameters.StepName }}} + displayName: ${{ parameters.StepName }} inputs: artifactName: ${{ parameters.ArtifactName}} targetPath: '${{ parameters.TargetPath }}' diff --git a/tools/ci_build/github/azure-pipelines/templates/jobs/win-ci-vs-2022-job.yml b/tools/ci_build/github/azure-pipelines/templates/jobs/win-ci-vs-2022-job.yml index 67a03beab9362..46f2ae7b97acc 100644 --- a/tools/ci_build/github/azure-pipelines/templates/jobs/win-ci-vs-2022-job.yml +++ b/tools/ci_build/github/azure-pipelines/templates/jobs/win-ci-vs-2022-job.yml @@ -34,11 +34,6 @@ parameters: type: boolean default: true -- name: RunStaticCodeAnalysis - displayName: Run Static Code Analysis - type: boolean - default: true - - name: ORT_EP_NAME type: string @@ -220,49 +215,6 @@ jobs: workingDirectory: '$(Build.BinariesDirectory)\${{ 
parameters.BuildConfig }}\${{ parameters.BuildConfig }}' displayName: 'Run tests' - - - ${{ if eq(parameters.RunStaticCodeAnalysis, true) }}: - - task: DeleteFiles@1 - displayName: 'Delete binaries files from $(Build.BinariesDirectory)\RelWithDebInfo' - inputs: - SourceFolder: '$(Build.BinariesDirectory)\RelWithDebInfo' - Contents: | - **/*.obj - **/*.pdb - **/*.dll - - - # Manually set msBuildCommandline so that we can also set CAExcludePath - # build_dir must be a sub folder of $(Build.SourcesDirectory) - # TODO: move this step to a CPU-only machine to save GPU resources. - - task: SDLNativeRules@3 - displayName: 'Run the PREfast SDL Native Rules for MSBuild' - inputs: - msBuildArchitecture: amd64 - setupCommandlines: 'python $(Build.SourcesDirectory)\tools\ci_build\build.py --config RelWithDebInfo --build_dir $(Build.SourcesDirectory)\b --skip_submodule_sync --build_shared_lib --update --cmake_generator "Visual Studio 17 2022" --build_shared_lib --enable_onnx_tests ${{ parameters.additionalBuildFlags }} --cmake_extra_defines onnxruntime_ENABLE_STATIC_ANALYSIS=ON onnxruntime_ENABLE_LTO=OFF' - msBuildCommandline: '"C:\Program Files\Microsoft Visual Studio\2022\Enterprise\MSBuild\Current\Bin\amd64\msbuild.exe" "$(Build.SourcesDirectory)\b\RelWithDebInfo\onnxruntime.sln" /p:RunCodeAnalysis=true /p:platform=${{ parameters.msbuildPlatform }} /p:configuration=RelWithDebInfo /p:VisualStudioVersion="17.0" /m /p:PreferredToolArchitecture=x64' - excludedPaths: '$(Build.SourcesDirectory)\b#$(Build.SourcesDirectory)\cmake#C:\program files#C:\program files (x86)#C:\program files' - rulesetName: Custom - customRuleset: $(Build.SourcesDirectory)\cmake\Sdl.ruleset - publishXML: true - - - task: SdtReport@2 - displayName: 'Create Security Analysis Report' - inputs: - SDLNativeRules: true - - - task: PublishSecurityAnalysisLogs@3 - displayName: 'Publish Security Analysis Logs' - continueOnError: true - - - task: PostAnalysis@2 - displayName: 'Guardian Break v2' - inputs: - GdnBreakGdnToolSDLNativeRulesSeverity: Note - GdnBreakGdnToolSDLNativeRules: true - - - - ${{ if eq(parameters.RunOnnxRuntimeTests, true) }}: - task: PublishTestResults@2 displayName: 'Publish unit test results' inputs: diff --git a/tools/ci_build/github/azure-pipelines/templates/py-packaging-stage.yml b/tools/ci_build/github/azure-pipelines/templates/py-packaging-stage.yml index 8812d4ed91ae7..1305f5ae21725 100644 --- a/tools/ci_build/github/azure-pipelines/templates/py-packaging-stage.yml +++ b/tools/ci_build/github/azure-pipelines/templates/py-packaging-stage.yml @@ -246,24 +246,6 @@ stages: workingDirectory: '$(Build.BinariesDirectory)\$(BuildConfig)\$(BuildConfig)' displayName: 'Run Python Tests' - #Skip it for 32 bits x86 build. Currently the scan tool has a bug: it doesn't allow me use 64 bits link.exe - #in 32 bits Win32 build. I tried all the settings but they all don't work. 
- - task: SDLNativeRules@3 - displayName: 'Run the PREfast SDL Native Rules for MSBuild' - condition: and (succeeded(), and(eq(variables['buildArch'], 'x64'), eq(variables['PythonVersion'], '3.8'))) - inputs: - msBuildArchitecture: amd64 - setupCommandlines: 'python $(Build.SourcesDirectory)\tools\ci_build\build.py --config Debug --build_dir $(Build.SourcesDirectory)\b --skip_submodule_sync --cmake_generator "Visual Studio 17 2022" --enable_pybind --enable_onnx_tests --parallel $(TelemetryOption) --update --cmake_extra_defines onnxruntime_ENABLE_STATIC_ANALYSIS=ON onnxruntime_ENABLE_LTO=OFF' - msBuildCommandline: '"C:\Program Files\Microsoft Visual Studio\2022\Enterprise\MSBuild\Current\Bin\amd64\msbuild.exe" "$(Build.SourcesDirectory)\b\Debug\onnxruntime.sln" /p:RunCodeAnalysis=true /p:platform="$(MsbuildPlatform)" /p:configuration=Debug /p:VisualStudioVersion="17.0" /m /p:PreferredToolArchitecture=x64' - excludedPaths: '$(Build.SourcesDirectory)\b#$(Build.SourcesDirectory)\cmake#C:\program files#C:\program files (x86)#C:\program files' - rulesetName: Custom - customRuleset: $(Build.SourcesDirectory)\cmake\Sdl.ruleset - - - task: SdtReport@2 - displayName: 'Create Security Analysis Report' - inputs: - SDLNativeRules: true - - task: TSAUpload@2 displayName: 'TSA upload' condition: and(and (succeeded(), and(eq(variables['buildArch'], 'x64'), eq(variables['PythonVersion'], '3.8'))), eq(variables['Build.SourceBranch'], 'refs/heads/main')) diff --git a/tools/ci_build/github/azure-pipelines/templates/py-win-gpu.yml b/tools/ci_build/github/azure-pipelines/templates/py-win-gpu.yml index ef938a634554a..919749cac15b6 100644 --- a/tools/ci_build/github/azure-pipelines/templates/py-win-gpu.yml +++ b/tools/ci_build/github/azure-pipelines/templates/py-win-gpu.yml @@ -22,65 +22,6 @@ parameters: default: '' jobs: -- ${{ if eq(parameters.PYTHON_VERSION, '3.8') }}: - - job: Win_py_${{ parameters.EP_NAME }}_Wheels_StaticAnalysis - timeoutInMinutes: 240 - workspace: - clean: all - pool: onnxruntime-Win-CPU-2022 - steps: - - checkout: self - clean: true - submodules: none - - task: UsePythonVersion@0 - inputs: - versionSpec: 3.8 - addToPath: true - architecture: 'x64' - - task: onebranch.pipeline.tsaoptions@1 - displayName: 'OneBranch TSAOptions' - inputs: - tsaConfigFilePath: '$(Build.SourcesDirectory)\.config\tsaoptions.json' - appendSourceBranchName: false - - - template: download-deps.yml - - - template: jobs/set-winenv.yml - parameters: - EnvSetupScript: ${{ parameters.ENV_SETUP_SCRIPT }} - DownloadCUDA: true - - - task: PythonScript@0 - displayName: 'Update deps.txt' - inputs: - scriptPath: $(Build.SourcesDirectory)/tools/ci_build/replace_urls_in_deps.py - arguments: --new_dir $(Build.BinariesDirectory)/deps - workingDirectory: $(Build.BinariesDirectory) - - - task: SDLNativeRules@3 - displayName: 'Run the PREfast SDL Native Rules for MSBuild' - inputs: - msBuildArchitecture: amd64 - setupCommandlines: 'python $(Build.SourcesDirectory)\tools\ci_build\build.py --config Debug --build_dir $(Build.SourcesDirectory)\b --skip_submodule_sync --cmake_generator "Visual Studio 17 2022" --enable_pybind ${{ parameters.BUILD_PY_PARAMETERS }} ${{ parameters.EP_BUILD_FLAGS }} --update --cmake_extra_defines onnxruntime_ENABLE_STATIC_ANALYSIS=ON onnxruntime_ENABLE_LTO=OFF' - msBuildCommandline: '"C:\Program Files\Microsoft Visual Studio\2022\Enterprise\MSBuild\Current\Bin\amd64\msbuild.exe" "$(Build.SourcesDirectory)\b\Debug\onnxruntime.sln" /p:RunCodeAnalysis=true /p:platform=x64 /p:configuration=Debug 
/p:VisualStudioVersion="17.0" /m /p:PreferredToolArchitecture=x64' - excludedPaths: '$(Build.SourcesDirectory)\b#$(Build.SourcesDirectory)\cmake#C:\program files#C:\program files (x86)#C:\program files' - rulesetName: Custom - customRuleset: $(Build.SourcesDirectory)\cmake\Sdl.ruleset - publishXML: true - - - task: SdtReport@2 - displayName: 'Create Security Analysis Report' - inputs: - SDLNativeRules: true - - - task: TSAUpload@2 - displayName: 'TSA upload' - condition: and (succeeded(), eq(variables['Build.SourceBranch'], 'refs/heads/main')) - inputs: - GdnPublishTsaOnboard: false - GdnPublishTsaConfigFile: '$(Build.sourcesDirectory)\.gdn\.gdntsa' - - - job: Win_py_${{ parameters.EP_NAME }}_Wheels_${{ replace(parameters.PYTHON_VERSION,'.','_') }} timeoutInMinutes: 240 workspace: diff --git a/tools/ci_build/github/azure-pipelines/templates/rocm.yml b/tools/ci_build/github/azure-pipelines/templates/rocm.yml index fe0f2c3791e72..cc2e8745e8946 100644 --- a/tools/ci_build/github/azure-pipelines/templates/rocm.yml +++ b/tools/ci_build/github/azure-pipelines/templates/rocm.yml @@ -91,7 +91,7 @@ jobs: --enable_training \ --cmake_extra_defines \ CMAKE_HIP_COMPILER=/opt/rocm/llvm/bin/clang++ \ - onnxruntime_BUILD_UNIT_TESTS=OFF \ + onnxruntime_BUILD_UNIT_TESTS=OFF FETCHCONTENT_TRY_FIND_PACKAGE_MODE=NEVER \ ${{ variables['EnableProfiling'] }} workingDirectory: $(Build.SourcesDirectory) displayName: 'Build onnxruntime (in container)' diff --git a/tools/ci_build/github/azure-pipelines/templates/stages/mac-ios-packaging-build-stage.yml b/tools/ci_build/github/azure-pipelines/templates/stages/mac-ios-packaging-build-stage.yml index 2484facfae33e..81f17a26b16a6 100644 --- a/tools/ci_build/github/azure-pipelines/templates/stages/mac-ios-packaging-build-stage.yml +++ b/tools/ci_build/github/azure-pipelines/templates/stages/mac-ios-packaging-build-stage.yml @@ -116,7 +116,8 @@ stages: xcodeDeveloperDir: '/Applications/Xcode_${{ variables.xcodeVersion }}.app/Contents/Developer' signingOption: 'manual' signingIdentity: '$(APPLE_CERTIFICATE_SIGNING_IDENTITY)' - provisioningProfileName: 'iOS Team Provisioning Profile' + provisioningProfileName: 'temporary *' # temporary name, change it back to the original below later + #provisioningProfileName: 'iOS Team Provisioning Profile' args: '-derivedDataPath $(Build.BinariesDirectory)/app_center_test/ios_package_test/DerivedData' workingDirectory: '$(Build.BinariesDirectory)/app_center_test/ios_package_test/' useXcpretty: false # xcpretty can hide useful error output so we will disable it diff --git a/tools/ci_build/github/azure-pipelines/templates/win-ci.yml b/tools/ci_build/github/azure-pipelines/templates/win-ci.yml index f6da7bb857b7d..80d285f3fd3fb 100644 --- a/tools/ci_build/github/azure-pipelines/templates/win-ci.yml +++ b/tools/ci_build/github/azure-pipelines/templates/win-ci.yml @@ -263,25 +263,6 @@ stages: AnalyzeTargetGlob: '$(Build.BinariesDirectory)\RelWithDebInfo\RelWithDebInfo\**\*.dll' continueOnError: true - - task: DeleteFiles@1 - displayName: 'Delete files from $(Build.BinariesDirectory)\RelWithDebInfo' - inputs: - SourceFolder: '$(Build.BinariesDirectory)\RelWithDebInfo' - Contents: | - **/*.obj - **/*.pdb - **/*.dll - - #Manually set msBuildCommandline so that we can also set CAExcludePath - - task: SDLNativeRules@3 - displayName: 'Run the PREfast SDL Native Rules for MSBuild' - condition: and (succeeded(), eq(variables['msbuildPlatform'], 'x64')) - inputs: - msBuildArchitecture: amd64 - setupCommandlines: 'python 
$(Build.SourcesDirectory)\tools\ci_build\build.py --config Debug --disable_rtti --build_dir $(Build.BinariesDirectory) --skip_submodule_sync --build_shared_lib --update --cmake_generator "$(VSGenerator)" --enable_onnx_tests $(TelemetryOption) ${{ parameters.buildparameter }} --cmake_extra_defines onnxruntime_ENABLE_STATIC_ANALYSIS=ON' - msBuildCommandline: '"C:\Program Files\Microsoft Visual Studio\2022\Enterprise\MSBuild\Current\Bin\amd64\msbuild.exe" "$(Build.BinariesDirectory)\Debug\onnxruntime.sln" /p:platform="$(MsbuildPlatform)" /p:configuration=Debug /p:VisualStudioVersion="17.0" /m /p:PreferredToolArchitecture=x64' - excludedPaths: '$(Build.BinariesDirectory)#$(Build.SourcesDirectory)\cmake#C:\program files (x86)' - - task: PostAnalysis@2 inputs: GdnBreakAllTools: false diff --git a/tools/ci_build/github/azure-pipelines/templates/win-web-ci.yml b/tools/ci_build/github/azure-pipelines/templates/win-web-ci.yml index 713396dd64532..bad7448715936 100644 --- a/tools/ci_build/github/azure-pipelines/templates/win-web-ci.yml +++ b/tools/ci_build/github/azure-pipelines/templates/win-web-ci.yml @@ -161,7 +161,7 @@ jobs: displayName: 'Run ort-web tests (wasm,webgl,xnnpack backend)' condition: ne('${{ parameters.RunWebGpuTests }}', 'true') - script: | - npm test -- -e=edge -b=webgl,wasm,xnnpack,webgpu + npm test -- -e=edge -b=webgl,wasm,xnnpack,webgpu --chromium-flags=--ignore-gpu-blocklist --chromium-flags=--gpu-vendor-id=0x10de workingDirectory: '$(Build.SourcesDirectory)\js\web' displayName: 'Run ort-web tests (ALL backends)' condition: ne('${{ parameters.RunWebGpuTests }}', 'false') diff --git a/tools/ci_build/github/azure-pipelines/win-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/win-ci-pipeline.yml index b9b833a3155bf..2a5622faf2905 100644 --- a/tools/ci_build/github/azure-pipelines/win-ci-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/win-ci-pipeline.yml @@ -47,7 +47,6 @@ stages: isX86: false job_name_suffix: x64_debug RunOnnxRuntimeTests: ${{ parameters.RunOnnxRuntimeTests }} - RunStaticCodeAnalysis: false isTraining: false ORT_EP_NAME: CPU GenerateDocumentation: false @@ -69,7 +68,6 @@ stages: isX86: false job_name_suffix: x64_release RunOnnxRuntimeTests: ${{ parameters.RunOnnxRuntimeTests }} - RunStaticCodeAnalysis: false isTraining: false ORT_EP_NAME: CPU GenerateDocumentation: false @@ -89,7 +87,6 @@ stages: isX86: false job_name_suffix: x64_release RunOnnxRuntimeTests: ${{ parameters.RunOnnxRuntimeTests }} - RunStaticCodeAnalysis: false isTraining: false ORT_EP_NAME: DNNL GenerateDocumentation: false @@ -111,7 +108,6 @@ stages: isX86: false job_name_suffix: x64_release RunOnnxRuntimeTests: true - RunStaticCodeAnalysis: false isTraining: false ORT_EP_NAME: XNNPACK GenerateDocumentation: false @@ -132,7 +128,6 @@ stages: job_name_suffix: x64_release_winml RunOnnxRuntimeTests: ${{ parameters.RunOnnxRuntimeTests }} # WinML has many warnings - RunStaticCodeAnalysis: false EnablePython: false isTraining: false ORT_EP_NAME: CPU @@ -153,7 +148,6 @@ stages: isX86: true job_name_suffix: x86_release RunOnnxRuntimeTests: ${{ parameters.RunOnnxRuntimeTests }} - RunStaticCodeAnalysis: false isTraining: false ORT_EP_NAME: CPU GenerateDocumentation: false @@ -173,7 +167,6 @@ stages: isX86: false job_name_suffix: training_x64_debug RunOnnxRuntimeTests: ${{ parameters.RunOnnxRuntimeTests }} - RunStaticCodeAnalysis: false isTraining: true ORT_EP_NAME: CPU GenerateDocumentation: false @@ -193,7 +186,6 @@ stages: isX86: false job_name_suffix: training_x64_release 
RunOnnxRuntimeTests: ${{ parameters.RunOnnxRuntimeTests }} - RunStaticCodeAnalysis: true isTraining: true ORT_EP_NAME: CPU GenerateDocumentation: false @@ -213,7 +205,6 @@ stages: isX86: false job_name_suffix: ort_training_apis_x64_release RunOnnxRuntimeTests: ${{ parameters.RunOnnxRuntimeTests }} - RunStaticCodeAnalysis: false EnablePython: false isTraining: true ORT_EP_NAME: CPU @@ -234,7 +225,6 @@ stages: isX86: false job_name_suffix: x64_release_azure RunOnnxRuntimeTests: ${{ parameters.RunOnnxRuntimeTests }} - RunStaticCodeAnalysis: false EnablePython: false isTraining: false ORT_EP_NAME: CPU diff --git a/tools/ci_build/github/azure-pipelines/win-gpu-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/win-gpu-ci-pipeline.yml index 69e71c1266664..8796917afa37d 100644 --- a/tools/ci_build/github/azure-pipelines/win-gpu-ci-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/win-gpu-ci-pipeline.yml @@ -47,7 +47,6 @@ stages: isX86: false job_name_suffix: x64_RelWithDebInfo RunOnnxRuntimeTests: ${{ parameters.RunOnnxRuntimeTests }} - RunStaticCodeAnalysis: false ORT_EP_NAME: CUDA WITH_CACHE: true MachinePool: onnxruntime-Win2022-GPU-A10 @@ -65,7 +64,6 @@ stages: isX86: false job_name_suffix: x64_RelWithDebInfo RunOnnxRuntimeTests: ${{ parameters.RunOnnxRuntimeTests }} - RunStaticCodeAnalysis: false ORT_EP_NAME: CUDA WITH_CACHE: true # Some unit tests crash on A10 GPUs. So this job still needs to use T4. @@ -85,7 +83,6 @@ stages: isX86: false job_name_suffix: x64_RelWithDebInfo RunOnnxRuntimeTests: ${{ parameters.RunOnnxRuntimeTests }} - RunStaticCodeAnalysis: false ORT_EP_NAME: DML WITH_CACHE: true MachinePool: onnxruntime-Win2022-GPU-dml-A10 @@ -104,7 +101,6 @@ stages: isX86: false job_name_suffix: x64_RelWithDebInfo RunOnnxRuntimeTests: false - RunStaticCodeAnalysis: false GenerateDocumentation: true ORT_EP_NAME: CUDA # It doesn't really matter which EP is selected here since this stage is for documentation. WITH_CACHE: true diff --git a/tools/ci_build/github/linux/build_rocm_c_api_package.sh b/tools/ci_build/github/linux/build_rocm_c_api_package.sh index 4d0af63893643..957f1f8a812a5 100755 --- a/tools/ci_build/github/linux/build_rocm_c_api_package.sh +++ b/tools/ci_build/github/linux/build_rocm_c_api_package.sh @@ -40,7 +40,7 @@ docker run --rm \ --use_rocm --rocm_version=$ROCM_VERSION --rocm_home $ROCM_HOME --nccl_home $ROCM_HOME \ --build_shared_lib \ --skip_submodule_sync \ - --skip_tests \ + --skip_tests --cmake_extra_defines FETCHCONTENT_TRY_FIND_PACKAGE_MODE=NEVER EXIT_CODE=$? diff --git a/tools/ci_build/github/linux/docker/Dockerfile.package_ubi8_cuda11_8_tensorrt8_6 b/tools/ci_build/github/linux/docker/Dockerfile.package_ubi8_cuda11_8_tensorrt8_6 new file mode 100644 index 0000000000000..cdf504c8e3b03 --- /dev/null +++ b/tools/ci_build/github/linux/docker/Dockerfile.package_ubi8_cuda11_8_tensorrt8_6 @@ -0,0 +1,45 @@ +# -------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. 
+# -------------------------------------------------------------- +# Dockerfile to Test ONNX Runtime on UBI8 with CUDA 11.8 and TensorRT 8.6 + +# Build base image with required system packages +FROM nvidia/cuda:11.8.0-cudnn8-devel-ubi8 AS base + +ENV PATH /usr/local/nvidia/bin:/usr/local/cuda/bin:/usr/src/tensorrt/bin:${PATH} + +RUN dnf install -y bash wget &&\ + dnf clean dbcache + +# Install python3 +RUN dnf install -y \ + python3.8 \ + python38-pip \ + python38-wheel &&\ + cd /usr/local/bin &&\ + ln -s /usr/bin/python3 python3.8 &&\ + ln -s /usr/bin/pip3 pip3.8; + +RUN pip3 install --upgrade pip +RUN pip3 install "setuptools>=41.0.0" + +# Install TensorRT +RUN dnf install -y libnvinfer8 libnvonnxparsers8 libnvparsers8 libnvinfer-plugin8 libnvinfer-lean8 libnvinfer-vc-plugin8 libnvinfer-dispatch8 +RUN v="8.6.1.6-1+cuda11.8" &&\ + dnf downgrade -y libnvinfer8-${v} libnvonnxparsers8-${v} libnvparsers8-${v} libnvinfer-plugin8-${v} libnvinfer-lean8-${v} libnvinfer-vc-plugin8-${v} libnvinfer-dispatch8-${v} &&\ + dnf install -y dnf-plugin-versionlock &&\ + dnf versionlock libnvinfer8 libnvonnxparsers8 libnvparsers8 libnvinfer-plugin8 libnvinfer-lean8 libnvinfer-vc-plugin8 libnvinfer-dispatch8 +RUN dnf clean dbcache + + +ADD scripts /tmp/scripts +RUN cd /tmp/scripts && /tmp/scripts/install_dotnet.sh && /tmp/scripts/install_java.sh && rm -rf /tmp/scripts + +# Build final image from base. +FROM base as final +ARG BUILD_USER=onnxruntimedev +ARG BUILD_UID=1000 +RUN adduser --uid $BUILD_UID $BUILD_USER +WORKDIR /home/$BUILD_USER +USER $BUILD_USER diff --git a/tools/ci_build/github/linux/docker/scripts/install_java.sh b/tools/ci_build/github/linux/docker/scripts/install_java.sh new file mode 100755 index 0000000000000..d11e29f693b8b --- /dev/null +++ b/tools/ci_build/github/linux/docker/scripts/install_java.sh @@ -0,0 +1,12 @@ +#!/bin/bash +set -e -x + +if [ -f /etc/redhat-release ]; then + dnf install -y java-11-openjdk-devel \ + && dnf clean dbcache +elif [ -f /etc/os-release ]; then + apt-get update && apt-get install -y openjdk-11-jdk +else + echo "Unsupported OS" + exit 1 +fi diff --git a/tools/python/run_CIs_for_external_pr.py b/tools/python/run_CIs_for_external_pr.py index beee4efc74c30..dcc6a92d84ef2 100644 --- a/tools/python/run_CIs_for_external_pr.py +++ b/tools/python/run_CIs_for_external_pr.py @@ -71,6 +71,7 @@ def main(): pipelines = [ # windows "Windows ARM64 QNN CI Pipeline", + "Windows x64 QNN CI Pipeline", "Windows CPU CI Pipeline", "Windows GPU CI Pipeline", "Windows GPU TensorRT CI Pipeline", diff --git a/tools/scripts/python_test.sh b/tools/scripts/python_test.sh new file mode 100644 index 0000000000000..bfdd4663feede --- /dev/null +++ b/tools/scripts/python_test.sh @@ -0,0 +1,28 @@ +#!/bin/bash + +set -ex + +export src_dir=$1 +export build_dir=$2 +export config=$3 + +# it's for manylinux image +export PATH=/opt/python/cp38-cp38/bin:$PATH + +echo Install Python Deps +cp $src_dir/tools/ci_build/github/linux/docker/scripts/manylinux/requirements.txt $build_dir/requirements.txt + +python3 -m pip install -r $build_dir/requirements.txt +mkdir -p $build_dir/requirements_torch_cpu/ +cp $src_dir/tools/ci_build/github/linux/docker/scripts/training/ortmodule/stage1/requirements_torch_cpu/requirements.txt $build_dir/requirements_torch_cpu/requirements.txt +python3 -m pip install -r $build_dir/requirements_torch_cpu/requirements.txt +python3 -m pip list | grep onnx + +echo Install $config python package +rm -rf $build_dir/$config/onnxruntime $build_dir/$config/pybind11 +python3 
-m pip install $build_dir/$config/dist/*.whl + +echo Run $config unit tests +pushd $build_dir/$config/ +python3 $src_dir/tools/ci_build/build.py --build_dir $build_dir --cmake_generator Ninja --config $config --test --skip_submodule_sync --build_shared_lib --parallel --build_wheel --enable_onnx_tests --enable_transformers_tool_test --ctest_path "" +popd diff --git a/tools/scripts/symbolic_shape_infer_test.sh b/tools/scripts/symbolic_shape_infer_test.sh new file mode 100644 index 0000000000000..d8d50c5e3fa91 --- /dev/null +++ b/tools/scripts/symbolic_shape_infer_test.sh @@ -0,0 +1,13 @@ +#!/bin/bash + +set -ex + +export build_dir=$1 + +# it's for manylinux image +export PATH=/opt/python/cp38-cp38/bin:$PATH + +echo Run symbolic shape infer test +pushd $build_dir/Release/ +python3 /build/Release/onnxruntime_test_python_symbolic_shape_infer.py +popd diff --git a/winml/lib/Api.Ort/OnnxruntimeEngine.h b/winml/lib/Api.Ort/OnnxruntimeEngine.h index 5974d46b82c4f..eae7dc37941c7 100644 --- a/winml/lib/Api.Ort/OnnxruntimeEngine.h +++ b/winml/lib/Api.Ort/OnnxruntimeEngine.h @@ -3,6 +3,7 @@ #include "iengine.h" #include "UniqueOrtPtr.h" +#include "core/common/gsl.h" #include #include
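Editor's note on the Python IOBinding test changes above: the tests now iterate over a device / execution-provider table instead of hard-coding "cuda", so the same binding flow is exercised on CUDA and DML builds (matching the build.py change that runs onnxruntime_test_python_iobinding.py when either --use_cuda or --use_dml is set). The sketch below shows that device-agnostic flow in isolation; it assumes the mul_1.onnx test model with input "X" and output "Y" of shape [3, 2], and a simplified two-column test_params table as a hypothetical stand-in for the test suite's own helpers.

import numpy as np
import onnxruntime as onnxrt

# Hypothetical, simplified device/provider table; the real tests carry a third column.
test_params = [
    ("cuda", "CUDAExecutionProvider"),
    ("dml", "DmlExecutionProvider"),
]

def run_iobinding_once(model_path, x, device):
    # Create the session with whatever providers are available in this build.
    session = onnxrt.InferenceSession(model_path, providers=onnxrt.get_available_providers())
    io_binding = session.io_binding()

    # Copy the input onto the target device (device id 0) and bind it by buffer pointer.
    input_val = onnxrt.OrtValue.ortvalue_from_numpy(x, device, 0)
    io_binding.bind_input("X", device, 0, np.float32, list(x.shape), input_val.data_ptr())

    # Let ORT allocate the output on the same device.
    io_binding.bind_output("Y", device)

    io_binding.synchronize_inputs()    # sync if inputs were produced on a different stream
    session.run_with_iobinding(io_binding)
    io_binding.synchronize_outputs()   # sync before reading the outputs back

    # Copy the device-resident output back to the host as a numpy array.
    return io_binding.copy_outputs_to_cpu()[0]

if __name__ == "__main__":
    x = np.array([[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]], dtype=np.float32)
    for device, provider in test_params:
        if provider in onnxrt.get_available_providers():
            print(device, run_iobinding_once("mul_1.onnx", x, device))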
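Editor's note on the new wasm export OrtAddFreeDimensionOverride above: it forwards to the C API's AddFreeDimensionOverrideByName, which pins a named free dimension (dim_param) of a model input to a concrete value at session-creation time. As a rough illustration of what the setting does, the equivalent Python session option is sketched below; the model path and dimension name are placeholders.

import onnxruntime as onnxrt

so = onnxrt.SessionOptions()
# Pin the symbolic dimension named "batch_size" to 1 before the session is created;
# this mirrors AddFreeDimensionOverrideByName in the C API.
so.add_free_dimension_override_by_name("batch_size", 1)
session = onnxrt.InferenceSession("model.onnx", sess_options=so)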