diff --git a/.github/actions/rust-toolchain-setup/action.yml b/.github/actions/rust-toolchain-setup/action.yml
deleted file mode 100644
index bf73fede16c7f..0000000000000
--- a/.github/actions/rust-toolchain-setup/action.yml
+++ /dev/null
@@ -1,44 +0,0 @@
-# yaml-language-server: $schema=https://json.schemastore.org/github-action.json
-
-name: 'Rust toolchain setup'
-description: 'Common setup steps for GitHub workflows for Rust projects'
-
-runs:
- using: composite
- steps:
- - uses: dtolnay/rust-toolchain@1.71.0
- with:
- components: clippy, rustfmt
- - uses: extractions/setup-just@v1
- with:
- just-version: '1.15.0' # optional semver specification, otherwise latest
-
- ###
- ### Linux setup
- ###
- - name: rustup
- # We need to use the nightly rust tool change to enable registry-auth / to connect to ADO feeds.
- if: ${{ (runner.os == 'Linux') }}
- run: |
- rustup set profile minimal
- rustup install
- shell: bash
- # - name: Cargo login
- # if: ${{ (runner.os == 'Linux') }}
- # run: just cargo-login-ci
- # shell: bash
-
- ###
- ### Windows setup
- ###
- - name: rustup
- # We need to use the nightly rust tool change to enable registry-auth / to connect to ADO feeds.
- if: ${{ (runner.os == 'Windows') }}
- run: |
- rustup set profile minimal
- rustup install
- shell: pwsh
- # - name: Cargo login
- # if: ${{ (runner.os == 'Windows') }}
- # run: just cargo-login-ci-windows
- # shell: pwsh
diff --git a/.github/workflows/rust-ci.yml b/.github/workflows/rust-ci.yml
deleted file mode 100644
index 725c40c2ded53..0000000000000
--- a/.github/workflows/rust-ci.yml
+++ /dev/null
@@ -1,132 +0,0 @@
-name: Rust
-
-on: [pull_request]
-
-env:
- CARGO_TERM_COLOR: always
- RUST_LOG: onnxruntime=debug,onnxruntime-sys=debug
- RUST_BACKTRACE: 1
- MANIFEST_PATH: ${{ github.workspace }}/rust/Cargo.toml
-
-jobs:
- fmt:
- name: Rustfmt
- runs-on: ubuntu-latest
- steps:
- - uses: actions/checkout@v4
- - uses: ./.github/actions/rust-toolchain-setup
- - name: vendor onnxruntime source
- run: just vendor
- - name: fmt
- run: cargo fmt --all -- --check
-
- download:
- name: Download prebuilt ONNX Runtime archive from build.rs
- runs-on: ubuntu-latest
- env:
- ORT_RUST_STRATEGY: download
- steps:
- - uses: actions/checkout@v4
- - uses: ./.github/actions/rust-toolchain-setup
- - run: rustup target install x86_64-unknown-linux-gnu
- - run: rustup target install x86_64-apple-darwin
- - run: rustup target install i686-pc-windows-msvc
- - run: rustup target install x86_64-pc-windows-msvc
- # ******************************************************************
- - name: Download prebuilt archive (CPU, x86_64-unknown-linux-gnu)
- run: cargo build --target x86_64-unknown-linux-gnu --manifest-path ${{ env.MANIFEST_PATH }}
- - name: Verify prebuilt archive downloaded (CPU, x86_64-unknown-linux-gnu)
- run: ls -lh target/x86_64-unknown-linux-gnu/debug/build/onnxruntime-sys-*/out/onnxruntime-linux-x64-1.*.tgz
- # ******************************************************************
- - name: Download prebuilt archive (CPU, x86_64-apple-darwin)
- run: cargo build --target x86_64-apple-darwin --manifest-path ${{ env.MANIFEST_PATH }}
- - name: Verify prebuilt archive downloaded (CPU, x86_64-apple-darwin)
- run: ls -lh target/x86_64-apple-darwin/debug/build/onnxruntime-sys-*/out/onnxruntime-osx-x64-1.*.tgz
- # ******************************************************************
- - name: Download prebuilt archive (CPU, i686-pc-windows-msvc)
- run: cargo build --target i686-pc-windows-msvc --manifest-path ${{ env.MANIFEST_PATH }}
- - name: Verify prebuilt archive downloaded (CPU, i686-pc-windows-msvc)
- run: ls -lh target/i686-pc-windows-msvc/debug/build/onnxruntime-sys-*/out/onnxruntime-win-x86-1.*.zip
- # ******************************************************************
- - name: Download prebuilt archive (CPU, x86_64-pc-windows-msvc)
- run: cargo build --target x86_64-pc-windows-msvc --manifest-path ${{ env.MANIFEST_PATH }}
- - name: Verify prebuilt archive downloaded (CPU, x86_64-pc-windows-msvc)
- run: ls -lh target/x86_64-pc-windows-msvc/debug/build/onnxruntime-sys-*/out/onnxruntime-win-x64-1.*.zip
- # ******************************************************************
- - name: Download prebuilt archive (GPU, x86_64-unknown-linux-gnu)
- env:
- ORT_USE_CUDA: "yes"
- run: cargo build --target x86_64-unknown-linux-gnu --manifest-path ${{ env.MANIFEST_PATH }}
- - name: Verify prebuilt archive downloaded (GPU, x86_64-unknown-linux-gnu)
- run: ls -lh target/x86_64-unknown-linux-gnu/debug/build/onnxruntime-sys-*/out/onnxruntime-linux-x64-gpu-1.*.tgz
- # ******************************************************************
- - name: Download prebuilt archive (GPU, x86_64-pc-windows-msvc)
- env:
- ORT_USE_CUDA: "yes"
- run: cargo build --target x86_64-pc-windows-msvc --manifest-path ${{ env.MANIFEST_PATH }}
- - name: Verify prebuilt archive downloaded (GPU, x86_64-pc-windows-msvc)
- run: ls -lh target/x86_64-pc-windows-msvc/debug/build/onnxruntime-sys-*/out/onnxruntime-win-gpu-x64-1.*.zip
-
- test:
- name: Test Suite
- runs-on: ${{ matrix.os }}
- strategy:
- fail-fast: false
- matrix:
- target:
- [
- x86_64-unknown-linux-gnu,
- x86_64-apple-darwin,
- x86_64-pc-windows-msvc,
- i686-pc-windows-msvc,
- ]
- include:
- - target: x86_64-unknown-linux-gnu
- os: ubuntu-latest
- - target: x86_64-apple-darwin
- os: macos-latest
- - target: x86_64-pc-windows-msvc
- os: windows-latest
- - target: i686-pc-windows-msvc
- os: windows-latest
- env:
- CARGO_BUILD_TARGET: ${{ matrix.target }}
- steps:
- - uses: actions/checkout@v4
- - uses: ./.github/actions/rust-toolchain-setup
- - name: vendor onnxruntime source
- run: just vendor
- - run: rustup target install ${{ matrix.target }}
- - name: Install additional packages (macOS)
- if: contains(matrix.target, 'x86_64-apple-darwin')
- run: brew install libomp
- - name: Build (cargo build)
- run: cargo build --all --manifest-path ${{ env.MANIFEST_PATH }}
- - name: Build tests (cargo test)
- run: cargo test --no-run --manifest-path ${{ env.MANIFEST_PATH }}
- - name: Build onnxruntime with 'model-fetching' feature
- run: cargo build --manifest-path ${{ env.MANIFEST_PATH }} --features model-fetching
- - name: Test onnxruntime-sys
- run: cargo build --package onnxruntime-sys -- --test-threads=1 --nocapture
- - name: Test onnxruntime
- run: cargo test --manifest-path ${{ env.MANIFEST_PATH }} --features model-fetching -- --test-threads=1 --nocapture
-
- clippy:
- name: Clippy
- runs-on: ubuntu-latest
- steps:
- - uses: actions/checkout@v4
- - uses: ./.github/actions/rust-toolchain-setup
- - name: vendor onnxruntime source
- run: just vendor
- - run: clippy --all-features --manifest-path ${{ env.MANIFEST_PATH }} -- -D warnings
-
- package-sys:
- name: Package onnxruntime-sys
- runs-on: ubuntu-latest
- steps:
- - uses: actions/checkout@v4
- - uses: ./.github/actions/rust-toolchain-setup
- - name: vendor onnxruntime source
- run: just vendor
- - run: cargo package --allow-dirty --package onnxruntime-sys
diff --git a/.pipelines/OneBranch.Nuget-WindowsAI-Pipeline.Official.yml b/.pipelines/OneBranch.Nuget-WindowsAI-Pipeline.Official.yml
index b9de1b79e1d51..67f9d8b0ce392 100644
--- a/.pipelines/OneBranch.Nuget-WindowsAI-Pipeline.Official.yml
+++ b/.pipelines/OneBranch.Nuget-WindowsAI-Pipeline.Official.yml
@@ -53,10 +53,6 @@ extends:
BuildArch: x86
PythonPackageName: pythonx86
- - template: .pipelines/windowsai-steps.yml@self
- parameters:
- BuildArch: arm
-
- template: .pipelines/windowsai-steps.yml@self
parameters:
BuildArch: arm64
@@ -72,11 +68,6 @@ extends:
PythonPackageName: pythonx86
Runtime: static
- - template: .pipelines/windowsai-steps.yml@self
- parameters:
- BuildArch: arm
- Runtime: static
-
- template: .pipelines/windowsai-steps.yml@self
parameters:
BuildArch: arm64
@@ -94,11 +85,9 @@ extends:
dependsOn:
- Windows_Packaging_x64_dynamic
- Windows_Packaging_x86_dynamic
- - Windows_Packaging_arm_dynamic
- Windows_Packaging_arm64_dynamic
- Windows_Packaging_x64_static
- Windows_Packaging_x86_static
- - Windows_Packaging_arm_static
- Windows_Packaging_arm64_static
condition: succeeded()
steps:
@@ -120,12 +109,6 @@ extends:
artifactName: 'drop_Windows_Build_Windows_Packaging_arm64_dynamic'
targetPath: '$(Build.BinariesDirectory)/nuget-artifact-arm64'
- - task: DownloadPipelineArtifact@0
- displayName: 'Download Pipeline Artifact - NuGet DirectML arm'
- inputs:
- artifactName: 'drop_Windows_Build_Windows_Packaging_arm_dynamic'
- targetPath: '$(Build.BinariesDirectory)/nuget-artifact-arm'
-
- task: DownloadPipelineArtifact@0
displayName: 'Download Pipeline Artifact - NuGet DirectML x64 StaticRuntime'
inputs:
@@ -144,12 +127,6 @@ extends:
artifactName: 'drop_Windows_Build_Windows_Packaging_arm64_static'
targetPath: '$(Build.BinariesDirectory)/nuget-artifact-arm64-static-runtime'
- - task: DownloadPipelineArtifact@0
- displayName: 'Download Pipeline Artifact - NuGet DirectML arm StaticRuntime'
- inputs:
- artifactName: 'drop_Windows_Build_Windows_Packaging_arm_static'
- targetPath: '$(Build.BinariesDirectory)/nuget-artifact-arm-static-runtime'
-
- task: PowerShell@2
displayName: 'Bundle NuGet and other binaries'
inputs:
@@ -194,17 +171,7 @@ extends:
$arm64_static_runtime_nupkg_unzipped_directory = [System.IO.Path]::Combine($arm64_static_runtime_nupkg_unzipped_directory_root, 'binaries', [System.IO.Path]::GetFileNameWithoutExtension($arm64_static_runtime_nuget_package))
[System.IO.Compression.ZipFile]::ExtractToDirectory($arm64_static_runtime_nuget_package, $arm64_static_runtime_nupkg_unzipped_directory)
- $nupkgs = (Get-ChildItem ..\nuget-artifact-arm -Filter Microsoft.AI.MachineLearning*.nupkg -Recurse)
- $arm_nuget_package = $nupkgs[0].FullName
- $arm_nupkg_unzipped_directory_root = $nupkgs[0].Directory.FullName
- $arm_nupkg_unzipped_directory = [System.IO.Path]::Combine($arm_nupkg_unzipped_directory_root, 'binaries', [System.IO.Path]::GetFileNameWithoutExtension($arm_nuget_package))
- [System.IO.Compression.ZipFile]::ExtractToDirectory($arm_nuget_package, $arm_nupkg_unzipped_directory)
-
- $nupkgs = (Get-ChildItem ..\nuget-artifact-arm-static-runtime -Filter Microsoft.AI.MachineLearning*.nupkg -Recurse)
- $arm_static_runtime_nuget_package = $nupkgs[0].FullName
- $arm_static_runtime_nupkg_unzipped_directory_root = $nupkgs[0].Directory.FullName
- $arm_static_runtime_nupkg_unzipped_directory = [System.IO.Path]::Combine($arm_static_runtime_nupkg_unzipped_directory_root, 'binaries', [System.IO.Path]::GetFileNameWithoutExtension($arm_static_runtime_nuget_package))
- [System.IO.Compression.ZipFile]::ExtractToDirectory($arm_static_runtime_nuget_package, $arm_static_runtime_nupkg_unzipped_directory)
+
$x64_static_runtime_path_old = [System.IO.Path]::Combine($x64_static_runtime_nupkg_unzipped_directory, 'runtimes', 'win-x64', '_native')
$x64_static_runtime_path_new = [System.IO.Path]::Combine($x64_nupkg_unzipped_directory, 'runtimes', 'win-x64', '_native', 'static')
@@ -216,10 +183,7 @@ extends:
$arm64_runtime_path_new = [System.IO.Path]::Combine($x64_nupkg_unzipped_directory, 'runtimes', 'win-arm64', '_native')
$arm64_static_runtime_path_old = [System.IO.Path]::Combine($arm64_static_runtime_nupkg_unzipped_directory, 'runtimes', 'win-arm64', '_native')
$arm64_static_runtime_path_new = [System.IO.Path]::Combine($x64_nupkg_unzipped_directory, 'runtimes', 'win-arm64', '_native', 'static')
- $arm_runtime_path_old = [System.IO.Path]::Combine($arm_nupkg_unzipped_directory, 'runtimes', 'win-arm', '_native')
- $arm_runtime_path_new = [System.IO.Path]::Combine($x64_nupkg_unzipped_directory, 'runtimes', 'win-arm', '_native')
- $arm_static_runtime_path_old = [System.IO.Path]::Combine($arm_static_runtime_nupkg_unzipped_directory, 'runtimes', 'win-arm', '_native')
- $arm_static_runtime_path_new = [System.IO.Path]::Combine($x64_nupkg_unzipped_directory, 'runtimes', 'win-arm', '_native', 'static')
+
$uap_build_path_old = [System.IO.Path]::Combine($x64_static_runtime_nupkg_unzipped_directory, 'build', 'native')
$uap_build_path_new = [System.IO.Path]::Combine($x64_nupkg_unzipped_directory, 'build', 'uap10.0')
@@ -228,8 +192,6 @@ extends:
New-Item -Path $x86_static_runtime_path_new -ItemType Directory
New-Item -Path $arm64_runtime_path_new -ItemType Directory
New-Item -Path $arm64_static_runtime_path_new -ItemType Directory
- New-Item -Path $arm_runtime_path_new -ItemType Directory
- New-Item -Path $arm_static_runtime_path_new -ItemType Directory
Copy-Item ([System.IO.Path]::Combine($x86_runtime_path_old, 'onnxruntime.dll')) $x86_runtime_path_new
Copy-Item ([System.IO.Path]::Combine($x86_runtime_path_old, 'onnxruntime.lib')) $x86_runtime_path_new
@@ -241,11 +203,6 @@ extends:
Copy-Item ([System.IO.Path]::Combine($arm64_runtime_path_old, 'microsoft.ai.machinelearning.dll')) $arm64_runtime_path_new
Copy-Item ([System.IO.Path]::Combine($arm64_runtime_path_old, 'microsoft.ai.machinelearning.lib')) $arm64_runtime_path_new
- Copy-Item ([System.IO.Path]::Combine($arm_runtime_path_old, 'onnxruntime.dll')) $arm_runtime_path_new
- Copy-Item ([System.IO.Path]::Combine($arm_runtime_path_old, 'onnxruntime.lib')) $arm_runtime_path_new
- Copy-Item ([System.IO.Path]::Combine($arm_runtime_path_old, 'microsoft.ai.machinelearning.dll')) $arm_runtime_path_new
- Copy-Item ([System.IO.Path]::Combine($arm_runtime_path_old, 'microsoft.ai.machinelearning.lib')) $arm_runtime_path_new
-
Copy-Item ([System.IO.Path]::Combine($x64_static_runtime_path_old, 'onnxruntime.dll')) ([System.IO.Path]::Combine($x64_static_runtime_path_new, 'onnxruntime.dll'))
Copy-Item ([System.IO.Path]::Combine($x64_static_runtime_path_old, 'onnxruntime.lib')) ([System.IO.Path]::Combine($x64_static_runtime_path_new, 'onnxruntime.lib'))
Copy-Item ([System.IO.Path]::Combine($x64_static_runtime_path_old, 'microsoft.ai.machinelearning.dll')) ([System.IO.Path]::Combine($x64_static_runtime_path_new, 'microsoft.ai.machinelearning.dll'))
@@ -261,11 +218,6 @@ extends:
Copy-Item ([System.IO.Path]::Combine($arm64_static_runtime_path_old, 'microsoft.ai.machinelearning.dll')) ([System.IO.Path]::Combine($arm64_static_runtime_path_new, 'microsoft.ai.machinelearning.dll'))
Copy-Item ([System.IO.Path]::Combine($arm64_static_runtime_path_old, 'microsoft.ai.machinelearning.lib')) ([System.IO.Path]::Combine($arm64_static_runtime_path_new, 'microsoft.ai.machinelearning.lib'))
- Copy-Item ([System.IO.Path]::Combine($arm_static_runtime_path_old, 'onnxruntime.dll')) ([System.IO.Path]::Combine($arm_static_runtime_path_new, 'onnxruntime.dll'))
- Copy-Item ([System.IO.Path]::Combine($arm_static_runtime_path_old, 'onnxruntime.lib')) ([System.IO.Path]::Combine($arm_static_runtime_path_new, 'onnxruntime.lib'))
- Copy-Item ([System.IO.Path]::Combine($arm_static_runtime_path_old, 'microsoft.ai.machinelearning.dll')) ([System.IO.Path]::Combine($arm_static_runtime_path_new, 'microsoft.ai.machinelearning.dll'))
- Copy-Item ([System.IO.Path]::Combine($arm_static_runtime_path_old, 'microsoft.ai.machinelearning.lib')) ([System.IO.Path]::Combine($arm_static_runtime_path_new, 'microsoft.ai.machinelearning.lib'))
-
Copy-Item -Recurse $uap_build_path_old $uap_build_path_new
$merged_nuget_path = [System.IO.Path]::Combine($Env:BUILD_ARTIFACTSTAGINGDIRECTORY, 'merged')
@@ -304,22 +256,13 @@ extends:
$arm64_nupkg_unzipped_directory = [System.IO.Path]::Combine($arm64_nupkg_unzipped_directory_root, 'symbols', [System.IO.Path]::GetFileNameWithoutExtension($arm64_nuget_package))
[System.IO.Compression.ZipFile]::ExtractToDirectory($arm64_nuget_package, $arm64_nupkg_unzipped_directory)
- $nupkgs = (Get-ChildItem ..\nuget-artifact-arm -Filter Microsoft.AI.MachineLearning*.snupkg -Recurse)
- $arm_nuget_package = $nupkgs[0].FullName
- $arm_nupkg_unzipped_directory_root = $nupkgs[0].Directory.FullName
- $arm_nupkg_unzipped_directory = [System.IO.Path]::Combine($arm_nupkg_unzipped_directory_root, 'symbols', [System.IO.Path]::GetFileNameWithoutExtension($arm_nuget_package))
- [System.IO.Compression.ZipFile]::ExtractToDirectory($arm_nuget_package, $arm_nupkg_unzipped_directory)
-
$x86_runtime_path_old = [System.IO.Path]::Combine($x86_nupkg_unzipped_directory, 'runtimes', 'win-x86', '_native')
$x86_runtime_path_new = [System.IO.Path]::Combine($x64_nupkg_unzipped_directory, 'runtimes', 'win-x86', '_native')
$arm64_runtime_path_old = [System.IO.Path]::Combine($arm64_nupkg_unzipped_directory, 'runtimes', 'win-arm64', '_native')
$arm64_runtime_path_new = [System.IO.Path]::Combine($x64_nupkg_unzipped_directory, 'runtimes', 'win-arm64', '_native')
- $arm_runtime_path_old = [System.IO.Path]::Combine($arm_nupkg_unzipped_directory, 'runtimes', 'win-arm', '_native')
- $arm_runtime_path_new = [System.IO.Path]::Combine($x64_nupkg_unzipped_directory, 'runtimes', 'win-arm', '_native')
-
+
New-Item -Path $x86_runtime_path_new -ItemType Directory
New-Item -Path $arm64_runtime_path_new -ItemType Directory
- New-Item -Path $arm_runtime_path_new -ItemType Directory
Copy-Item ([System.IO.Path]::Combine($x86_runtime_path_old, 'onnxruntime.pdb')) $x86_runtime_path_new
Copy-Item ([System.IO.Path]::Combine($x86_runtime_path_old, 'microsoft.ai.machinelearning.pdb')) $x86_runtime_path_new
@@ -327,9 +270,6 @@ extends:
Copy-Item ([System.IO.Path]::Combine($arm64_runtime_path_old, 'onnxruntime.pdb')) $arm64_runtime_path_new
Copy-Item ([System.IO.Path]::Combine($arm64_runtime_path_old, 'microsoft.ai.machinelearning.pdb')) $arm64_runtime_path_new
- Copy-Item ([System.IO.Path]::Combine($arm_runtime_path_old, 'onnxruntime.pdb')) $arm_runtime_path_new
- Copy-Item ([System.IO.Path]::Combine($arm_runtime_path_old, 'microsoft.ai.machinelearning.pdb')) $arm_runtime_path_new
-
$merged_nuget_path = [System.IO.Path]::Combine($Env:BUILD_ARTIFACTSTAGINGDIRECTORY, 'merged')
if (!(Test-Path $merged_nuget_path)) {
New-Item -Path $merged_nuget_path -ItemType Directory
diff --git a/.vscode/settings.json b/.vscode/settings.json
index 2f2adc78f6de9..3e2b1f31dd6cf 100644
--- a/.vscode/settings.json
+++ b/.vscode/settings.json
@@ -11,7 +11,7 @@
// Auto sort imports
"editor.formatOnSave": true,
"editor.codeActionsOnSave": {
- "source.organizeImports": true
+ "source.organizeImports": "explicit"
},
"editor.defaultFormatter": "ms-python.black-formatter"
},
diff --git a/cmake/CMakeLists.txt b/cmake/CMakeLists.txt
index 34355fb0fd936..1567da90cacfc 100644
--- a/cmake/CMakeLists.txt
+++ b/cmake/CMakeLists.txt
@@ -131,6 +131,7 @@ option(onnxruntime_USE_ACL_1902 "Build with ACL version 1902 support" OFF)
option(onnxruntime_USE_ACL_1905 "Build with ACL version 1905 support" OFF)
option(onnxruntime_USE_ACL_1908 "Build with ACL version 1908 support" OFF)
option(onnxruntime_USE_ACL_2002 "Build with ACL version 2002 support" OFF)
+option(onnxruntime_USE_ACL_2308 "Build with ACL version 2308 support" OFF)
option(onnxruntime_USE_ARMNN "Build with ArmNN support" OFF)
option(onnxruntime_ARMNN_RELU_USE_CPU "Use the CPU implementation for the Relu operator for the ArmNN EP" ON)
option(onnxruntime_ARMNN_BN_USE_CPU "Use the CPU implementation for the Batch Normalization operator for the ArmNN EP" ON)
@@ -354,13 +355,7 @@ if (onnxruntime_USE_ROCM)
endif()
endif()
-if (APPLE)
- if (NOT CMAKE_OSX_ARCHITECTURES)
- message("Building ONNX Runtime for ${CMAKE_HOST_SYSTEM_PROCESSOR}")
- endif()
-elseif (NOT WIN32 AND NOT APPLE)
- message("Building ONNX Runtime for ${CMAKE_SYSTEM_PROCESSOR}")
-endif()
+
# Single output director for all binaries
set(RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin CACHE PATH "Single output directory for all binaries.")
@@ -493,6 +488,14 @@ endif()
include(adjust_global_compile_flags.cmake)
+if (APPLE)
+ if (NOT CMAKE_OSX_ARCHITECTURES)
+ message("Building ONNX Runtime for ${CMAKE_HOST_SYSTEM_PROCESSOR} CPU ARCH")
+ endif()
+elseif (NOT WIN32 AND NOT APPLE)
+ message("Building ONNX Runtime for ${onnxruntime_target_platform} CPU ARCH")
+endif()
+
# We need to link with libatomic on systems that do not have built-in atomics, or
# don't have built-in support for 8 byte atomics
# Derived from https://github.com/protocolbuffers/protobuf/blob/master/cmake/CMakeLists.txt
@@ -639,7 +642,16 @@ else()
check_cxx_compiler_flag(-Wunused-variable HAS_UNUSED_VARIABLE)
check_cxx_compiler_flag(-Wuseless-cast HAS_USELESS_CAST)
check_function_exists(reallocarray HAS_REALLOCARRAY)
-
+ if (NOT APPLE AND NOT CMAKE_SYSTEM_NAME STREQUAL "Emscripten" AND onnxruntime_target_platform STREQUAL "aarch64")
+ check_cxx_compiler_flag(-march=armv8.2-a+bf16 HAS_ARM64_BFLOAT16)
+ if(NOT HAS_ARM64_BFLOAT16)
+ message(FATAL_ERROR "The compiler doesn't support BFLOAT16!!!")
+ endif()
+ check_cxx_compiler_flag(-march=armv8.2-a+fp16 HAS_ARM64_FLOAT16)
+ if(NOT HAS_ARM64_FLOAT16)
+ message(FATAL_ERROR "The compiler doesn't support FLOAT16!!!")
+ endif()
+ endif()
if (HAS_TAUTOLOGICAL_POINTER_COMPARE)
#we may have extra null pointer checkings in debug build, it's not an issue
list(APPEND ORT_WARNING_FLAGS -Wno-tautological-pointer-compare)
@@ -1099,7 +1111,7 @@ function(onnxruntime_add_include_to_target dst_target)
endfunction()
# ACL
-if (onnxruntime_USE_ACL OR onnxruntime_USE_ACL_1902 OR onnxruntime_USE_ACL_1905 OR onnxruntime_USE_ACL_1908 OR onnxruntime_USE_ACL_2002)
+if (onnxruntime_USE_ACL OR onnxruntime_USE_ACL_1902 OR onnxruntime_USE_ACL_1905 OR onnxruntime_USE_ACL_1908 OR onnxruntime_USE_ACL_2002 OR onnxruntime_USE_ACL_2308)
set(onnxruntime_USE_ACL ON)
if (onnxruntime_USE_ACL_1902)
add_definitions(-DACL_1902=1)
@@ -1110,7 +1122,11 @@ if (onnxruntime_USE_ACL OR onnxruntime_USE_ACL_1902 OR onnxruntime_USE_ACL_1905
if (onnxruntime_USE_ACL_2002)
add_definitions(-DACL_2002=1)
else()
- add_definitions(-DACL_1905=1)
+ if (onnxruntime_USE_ACL_2308)
+ add_definitions(-DACL_2308=1)
+ else()
+ add_definitions(-DACL_1905=1)
+ endif()
endif()
endif()
endif()
diff --git a/cmake/adjust_global_compile_flags.cmake b/cmake/adjust_global_compile_flags.cmake
index e825bfeaea952..9f00c873715f4 100644
--- a/cmake/adjust_global_compile_flags.cmake
+++ b/cmake/adjust_global_compile_flags.cmake
@@ -300,6 +300,31 @@ if (MSVC)
endif()
else()
if (NOT APPLE)
+ #XXX: Sometimes the value of CMAKE_SYSTEM_PROCESSOR is set but it's wrong. For example, if you run an armv7 docker
+ #image on an aarch64 machine with an aarch64 Ubuntu host OS, in the docker instance cmake may still report
+ # CMAKE_SYSTEM_PROCESSOR as aarch64 by default. Given compiling this code may need more than 2GB memory, we do not
+ # support compiling for ARM32 natively(only support cross-compiling), we will ignore this issue for now.
+ if(NOT CMAKE_SYSTEM_PROCESSOR)
+ message(WARNING "CMAKE_SYSTEM_PROCESSOR is not set. Please set it in your toolchain cmake file.")
+ # Try to detect it
+ if("${CMAKE_C_COMPILER_ID}" STREQUAL "GNU" OR "${CMAKE_C_COMPILER_ID}" STREQUAL "Clang")
+ execute_process(
+ COMMAND "${CMAKE_C_COMPILER}" -dumpmachine
+ OUTPUT_VARIABLE GCC_DUMP_MACHINE_OUT OUTPUT_STRIP_TRAILING_WHITESPACE
+ ERROR_VARIABLE _err
+ RESULT_VARIABLE _res
+ )
+ if(NOT _res EQUAL 0)
+ message(SEND_ERROR "Failed to run 'gcc -dumpmachine':\n ${_res}")
+ endif()
+ string(REPLACE "-" ";" GCC_DUMP_MACHINE_OUT_LIST "${GCC_DUMP_MACHINE_OUT}")
+ list(LENGTH GCC_DUMP_MACHINE_OUT_LIST GCC_TRIPLET_LEN)
+ if(GCC_TRIPLET_LEN EQUAL 4)
+ list(GET GCC_DUMP_MACHINE_OUT_LIST 0 CMAKE_SYSTEM_PROCESSOR)
+ message("Setting CMAKE_SYSTEM_PROCESSOR to ${CMAKE_SYSTEM_PROCESSOR}")
+ endif()
+ endif()
+ endif()
set(onnxruntime_target_platform ${CMAKE_SYSTEM_PROCESSOR})
endif()
if (onnxruntime_BUILD_FOR_NATIVE_MACHINE)
diff --git a/cmake/onnxruntime_optimizer.cmake b/cmake/onnxruntime_optimizer.cmake
index 6f09583199ffd..f15d5b8dd6f80 100644
--- a/cmake/onnxruntime_optimizer.cmake
+++ b/cmake/onnxruntime_optimizer.cmake
@@ -130,3 +130,7 @@ if (NOT onnxruntime_BUILD_SHARED_LIB)
RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR}
FRAMEWORK DESTINATION ${CMAKE_INSTALL_BINDIR})
endif()
+
+if (onnxruntime_USE_ROCM)
+ add_dependencies(onnxruntime_optimizer generate_hipified_files)
+endif()
diff --git a/docs/OperatorKernels.md b/docs/OperatorKernels.md
index b026369e12c80..5e38789b65137 100644
--- a/docs/OperatorKernels.md
+++ b/docs/OperatorKernels.md
@@ -383,6 +383,7 @@ Do not modify directly.*
|Squeeze|*in* data:**T**
*in* axes:**tensor(int64)**
*out* squeezed:**T**
or
*in* data:**T**
*out* squeezed:**T**|13+|**T** = tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(string), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)|
|||[11, 12]|**T** = tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(string), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)|
|||[1, 10]|**T** = tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(string), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)|
+|StringConcat|*in* X:**T**
*in* Y:**T**
*out* Z:**T**|20+|**T** = tensor(string)|
|StringNormalizer|*in* X:**tensor(string)**
*out* Y:**tensor(string)**|10+|**X** = tensor(string)|
|Sub|*in* A:**T**
*in* B:**T**
*out* C:**T**|14+|**T** = tensor(double), tensor(float), tensor(int32), tensor(int64)|
|||13|**T** = tensor(double), tensor(float), tensor(int32), tensor(int64)|
diff --git a/include/onnxruntime/core/session/onnxruntime_c_api.h b/include/onnxruntime/core/session/onnxruntime_c_api.h
index 06fef6bf72cc9..8cd0d0051d1eb 100644
--- a/include/onnxruntime/core/session/onnxruntime_c_api.h
+++ b/include/onnxruntime/core/session/onnxruntime_c_api.h
@@ -4528,6 +4528,19 @@ struct OrtApi {
* \since Version 1.17.
*/
ORT_API2_STATUS(SetDeterministicCompute, _Inout_ OrtSessionOptions* options, bool value);
+
+ /**
+ * Run fn in parallel
+ *
+ * \param[in] context
+ * \param[in] fn Function accepting usr_data and an integer as iterator
+ * \param[in] total The number of times fn is to be invoked
+ * \param[in] num_batch Number of batches by which the "total" is to be divided in maximum. When zero, there is no limit
+ * \param[in] usr_data User data to be passed back to fn
+ *
+ * \since Version 1.17.
+ */
+ ORT_API2_STATUS(KernelContext_ParallelFor, _In_ const OrtKernelContext* context, _In_ void (*fn)(void*, size_t), _In_ size_t total, _In_ size_t num_batch, _In_ void* usr_data);
};
/*
diff --git a/include/onnxruntime/core/session/onnxruntime_cxx_api.h b/include/onnxruntime/core/session/onnxruntime_cxx_api.h
index 16d9451624533..3773a01cb65a8 100644
--- a/include/onnxruntime/core/session/onnxruntime_cxx_api.h
+++ b/include/onnxruntime/core/session/onnxruntime_cxx_api.h
@@ -2057,6 +2057,7 @@ struct KernelContext {
Logger GetLogger() const;
OrtAllocator* GetAllocator(const OrtMemoryInfo& memory_info) const;
OrtKernelContext* GetOrtKernelContext() const { return ctx_; }
+ void ParallelFor(void (*fn)(void*, size_t), size_t total, size_t num_batch, void* usr_data) const;
private:
OrtKernelContext* ctx_;
diff --git a/include/onnxruntime/core/session/onnxruntime_cxx_inline.h b/include/onnxruntime/core/session/onnxruntime_cxx_inline.h
index 63e55603736b6..db4619eeeae62 100644
--- a/include/onnxruntime/core/session/onnxruntime_cxx_inline.h
+++ b/include/onnxruntime/core/session/onnxruntime_cxx_inline.h
@@ -1658,6 +1658,10 @@ inline Logger KernelContext::GetLogger() const {
return Logger{out};
}
+inline void KernelContext::ParallelFor(void (*fn)(void*, size_t), size_t total, size_t num_batch, void* usr_data) const {
+ ThrowOnError(GetApi().KernelContext_ParallelFor(ctx_, fn, total, num_batch, usr_data));
+}
+
inline OpAttr::OpAttr(const char* name, const void* data, int len, OrtOpAttrType type) {
Ort::ThrowOnError(GetApi().CreateOpAttr(name, data, len, type, &p_));
}
diff --git a/js/node/package-lock.json b/js/node/package-lock.json
index c1cf8af4bb80e..542eebe746d59 100644
--- a/js/node/package-lock.json
+++ b/js/node/package-lock.json
@@ -336,9 +336,9 @@
"dev": true
},
"node_modules/follow-redirects": {
- "version": "1.15.2",
- "resolved": "https://registry.npmjs.org/follow-redirects/-/follow-redirects-1.15.2.tgz",
- "integrity": "sha512-VQLG33o04KaQ8uYi2tVNbdrWp1QWxNNea+nmIB4EVM28v0hmP17z7aG1+wAkNzVq4KeXTq3221ye5qTJP91JwA==",
+ "version": "1.15.4",
+ "resolved": "https://registry.npmjs.org/follow-redirects/-/follow-redirects-1.15.4.tgz",
+ "integrity": "sha512-Cr4D/5wlrb0z9dgERpUL3LrmPKVDsETIJhaCMeDfuFYcqa5bldGV6wBsAN6X/vxlXQtFBMrXdXxdL8CbDTGniw==",
"dev": true,
"funding": [
{
@@ -1242,9 +1242,9 @@
"dev": true
},
"follow-redirects": {
- "version": "1.15.2",
- "resolved": "https://registry.npmjs.org/follow-redirects/-/follow-redirects-1.15.2.tgz",
- "integrity": "sha512-VQLG33o04KaQ8uYi2tVNbdrWp1QWxNNea+nmIB4EVM28v0hmP17z7aG1+wAkNzVq4KeXTq3221ye5qTJP91JwA==",
+ "version": "1.15.4",
+ "resolved": "https://registry.npmjs.org/follow-redirects/-/follow-redirects-1.15.4.tgz",
+ "integrity": "sha512-Cr4D/5wlrb0z9dgERpUL3LrmPKVDsETIJhaCMeDfuFYcqa5bldGV6wBsAN6X/vxlXQtFBMrXdXxdL8CbDTGniw==",
"dev": true
},
"form-data": {
diff --git a/js/web/lib/build-def.d.ts b/js/web/lib/build-def.d.ts
index fb714bf5996f1..b3868871a4753 100644
--- a/js/web/lib/build-def.d.ts
+++ b/js/web/lib/build-def.d.ts
@@ -18,6 +18,10 @@ interface BuildDefinitions {
* defines whether to disable the whole WebGpu backend in the build.
*/
readonly DISABLE_WEBGPU: boolean;
+ /**
+ * defines whether to disable the whole WebNN backend in the build.
+ */
+ readonly DISABLE_WEBNN: boolean;
/**
* defines whether to disable the whole WebAssembly backend in the build.
*/
diff --git a/js/web/lib/index.ts b/js/web/lib/index.ts
index 499327741c82b..4f1a3943de69a 100644
--- a/js/web/lib/index.ts
+++ b/js/web/lib/index.ts
@@ -28,7 +28,9 @@ if (!BUILD_DEFS.DISABLE_WASM) {
registerBackend('wasm', wasmBackend, 10);
if (BUILD_DEFS.DISABLE_TRAINING) {
registerBackend('xnnpack', wasmBackend, 9);
- registerBackend('webnn', wasmBackend, 9);
+ if (!BUILD_DEFS.DISABLE_WEBNN) {
+ registerBackend('webnn', wasmBackend, 9);
+ }
}
}
diff --git a/js/web/lib/wasm/jsep/webgpu/op-resolve-rules.ts b/js/web/lib/wasm/jsep/webgpu/op-resolve-rules.ts
index 8e1ec782079be..90e02da986b8f 100644
--- a/js/web/lib/wasm/jsep/webgpu/op-resolve-rules.ts
+++ b/js/web/lib/wasm/jsep/webgpu/op-resolve-rules.ts
@@ -2,7 +2,7 @@
// Licensed under the MIT License.
import {argMax, argMin, parseArgMinMaxAttributes} from './ops/argminmax';
-import {attention, parseAttentionAttributes} from './ops/attention';
+import {attention} from './ops/attention';
import {batchNorm} from './ops/batch-norm';
import {biasAdd} from './ops/bias-add';
import {biasSplitGelu} from './ops/bias-split-gelu';
@@ -16,11 +16,11 @@ import {expand} from './ops/expand';
import {gather, parseGatherAttributes} from './ops/gather';
import {gatherElements, parseGatherElementsAttributes} from './ops/gather-elements';
import {gemm, parseGemmAttributes} from './ops/gemm';
-import {instanceNorm, parseInstanceNormAttributes} from './ops/instance-norm';
-import {layerNorm, parseLayerNormAttributes} from './ops/layer-norm';
+import {instanceNorm} from './ops/instance-norm';
+import {layerNorm} from './ops/layer-norm';
import {matMul} from './ops/matmul';
import {multiHeadAttention, parseMultiHeadAttentionAttributes} from './ops/multi-head-attentiion';
-import {pad, parsePadAttributes} from './ops/pad';
+import {pad} from './ops/pad';
import * as pool from './ops/pool';
import {range} from './ops/range';
import {reduceL1, reduceL2, reduceLogSum, reduceLogSumExp, reduceMax, reduceMean, reduceMin, reduceProd, reduceSum, reduceSumSquare} from './ops/reduce';
@@ -50,7 +50,7 @@ export const WEBGPU_OP_RESOLVE_RULES: Map = new
['Asinh', [unaryOps.asinh]],
['Atan', [unaryOps.atan]],
['Atanh', [unaryOps.atanh]],
- ['Attention', [attention, parseAttentionAttributes]],
+ ['Attention', [attention]],
// TODO: support new attributes for AveragePool-10
['AveragePool', [pool.averagePool, pool.parseAveragePoolAttributes]],
['BatchNormalization', [batchNorm]],
@@ -82,8 +82,8 @@ export const WEBGPU_OP_RESOLVE_RULES: Map = new
['GlobalMaxPool', [pool.globalMaxPool, pool.parseGlobalMaxPoolAttributes]],
['Greater', [binaryOps.greater]],
['GreaterOrEqual', [binaryOps.greaterOrEqual]],
- ['InstanceNormalization', [instanceNorm, parseInstanceNormAttributes]],
- ['LayerNormalization', [layerNorm, parseLayerNormAttributes]],
+ ['InstanceNormalization', [instanceNorm]],
+ ['LayerNormalization', [layerNorm]],
['LeakyRelu', [unaryOps.leakyRelu, unaryOps.parseAlphaAttributes]],
['Less', [binaryOps.less]],
['LessOrEqual', [binaryOps.lessOrEqual]],
@@ -95,7 +95,7 @@ export const WEBGPU_OP_RESOLVE_RULES: Map = new
['MultiHeadAttention', [multiHeadAttention, parseMultiHeadAttentionAttributes]],
['Neg', [unaryOps.neg]],
['Not', [unaryOps.not]],
- ['Pad', [pad, parsePadAttributes]],
+ ['Pad', [pad]],
['Pow', [binaryOps.pow]],
['Range', [range]],
['Reciprocal', [unaryOps.reciprocal]],
diff --git a/js/web/lib/wasm/jsep/webgpu/ops/attention.ts b/js/web/lib/wasm/jsep/webgpu/ops/attention.ts
index e1f2a47301bfb..ef8038dff487e 100644
--- a/js/web/lib/wasm/jsep/webgpu/ops/attention.ts
+++ b/js/web/lib/wasm/jsep/webgpu/ops/attention.ts
@@ -1,11 +1,11 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
+import {tensorDataTypeEnumToString} from '../../../wasm-common';
import {TensorView} from '../../tensor-view';
-import {createAttributeWithCacheKey} from '../attribute-with-cache-key';
-import {ComputeContext, GpuDataType} from '../types';
+import {ComputeContext, GpuDataType, ProgramUniform} from '../types';
-import {castToF32, fillVector, getMaxComponents, inputVariable, outputVariable, ShaderHelper, sumVector, tensorTypeToWsglStorageType} from './common';
+import {castToF32, fillVector, getMaxComponents, inputVariable, outputVariable, ShaderHelper, sumVector, tensorTypeToWsglStorageType, tensorTypeToWsglValueType, UniformDataElementType, UniformsArrayType} from './common';
export const enum AttentionQkvFormat {
unknown, // enum value not set, or depends on qkv projection implementation details
@@ -231,20 +231,8 @@ const validateAttentionInputs = (inputs: readonly TensorView[], attributes: Atte
};
};
-export const parseAttentionAttributes = (attributes: AttentionAttrs): AttentionAttrs =>
- createAttributeWithCacheKey({...attributes});
-
export const computeInPlaceSoftmax = (context: ComputeContext, input: TensorView, n: number, d: number) => {
const components = getMaxComponents(d);
- const inputHelper = outputVariable('x', input.dataType, input.dims, components);
-
- let threadMaxValue = 'threadMaxVector';
- if (components === 2) {
- threadMaxValue = 'max(threadMaxVector.x, threadMaxVector.y)';
- } else if (components === 4) {
- threadMaxValue = 'max(max(threadMaxVector.x, threadMaxVector.y), max(threadMaxVector.z, threadMaxVector.w))';
- }
- const dataType = tensorTypeToWsglStorageType(input.dataType);
let WG = 64;
const dComp = d / components;
if (dComp < WG) {
@@ -253,25 +241,41 @@ export const computeInPlaceSoftmax = (context: ComputeContext, input: TensorView
WG = Math.ceil(dComp / 8);
}
const elementsPerWG = Math.ceil(d / components / WG);
+ const tensorDataType = tensorDataTypeEnumToString(input.dataType) as ProgramUniform['type'];
+ const programUniforms: ProgramUniform[] =
+ [{type: tensorDataType, data: 1 / d}, {type: 'uint32', data: dComp}, {type: 'uint32', data: elementsPerWG}];
+ const dataType = tensorTypeToWsglStorageType(input.dataType, components);
+
+ const getShaderSource = (shaderHelper: ShaderHelper) => {
+ const inputHelper = outputVariable('x', input.dataType, input.dims, components);
+ let threadMaxValue = 'thread_max_vector';
+ if (components === 2) {
+ threadMaxValue = 'max(thread_max_vector.x, thread_max_vector.y)';
+ } else if (components === 4) {
+ threadMaxValue =
+ 'max(max(thread_max_vector.x, thread_max_vector.y), max(thread_max_vector.z, thread_max_vector.w))';
+ }
+ const elemValueType = tensorTypeToWsglValueType(input.dataType);
+ const uniforms: UniformsArrayType = [
+ {name: 'd_inv', type: elemValueType as UniformDataElementType}, {name: 'd_comp', type: 'u32'},
+ {name: 'elements_per_wg', type: 'u32'}
+ ];
- const getShaderSource = (shaderHelper: ShaderHelper) => `
- const dInv: ${dataType} = 1 / ${d};
- const dComp = ${d / components};
+ return `
var wgMax: array;
var wgSum: array;
-
- ${shaderHelper.declareVariables(inputHelper)}
- @compute @workgroup_size(${WG}, 1, 1)
- fn main(@builtin(workgroup_id) workgroup_id : vec3,
- @builtin(local_invocation_index) local_index : u32) {
- let localOffset = local_index * ${elementsPerWG};
- let offset: u32 = workgroup_id.x * dComp + localOffset;
-
- var threadMaxVector = ${fillVector('f32', components, '-3.402823e+38f')};
- for (var i: u32 = 0; i < ${elementsPerWG} && i + localOffset < dComp; i++) {
- threadMaxVector = max(${castToF32(dataType, components, 'x[offset + i]')}, threadMaxVector);
+ ${shaderHelper.registerUniforms(uniforms).declareVariables(inputHelper)}
+ ${shaderHelper.mainStart([
+ WG, 1, 1
+ ])}
+ let localOffset = local_idx * uniforms.elements_per_wg;
+ let offset: u32 = workgroup_id.x * uniforms.d_comp + localOffset;
+
+ var thread_max_vector = ${fillVector('f32', components, '-3.402823e+38f')};
+ for (var i: u32 = 0; i < uniforms.elements_per_wg && i + localOffset < uniforms.d_comp; i++) {
+ thread_max_vector = max(${castToF32(elemValueType, components, 'x[offset + i]')}, thread_max_vector);
}
- wgMax[local_index] = ${threadMaxValue};
+ wgMax[local_idx] = ${threadMaxValue};
workgroupBarrier();
var maxValue = -3.402823e+38f;
@@ -280,10 +284,10 @@ export const computeInPlaceSoftmax = (context: ComputeContext, input: TensorView
}
var sumVector = ${fillVector('f32', components, '0')};
- for (var i: u32 = 0; i < ${elementsPerWG} && i + localOffset < dComp; i++) {
- sumVector += exp(${castToF32(dataType, components, 'x[offset + i]')} - maxValue);
+ for (var i: u32 = 0; i < uniforms.elements_per_wg && i + localOffset < uniforms.d_comp; i++) {
+ sumVector += exp(${castToF32(elemValueType, components, 'x[offset + i]')} - maxValue);
}
- wgSum[local_index] = ${sumVector('sumVector', components)};
+ wgSum[local_idx] = ${sumVector('sumVector', components)};
workgroupBarrier();
var sum: f32 = 0;
@@ -292,26 +296,24 @@ export const computeInPlaceSoftmax = (context: ComputeContext, input: TensorView
}
if (sum == 0) {
- for (var i: u32 = 0; i < ${elementsPerWG} && i + localOffset < dComp; i++) {
- x[offset + i] = ${fillVector(dataType, components, 'dInv')};
+ for (var i: u32 = 0; i < uniforms.elements_per_wg && i + localOffset < uniforms.d_comp; i++) {
+ x[offset + i] = ${fillVector('f32', components, 'uniforms.d_inv')};
}
} else {
- for (var i: u32 = 0; i < ${elementsPerWG} && i + localOffset < dComp; i++) {
- let f32input = ${castToF32(dataType, components, 'x[offset + i]')};
+ for (var i: u32 = 0; i < uniforms.elements_per_wg && i + localOffset < uniforms.d_comp; i++) {
+ let f32input = ${castToF32(elemValueType, components, 'x[offset + i]')};
x[offset + i] = ${inputHelper.type.value}(exp(f32input - maxValue) / sum);
}
}
}`;
+ };
context.compute(
{
name: 'AttentionProbsSoftmax',
- shaderCache: {hint: `${d}`},
+ shaderCache: {hint: `${WG};${dataType};${components}`},
getShaderSource,
- getRunData: () => ({
- outputs: [],
- dispatchGroup: {x: n},
- }),
+ getRunData: () => ({outputs: [], dispatchGroup: {x: n}, programUniforms}),
},
{inputs: [input], outputs: []});
};
@@ -326,47 +328,43 @@ const computeAttentionProbs =
// TODO: handle mask
const alpha = attributes.scale === 0 ? 1.0 / Math.sqrt(parameters.headSize) : attributes.scale;
-
- const dataType = tensorTypeToWsglStorageType(q.dataType);
-
const components = getMaxComponents(parameters.headSize);
- const qInput = inputVariable('q', q.dataType, q.dims, components);
- const kInput = inputVariable('key', key.dataType, key.dims, components);
- const output = outputVariable('output', q.dataType, probsShape);
-
const vectorizedHeadSize = parameters.headSize / components;
- const M = parameters.sequenceLength;
- const N = parameters.totalSequenceLength;
- const K = vectorizedHeadSize;
-
const TILE_SIZE = 12;
-
const dispatch = {
x: Math.ceil(parameters.totalSequenceLength / TILE_SIZE),
y: Math.ceil(parameters.sequenceLength / TILE_SIZE),
z: parameters.batchSize * parameters.numHeads
};
+ const tensorDataType = tensorDataTypeEnumToString(q.dataType) as ProgramUniform['type'];
+ const programUniforms: ProgramUniform[] = [
+ {type: 'uint32', data: parameters.sequenceLength}, {type: 'uint32', data: vectorizedHeadSize},
+ {type: 'uint32', data: parameters.totalSequenceLength}, {type: 'uint32', data: parameters.kvSequenceLength},
+ {type: tensorDataType, data: alpha}
+ ];
const inputs = [q, key];
- const getShaderSource = (shaderHelper: ShaderHelper) => `
- const M: u32 = ${M}u;
- const N: u32 = ${N}u;
- const K: u32 = ${K}u;
- const alpha: ${dataType} = ${alpha};
+
+ const getShaderSource = (shaderHelper: ShaderHelper) => {
+ const qInput = inputVariable('q', q.dataType, q.dims, components);
+ const kInput = inputVariable('key', key.dataType, key.dims, components);
+ const output = outputVariable('output', q.dataType, probsShape);
+ const dataType = tensorTypeToWsglStorageType(q.dataType);
+
+ const uniforms: UniformsArrayType = [
+ {name: 'M', type: 'u32'}, {name: 'K', type: 'u32'}, {name: 'N', type: 'u32'},
+ {name: 'kv_sequence_length', type: 'u32'}, {name: 'alpha', type: dataType as UniformDataElementType}
+ ];
+ return `
const beta: ${dataType} = 1.0;
const TILE_SIZE = ${TILE_SIZE}u;
var tileQ: array<${qInput.type.storage}, ${TILE_SIZE * TILE_SIZE}>;
var tileK: array<${qInput.type.storage}, ${TILE_SIZE * TILE_SIZE}>;
-
- ${shaderHelper.declareVariables(qInput, kInput, output)}
-
- @compute @workgroup_size(${TILE_SIZE}, ${TILE_SIZE}, 1)
- fn main(@builtin(workgroup_id) workgroup_id : vec3,
- @builtin(local_invocation_id) local_id : vec3, @builtin(local_invocation_index) local_index : u32) {
- let global_idx = (workgroup_id.z * ${dispatch.x * dispatch.y}u +
- workgroup_id.y * ${dispatch.x}u + workgroup_id.x) * ${TILE_SIZE * TILE_SIZE}u + local_index;
-
+ ${shaderHelper.registerUniforms(uniforms).declareVariables(qInput, kInput, output)}
+ ${shaderHelper.mainStart([
+ TILE_SIZE, TILE_SIZE, 1
+ ])}
// x holds the N and y holds the M
let headIdx = workgroup_id.z;
let m = workgroup_id.y * TILE_SIZE;
@@ -374,40 +372,42 @@ const computeAttentionProbs =
let lm = m + local_id.y;
let ln = n + local_id.x;
- let qOffset = ${parameters.sequenceLength * vectorizedHeadSize} * headIdx + m * K;
- let kOffset = ${parameters.kvSequenceLength * vectorizedHeadSize} * headIdx + n * K;
+ let qOffset = uniforms.M * uniforms.K * headIdx + m * uniforms.K;
+ let kOffset = uniforms.kv_sequence_length * uniforms.K * headIdx + n * uniforms.K;
var value = ${fillVector(dataType, components)};
- for (var w: u32 = 0u; w < K; w += TILE_SIZE) {
- if (m + local_id.y < M && w + local_id.x < K) {
- tileQ[TILE_SIZE * local_id.y + local_id.x] = q[qOffset + local_id.y * K + w + local_id.x];
+ for (var w: u32 = 0u; w < uniforms.K; w += TILE_SIZE) {
+ if (m + local_id.y < uniforms.M && w + local_id.x < uniforms.K) {
+ tileQ[TILE_SIZE * local_id.y + local_id.x] = q[qOffset + local_id.y * uniforms.K + w + local_id.x];
}
- if (n + local_id.y < N && w + local_id.x < K) {
- tileK[TILE_SIZE * local_id.y + local_id.x] = key[kOffset + local_id.y * K + w + local_id.x];
+ if (n + local_id.y < uniforms.N && w + local_id.x < uniforms.K) {
+ tileK[TILE_SIZE * local_id.y + local_id.x] = key[kOffset + local_id.y * uniforms.K + w + local_id.x];
}
workgroupBarrier();
- for (var k: u32 = 0u; k ({
outputs: [{dims: probsShape, dataType: q.dataType, gpuDataType: GpuDataType.default}],
dispatchGroup: dispatch,
+ programUniforms
}),
getShaderSource,
},
@@ -423,78 +423,76 @@ const computeAttentionProbs =
const computeVxAttentionScore =
(context: ComputeContext, probs: TensorView, v: TensorView, params: AttentionParameters) => {
const outputShape = [params.batchSize, params.sequenceLength, params.vHiddenSize];
-
- const probsHelper = inputVariable('probs', probs.dataType, probs.dims);
- const vHelper = inputVariable('v', v.dataType, v.dims);
- const output = outputVariable('output', probs.dataType, outputShape);
-
- const dataType = tensorTypeToWsglStorageType(probs.dataType);
-
const TILE_SIZE = 12;
const dispatch = {
x: Math.ceil(params.vHeadSize / TILE_SIZE),
y: Math.ceil(params.sequenceLength / TILE_SIZE),
z: params.batchSize * params.numHeads
};
+ const programUniforms: ProgramUniform[] = [
+ {type: 'uint32', data: params.sequenceLength}, {type: 'uint32', data: params.totalSequenceLength},
+ {type: 'uint32', data: params.vHeadSize}, {type: 'uint32', data: params.numHeads},
+ {type: 'uint32', data: params.vHiddenSize}
+ ];
- const getShaderSource = (shaderHelper: ShaderHelper) => `
- const M: u32 = ${params.sequenceLength}u;
- const N: u32 = ${params.vHeadSize}u;
- const K: u32 = ${params.totalSequenceLength}u;
- const numHeads: u32 = ${params.numHeads}u;
+ const getShaderSource = (shaderHelper: ShaderHelper) => {
+ const probsHelper = inputVariable('probs', probs.dataType, probs.dims);
+ const vHelper = inputVariable('v', v.dataType, v.dims);
+ const output = outputVariable('output', probs.dataType, outputShape);
+ const uniforms: UniformsArrayType = [
+ {name: 'M', type: 'u32'}, {name: 'K', type: 'u32'}, {name: 'N', type: 'u32'},
+ {name: 'num_heads', type: 'u32'}, {name: 'v_hidden_size', type: 'u32'}
+ ];
+ return `
const TILE_SIZE = ${TILE_SIZE}u;
-
- var tileQ: array<${probsHelper.type.storage}, ${TILE_SIZE * TILE_SIZE}>;
- var tileK: array<${probsHelper.type.storage}, ${TILE_SIZE * TILE_SIZE}>;
-
- ${shaderHelper.declareVariables(probsHelper, vHelper, output)}
-
- @compute @workgroup_size(${TILE_SIZE}, ${TILE_SIZE}, 1)
- fn main(@builtin(workgroup_id) workgroup_id : vec3,
- @builtin(local_invocation_id) local_id : vec3, @builtin(local_invocation_index) local_index : u32) {
- let global_idx = (workgroup_id.z * ${dispatch.x * dispatch.y}u +
- workgroup_id.y * ${dispatch.x}u + workgroup_id.x) * ${TILE_SIZE * TILE_SIZE}u + local_index;
-
+ var tileQ: array<${probsHelper.type.value}, ${TILE_SIZE * TILE_SIZE}>;
+ var tileK: array<${probsHelper.type.value}, ${TILE_SIZE * TILE_SIZE}>;
+ ${shaderHelper.registerUniforms(uniforms).declareVariables(probsHelper, vHelper, output)}
+ ${shaderHelper.mainStart([
+ TILE_SIZE, TILE_SIZE, 1
+ ])}
let headIdx = workgroup_id.z;
let m = workgroup_id.y * TILE_SIZE + local_id.y;
let n = workgroup_id.x * TILE_SIZE + local_id.x;
- let offsetA = headIdx * (M * K) + m * K;
- let offsetB = headIdx * (N * K) + n;
+ let offsetA = headIdx * (uniforms.M * uniforms.K) + m * uniforms.K;
+ let offsetB = headIdx * (uniforms.N * uniforms.K) + n;
- var value = ${dataType}(0);
- for (var w: u32 = 0u; w < K; w += TILE_SIZE) {
- if (m < M && w + local_id.x < K) {
+ var value = ${probsHelper.type.storage}(0);
+ for (var w: u32 = 0u; w < uniforms.K; w += TILE_SIZE) {
+ if (m < uniforms.M && w + local_id.x < uniforms.K) {
tileQ[TILE_SIZE * local_id.y + local_id.x] = probs[offsetA + w + local_id.x];
}
- if (n < N && w + local_id.y < K) {
- tileK[TILE_SIZE * local_id.y + local_id.x] = v[offsetB + (w + local_id.y) * N];
+ if (n < uniforms.N && w + local_id.y < uniforms.K) {
+ tileK[TILE_SIZE * local_id.y + local_id.x] = v[offsetB + (w + local_id.y) * uniforms.N];
}
workgroupBarrier();
- for (var k: u32 = 0u; k ({
outputs: [{dims: outputShape, dataType: probs.dataType, gpuDataType: GpuDataType.default}],
dispatchGroup: dispatch,
+ programUniforms
}),
getShaderSource,
},
@@ -517,71 +515,71 @@ const prepare = (context: ComputeContext, parameters: AttentionParameters) => {
parameters.sequenceLength,
parameters.headSize,
];
-
- const dataType = tensorTypeToWsglStorageType(context.inputs[0].dataType);
-
const M = parameters.sequenceLength;
const K = parameters.inputHiddenSize;
const N = parameters.headSize;
-
const TILE_SIZE = 12;
const dispatch = {
x: Math.ceil(parameters.headSize / TILE_SIZE),
y: Math.ceil(parameters.sequenceLength / TILE_SIZE),
z: parameters.batchSize * parameters.numHeads
};
+ const inputs = [context.inputs[0], context.inputs[1], context.inputs[2]];
+ const programUniforms: ProgramUniform[] = [
+ {type: 'uint32', data: M}, {type: 'uint32', data: K}, {type: 'uint32', data: N},
+ {type: 'uint32', data: parameters.numHeads}, {type: 'uint32', data: parameters.headSize},
+ {type: 'uint32', data: parameters.hiddenSize},
+ {type: 'uint32', data: parameters.hiddenSize + parameters.hiddenSize + parameters.vHiddenSize}
+ ];
- const getShaderSource = () => `
- const M: u32 = ${M}u;
- const K: u32 = ${K}u;
- const N: u32 = ${N}u;
- const numHeads: u32 = ${parameters.numHeads};
- const ldb = ${parameters.hiddenSize + parameters.hiddenSize + parameters.vHiddenSize}u;
+ const getShaderSource = (shaderHelper: ShaderHelper) => {
+ const outputQ = outputVariable('output_q', inputs[0].dataType, outputShape);
+ const outputK = outputVariable('output_k', inputs[0].dataType, outputShape);
+ const outputV = outputVariable('output_v', inputs[0].dataType, outputShape);
+ const input = inputVariable('input', inputs[0].dataType, inputs[0].dims);
+ const weight = inputVariable('weight', inputs[1].dataType, inputs[1].dims);
+ const bias = inputVariable('bias', inputs[2].dataType, inputs[2].dims);
+ const dataType = input.type.storage;
+
+ const uniforms: UniformsArrayType = [
+ {name: 'M', type: 'u32'}, {name: 'K', type: 'u32'}, {name: 'N', type: 'u32'}, {name: 'num_heads', type: 'u32'},
+ {name: 'head_size', type: 'u32'}, {name: 'hidden_size', type: 'u32'}, {name: 'ldb', type: 'u32'}
+ ];
+ return `
const TILE_SIZE = ${TILE_SIZE}u;
-
var tileInput: array<${dataType}, ${TILE_SIZE * TILE_SIZE}>;
var tileWeightQ: array<${dataType}, ${TILE_SIZE * TILE_SIZE}>;
var tileWeightK: array<${dataType}, ${TILE_SIZE * TILE_SIZE}>;
var tileWeightV: array<${dataType}, ${TILE_SIZE * TILE_SIZE}>;
-
- @group(0) @binding(0) var input: array<${dataType}>;
- @group(0) @binding(1) var weight: array<${dataType}>;
- @group(0) @binding(2) var bias: array<${dataType}>;
- @group(0) @binding(3) var outputQ: array<${dataType}>;
- @group(0) @binding(4) var outputK: array<${dataType}>;
- @group(0) @binding(5) var outputV: array<${dataType}>;
-
- @compute @workgroup_size(${TILE_SIZE}, ${TILE_SIZE}, 1)
- fn main(@builtin(workgroup_id) workgroup_id : vec3,
- @builtin(local_invocation_id) local_id : vec3, @builtin(local_invocation_index) local_index : u32) {
- let global_idx = (workgroup_id.z * ${dispatch.x * dispatch.y}u +
- workgroup_id.y * ${dispatch.x}u + workgroup_id.x) * ${TILE_SIZE * TILE_SIZE}u + local_index;
-
- let batchIndex = workgroup_id.z / ${parameters.numHeads};
- let headNumber = workgroup_id.z % ${parameters.numHeads};
+ ${shaderHelper.registerUniforms(uniforms).declareVariables(input, weight, bias, outputQ, outputK, outputV)}
+ ${shaderHelper.mainStart([
+ TILE_SIZE, TILE_SIZE, 1
+ ])}
+ let batchIndex = workgroup_id.z / uniforms.num_heads;
+ let headNumber = workgroup_id.z % uniforms.num_heads;
let m = workgroup_id.y * TILE_SIZE + local_id.y;
let n = workgroup_id.x * TILE_SIZE + local_id.x;
- let inputOffset = batchIndex * (M * K) + m * K;
- let biasOffsetQ = headNumber * ${parameters.headSize};
- let biasOffsetK = ${parameters.hiddenSize} + biasOffsetQ;
- let biasOffsetV = ${parameters.hiddenSize} + biasOffsetK;
+ let inputOffset = batchIndex * (uniforms.M * uniforms.K) + m * uniforms.K;
+ let biasOffsetQ = headNumber * uniforms.head_size;
+ let biasOffsetK = uniforms.hidden_size + biasOffsetQ;
+ let biasOffsetV = uniforms.hidden_size + biasOffsetK;
var valueQ = ${dataType}(0);
var valueK = ${dataType}(0);
var valueV = ${dataType}(0);
- for (var w: u32 = 0u; w < K; w += TILE_SIZE) {
- if (m < M && w + local_id.x < K) {
+ for (var w: u32 = 0u; w < uniforms.K; w += TILE_SIZE) {
+ if (m < uniforms.M && w + local_id.x < uniforms.K) {
tileInput[TILE_SIZE * local_id.y + local_id.x] = input[inputOffset + w + local_id.x];
}
- if (n < N && w + local_id.y < K) {
- let offset = n + (w + local_id.y) * ldb;
+ if (n < uniforms.N && w + local_id.y < uniforms.K) {
+ let offset = n + (w + local_id.y) * uniforms.ldb;
tileWeightQ[TILE_SIZE * local_id.y + local_id.x] = weight[biasOffsetQ + offset];
tileWeightK[TILE_SIZE * local_id.y + local_id.x] = weight[biasOffsetK + offset];
tileWeightV[TILE_SIZE * local_id.y + local_id.x] = weight[biasOffsetV + offset];
}
workgroupBarrier();
- for (var k: u32 = 0u; k {
workgroupBarrier();
}
- let headOffset = (m * N + n) % ${parameters.headSize};
+ let headOffset = (m * uniforms.N + n) % uniforms.head_size;
valueQ += bias[headOffset + biasOffsetQ];
valueK += bias[headOffset + biasOffsetK];
valueV += bias[headOffset + biasOffsetV];
- let offset = workgroup_id.z * M * N;
- if (m < M && n < N) {
- let outputIdx = offset + m * N + n;
- outputQ[outputIdx] = valueQ;
- outputK[outputIdx] = valueK;
- outputV[outputIdx] = valueV;
+ let offset = workgroup_id.z * uniforms.M * uniforms.N;
+ if (m < uniforms.M && n < uniforms.N) {
+ let outputIdx = offset + m * uniforms.N + n;
+ output_q[outputIdx] = valueQ;
+ output_k[outputIdx] = valueK;
+ output_v[outputIdx] = valueV;
}
}`;
-
- const inputs = [context.inputs[0], context.inputs[1], context.inputs[2]];
+ };
return context.compute(
{
name: 'AttentionPrepare',
- shaderCache: {hint: JSON.stringify(parameters)},
+ shaderCache: {inputDependencies: ['type', 'type', 'type']},
getRunData: () => ({
outputs: [
{dims: outputShape, dataType: context.inputs[0].dataType, gpuDataType: GpuDataType.default},
@@ -619,6 +616,7 @@ const prepare = (context: ComputeContext, parameters: AttentionParameters) => {
{dims: outputShape, dataType: context.inputs[0].dataType, gpuDataType: GpuDataType.default},
],
dispatchGroup: dispatch,
+ programUniforms
}),
getShaderSource,
},
diff --git a/js/web/lib/wasm/jsep/webgpu/ops/common.ts b/js/web/lib/wasm/jsep/webgpu/ops/common.ts
index 3ce114c5d3884..bc3265be955f0 100644
--- a/js/web/lib/wasm/jsep/webgpu/ops/common.ts
+++ b/js/web/lib/wasm/jsep/webgpu/ops/common.ts
@@ -780,8 +780,10 @@ class ShaderHelperImpl implements ShaderHelper {
const is1DimensionDispatch = this.normalizedDispatchGroup[1] === 1 && this.normalizedDispatchGroup[2] === 1;
const paramList = is1DimensionDispatch ? `@builtin(global_invocation_id) global_id : vec3,
+ @builtin(workgroup_id) workgroup_id : vec3,
@builtin(local_invocation_id) local_id : vec3` :
- `@builtin(local_invocation_index) local_idx : u32,
+ `@builtin(local_invocation_id) local_id : vec3,
+ @builtin(local_invocation_index) local_idx : u32,
@builtin(workgroup_id) workgroup_id : vec3,
@builtin(num_workgroups) num_workgroups : vec3`;
const globalIdxDefinition = is1DimensionDispatch ?
diff --git a/js/web/lib/wasm/jsep/webgpu/ops/conv-grouped.ts b/js/web/lib/wasm/jsep/webgpu/ops/conv-grouped.ts
index 14482272bad38..21b4953d3f90c 100644
--- a/js/web/lib/wasm/jsep/webgpu/ops/conv-grouped.ts
+++ b/js/web/lib/wasm/jsep/webgpu/ops/conv-grouped.ts
@@ -3,9 +3,9 @@
import {TensorView} from '../../tensor-view';
import {ShapeUtil} from '../../util';
-import {ProgramInfo} from '../types';
+import {ProgramInfo, ProgramUniform} from '../types';
-import {inputVariable, outputVariable, ShaderHelper} from './common';
+import {createTensorShapeVariables, getMaxComponents, inputVariable, outputVariable, ShaderHelper} from './common';
import {calculateOutputShape, ConvAttributes} from './conv';
import {getActivationSnippet} from './fuse-utils';
@@ -95,3 +95,98 @@ export const createGroupedConvProgramInfo =
getShaderSource,
};
};
+
+export const createGroupedConvVectorizeProgramInfo =
+ (inputs: readonly TensorView[], attributes: ConvAttributes, outputShape: readonly number[]): ProgramInfo => {
+ const hasBias = inputs.length > 2;
+ const components = getMaxComponents(outputShape[3]);
+ const outputNumber = getMaxComponents(outputShape[2]);
+ const outputSize = ShapeUtil.size(outputShape) / components / outputNumber;
+ const xShape = [inputs[0].dims[0], inputs[0].dims[1], inputs[0].dims[2], inputs[0].dims[3] / components];
+ const wShape = [inputs[1].dims[0], inputs[1].dims[1], inputs[1].dims[2], inputs[1].dims[3] / components];
+ const outputShapeInShader = [outputShape[0], outputShape[1], outputShape[2], outputShape[3] / components];
+
+ const programUniforms: ProgramUniform[] = [
+ {type: 'uint32', data: outputSize}, {type: 'int32', data: attributes.strides},
+ {type: 'int32', data: attributes.pads}, ...createTensorShapeVariables(xShape),
+ ...createTensorShapeVariables(wShape), ...createTensorShapeVariables(outputShapeInShader)
+ ];
+ const xNumber = (outputNumber - 1) * attributes.strides[1] + wShape[1];
+ const getShaderSource = (shaderHelper: ShaderHelper) => {
+ const output = outputVariable('output', inputs[0].dataType, outputShapeInShader.length, components);
+ const {activationFunction, applyActivation} = getActivationSnippet(attributes, output.type.value);
+ const x = inputVariable('x', inputs[0].dataType, xShape.length, components);
+ const w = inputVariable('w', inputs[1].dataType, wShape.length, components);
+ const inputVars = [x, w];
+ if (hasBias) {
+ inputVars.push(inputVariable('b', inputs[2].dataType, inputs[2].dims, components));
+ }
+ const processBias = hasBias ? 'value += b[output_channel];' : '';
+
+ return `
+ ${
+ shaderHelper.registerUniform('output_size', 'u32')
+ .registerUniform('strides', 'i32', 2)
+ .registerUniform('pads', 'i32', 2)
+ .declareVariables(...inputVars, output)}
+ ${activationFunction}
+ ${shaderHelper.mainStart()}
+ ${shaderHelper.guardAgainstOutOfBoundsWorkgroupSizes('uniforms.output_size')}
+ let width0 = uniforms.output_shape[3];
+ let output_channel = global_idx % width0;
+ var index1 = global_idx / width0;
+ let width1 = uniforms.output_shape[2] / ${outputNumber}u;
+ let col = (index1 % width1) * ${outputNumber}u;
+ index1 = index1 / width1;
+ let row = index1 % uniforms.output_shape[1];
+ let batch = index1 / uniforms.output_shape[1];
+
+ let x_corner = vec2(i32(row), i32(col)) * uniforms.strides - uniforms.pads;
+
+ var x_vals: array<${x.type.value}, ${xNumber}>;
+ var values: array<${output.type.value}, ${outputNumber}>;
+ let input_channel = output_channel;
+ // Use constant instead of uniform can give better performance for w's height/width.
+ for (var w_height: u32 = 0u; w_height < ${wShape[0]}; w_height++) {
+ let x_height = x_corner.x + i32(w_height);
+ if (x_height >= 0 || u32(x_height) < uniforms.x_shape[1]) {
+ for (var i = 0; i < ${xNumber}; i++) {
+ let x_width = x_corner.y + i;
+ if (x_width >= 0 && u32(x_width) < uniforms.x_shape[2]) {
+ x_vals[i] = ${x.get('batch', 'u32(x_height)', 'u32(x_width)', 'input_channel')};
+ } else {
+ x_vals[i] = ${x.type.value}(0);
+ }
+ }
+ for (var w_width: u32 = 0u; w_width < ${wShape[1]}; w_width++) {
+ let w_val = ${w.get('w_height', 'w_width', '0', 'output_channel')};
+ for (var i = 0u; i < ${outputNumber}u; i++) {
+ values[i] = fma(x_vals[i * ${attributes.strides[1]}u + w_width], w_val, values[i]);
+ }
+ }
+ }
+ }
+
+ for (var i = 0u; i < ${outputNumber}u; i++) {
+ var value = values[i];
+ ${processBias}
+ ${applyActivation}
+ ${output.set('batch', 'row', 'col + i', 'output_channel', 'value')};
+ }
+ }`;
+ };
+
+ return {
+ name: 'GroupedConv-Vectorize',
+ shaderCache: {
+ hint: `${attributes.activationCacheKey};${components};${outputNumber};${xNumber};${wShape[0]};${wShape[1]}`,
+ inputDependencies: hasBias ? ['rank', 'rank', 'type'] : ['rank', 'rank']
+ },
+ getRunData: () => ({
+ outputs: [{dims: outputShape, dataType: inputs[0].dataType}],
+ dispatchGroup: {x: Math.ceil(outputSize / 64 /* workgroup size */)},
+ programUniforms
+ }),
+ getShaderSource,
+ };
+ };
diff --git a/js/web/lib/wasm/jsep/webgpu/ops/conv.ts b/js/web/lib/wasm/jsep/webgpu/ops/conv.ts
index 33a5db7ff6b25..7af2c5db49f40 100644
--- a/js/web/lib/wasm/jsep/webgpu/ops/conv.ts
+++ b/js/web/lib/wasm/jsep/webgpu/ops/conv.ts
@@ -8,7 +8,7 @@ import {ComputeContext} from '../types';
import {createConv2DMatMulProgramInfo} from './3rd-party/conv2d_mm_webgpu';
import {createMatmulProgramInfo} from './3rd-party/matmul_packed_webgpu';
-import {createGroupedConvProgramInfo} from './conv-grouped';
+import {createGroupedConvProgramInfo, createGroupedConvVectorizeProgramInfo} from './conv-grouped';
import {InternalActivationAttributes, parseInternalActivationAttributes} from './fuse-utils';
import {createNaiveMatmulProgramInfo} from './matmul';
import {createTransposeProgramInfo} from './transpose';
@@ -136,12 +136,36 @@ const conv2d = (context: ComputeContext, inputs: readonly TensorView[], attribut
// check attributes
// const hasPreluActivationWeights = false; /* TODO: add support for prelu activation weights */
+ const isChannelsLast = attributes.format === 'NHWC';
if (attributes.group !== 1) {
- context.compute(createGroupedConvProgramInfo(inputs, adjustedAttributes));
+ // Temporarily disable createGroupedConvVectorizeProgramInfo path due to bots failures with below two cases:
+ // [webgpu]Conv - conv - vectorize group - B
+ // [webgpu]Conv - conv - vectorize group - D
+ const disableGroupedConvVectorize = true;
+ if (!disableGroupedConvVectorize && isChannelsLast && inputs[1].dims[0] === attributes.group &&
+ inputs[1].dims[1] === 1 && attributes.dilations[0] === 1 && attributes.dilations[1] === 1) {
+ const outputShape = calculateOutputShape(
+ inputs[0].dims, inputs[1].dims, attributes.dilations, adjustedAttributes.pads, attributes.strides,
+ isChannelsLast);
+ const transposedWeight = (context.kernelCustomData.wT as TensorView | undefined) ??
+ context.compute(
+ createTransposeProgramInfo(inputs[1], weightTransposeAttribute),
+ {inputs: [1], outputs: [attributes.wIsConst ? -2 : -1]})[0];
+ if (attributes.wIsConst && !context.kernelCustomData.wT) {
+ context.kernelCustomData.wT = transposedWeight;
+ }
+ const convInputs = [inputs[0], transposedWeight];
+ if (inputs.length === 3) {
+ convInputs.push(inputs[2]);
+ }
+ context.compute(
+ createGroupedConvVectorizeProgramInfo(convInputs, adjustedAttributes, outputShape), {inputs: convInputs});
+ } else {
+ context.compute(createGroupedConvProgramInfo(inputs, adjustedAttributes));
+ }
return;
}
- const isChannelsLast = attributes.format === 'NHWC';
const hasBias = inputs.length === 3;
const inputHeight = inputs[0].dims[isChannelsLast ? 1 : 2];
const inputWidth = inputs[0].dims[isChannelsLast ? 2 : 3];
diff --git a/js/web/lib/wasm/jsep/webgpu/ops/gemm.ts b/js/web/lib/wasm/jsep/webgpu/ops/gemm.ts
index 1c5d28e4b8e3f..30754c84413b7 100644
--- a/js/web/lib/wasm/jsep/webgpu/ops/gemm.ts
+++ b/js/web/lib/wasm/jsep/webgpu/ops/gemm.ts
@@ -3,10 +3,10 @@
import {TensorView} from '../../tensor-view';
import {GemmUtil, ShapeUtil} from '../../util';
-import {AttributeWithCacheKey, createAttributeWithCacheKey} from '../attribute-with-cache-key';
-import {ComputeContext, ProgramInfo} from '../types';
+import {AttributeWithCacheKey} from '../attribute-with-cache-key';
+import {ComputeContext, ProgramInfo, ProgramInputTensorInfoDependency, ProgramUniform} from '../types';
-import {ShaderHelper, tensorTypeToWsglStorageType} from './common';
+import {createTensorShapeVariables, IndicesHelper, inputVariable, outputVariable, ShaderHelper, UniformsArrayType} from './common';
const validateInputs = (inputs: readonly TensorView[]): void => {
if (!inputs) {
@@ -34,25 +34,6 @@ export interface GemmAttributes extends AttributeWithCacheKey {
beta: number;
}
-const offsetC = (m: number, n: number, dims: readonly number[]): string => {
- if (dims.length === 0) {
- return '0u';
- }
-
- const broadcastM = (dims.length === 1 && m !== 1) || (dims.length === 2 && dims[0] !== m);
- const broadcastN = dims[dims.length - 1] !== n;
-
- let offset = '0u';
- if (!broadcastM) {
- offset += `+ m * ${dims[dims.length - 1]}u`;
- }
- if (!broadcastN) {
- offset += '+n';
- }
-
- return offset;
-};
-
const createGemmProgramInfo = (inputs: readonly TensorView[], attributes: GemmAttributes): ProgramInfo => {
const aShape = inputs[0].dims.slice();
const bShape = inputs[1].dims.slice();
@@ -63,68 +44,92 @@ const createGemmProgramInfo = (inputs: readonly TensorView[], attributes: GemmAt
throw new Error('Can\'t use gemm on the given tensors');
}
const outputSize = ShapeUtil.size(outputShape);
- let line = '';
- if (attributes.transA && attributes.transB) {
- line = 'value += a[k * M + m] * b[n * K + k];';
- } else if (attributes.transA && !attributes.transB) {
- line = 'value += a[k * M + m] * b[k * N + n];';
- } else if (!attributes.transA && attributes.transB) {
- line = 'value += a[m * K + k] * b[n * K + k];';
- } else if (!attributes.transA && !attributes.transB) {
- line = 'value += a[m * K + k] * b[k * N + n];';
- }
-
- const dataType = tensorTypeToWsglStorageType(inputs[0].dataType);
- const calculateAlpha = attributes.alpha === 1 ? '' : 'value *= alpha;';
- const calculateC = inputs.length === 3 ? `value += beta * c[${offsetC(M, N, inputs[2].dims)}];` : '';
- const inputStorageBuffersDeclarations = [
- `@group(0) @binding(0) var a : array<${dataType}>;`,
- `@group(0) @binding(1) var b : array<${dataType}>;`
+ const programUniforms: ProgramUniform[] = [
+ {type: 'uint32', data: outputSize}, {type: 'uint32', data: M}, {type: 'uint32', data: N}, {type: 'uint32', data: K},
+ {type: 'float32', data: attributes.alpha}, {type: 'float32', data: attributes.beta}
];
+ const inputDependencies: ProgramInputTensorInfoDependency[] = ['type', 'type'];
if (inputs.length === 3) {
- inputStorageBuffersDeclarations.push(`@group(0) @binding(2) var c : array<${dataType}>;`);
+ programUniforms.push(...createTensorShapeVariables(inputs[2].dims));
+ inputDependencies.push('rank');
}
- const getShaderSource = (shaderHelper: ShaderHelper) => `
- const M: u32 = ${M}u;
- const N: u32 = ${N}u;
- const K: u32 = ${K}u;
- const alpha = ${dataType}(${attributes.alpha});
- const beta = ${dataType}(${attributes.beta});
+ programUniforms.push(...createTensorShapeVariables(outputShape));
+
+ const getShaderSource = (shaderHelper: ShaderHelper) => {
+ let line = '';
+ if (attributes.transA && attributes.transB) {
+ line = 'value += a[k * uniforms.M + m] * b[n * uniforms.K + k];';
+ } else if (attributes.transA && !attributes.transB) {
+ line = 'value += a[k * uniforms.M + m] * b[k * uniforms.N + n];';
+ } else if (!attributes.transA && attributes.transB) {
+ line = 'value += a[m * uniforms.K + k] * b[n * uniforms.K + k];';
+ } else if (!attributes.transA && !attributes.transB) {
+ line = 'value += a[m * uniforms.K + k] * b[k * uniforms.N + n];';
+ }
- ${inputStorageBuffersDeclarations.join('\n')}
- @group(0) @binding(${inputs.length}) var output : array<${dataType}>;
+ const calculateAlpha = attributes.alpha === 1 ? '' : 'value *= uniforms.alpha;';
+ const a = inputVariable('a', inputs[0].dataType, inputs[0].dims);
+ const b = inputVariable('b', inputs[1].dataType, inputs[1].dims);
+ const dataType = a.type.value;
+ let c: IndicesHelper|null = null;
+ const variables = [a, b];
+ if (inputs.length === 3) {
+ c = inputVariable('c', inputs[2].dataType, inputs[2].dims.length);
+ variables.push(c);
+ }
+ const output = outputVariable('output', inputs[0].dataType, outputShape.length);
+ variables.push(output);
+ const uniforms: UniformsArrayType = [
+ {name: 'output_size', type: 'u32'}, {name: 'M', type: 'u32'}, {name: 'N', type: 'u32'}, {name: 'K', type: 'u32'},
+ {name: 'alpha', type: 'f32'}, {name: 'beta', type: 'f32'}
+ ];
+ return `
+ ${shaderHelper.registerUniforms(uniforms).declareVariables(...variables)}
${shaderHelper.mainStart()}
- ${shaderHelper.guardAgainstOutOfBoundsWorkgroupSizes(outputSize)}
+ ${shaderHelper.guardAgainstOutOfBoundsWorkgroupSizes('uniforms.output_size')}
- let m = global_idx / N;
- let n = global_idx % N;
+ let m = global_idx / uniforms.N;
+ let n = global_idx % uniforms.N;
var value = ${dataType}(0);
- for (var k: u32 = 0u; k<${K}u; k++) {
+ for (var k: u32 = 0u; k < uniforms.K; k++) {
${line}
}
${calculateAlpha}
- ${calculateC}
+ ${(() => {
+ if (c != null) {
+ return `let cOffset = ${c.broadcastedIndicesToOffset('vec2(m, n)', output)}; value += uniforms.beta * ${
+ c.getByOffset('cOffset')};`;
+ }
+ return '';
+ })()}
output[global_idx] = value;
-
}`;
+ };
+
return {
name: 'Gemm',
- shaderCache: {hint: attributes.cacheKey},
+ shaderCache: {hint: `${attributes.cacheKey}`, inputDependencies},
getRunData: () => ({
outputs: [{dims: outputShape, dataType: inputs[0].dataType}],
- dispatchGroup: {x: Math.ceil(outputSize / 64 /* workgroup size */)}
+ dispatchGroup: {x: Math.ceil(outputSize / 64 /* workgroup size */)},
+ programUniforms
}),
getShaderSource,
};
};
+export const parseGemmAttributes = (attributes: Record): GemmAttributes => {
+ const transA = attributes.transA as boolean;
+ const transB = attributes.transB as boolean;
+ const alpha = attributes.alpha as number;
+ const beta = attributes.beta as number;
+ return {transA, transB, alpha, beta, cacheKey: `${attributes.transA};${attributes.transB};${attributes.alpha === 1}`};
+};
+
export const gemm = (context: ComputeContext, attributes: GemmAttributes): void => {
validateInputs(context.inputs);
context.compute(createGemmProgramInfo(context.inputs, attributes));
};
-
-export const parseGemmAttributes = (attributes: Record): GemmAttributes =>
- createAttributeWithCacheKey(attributes as Omit);
diff --git a/js/web/lib/wasm/jsep/webgpu/ops/instance-norm.ts b/js/web/lib/wasm/jsep/webgpu/ops/instance-norm.ts
index 3a84844544c96..056dd54d54591 100644
--- a/js/web/lib/wasm/jsep/webgpu/ops/instance-norm.ts
+++ b/js/web/lib/wasm/jsep/webgpu/ops/instance-norm.ts
@@ -4,58 +4,56 @@
import {DataType} from '../../../wasm-common';
import {TensorView} from '../../tensor-view';
import {ShapeUtil} from '../../util';
-import {AttributeWithCacheKey, createAttributeWithCacheKey} from '../attribute-with-cache-key';
-import {ComputeContext, ProgramInfo} from '../types';
+import {ComputeContext, ProgramInfo, ProgramInputTensorInfoDependency, ProgramUniform} from '../types';
-import {fillVector, getMaxComponents, inputVariable, outputVariable, ShaderHelper, sumVector, tensorTypeToWsglStorageType} from './common';
+import {createTensorShapeVariables, fillVector, getMaxComponents, inputVariable, outputVariable, ShaderHelper, sumVector, tensorTypeToWsglStorageType, UniformsArrayType} from './common';
-export interface InstanceNormAttributes extends AttributeWithCacheKey {
+export interface InstanceNormAttributes {
epsilon: number;
format: 'NHWC'|'NCHW';
}
-const metadata = {
- name: 'InstanceNormalization'
-};
-
const createInstanceNormProgramInfo =
(inputs: readonly TensorView[], attributes: InstanceNormAttributes): ProgramInfo => {
const xShape = inputs[0].dims;
-
const outputShape = xShape;
const axis = 2;
const normCount = ShapeUtil.sizeToDimension(xShape, axis);
const normSize = ShapeUtil.sizeFromDimension(xShape, axis);
const components = getMaxComponents(normSize);
const normPackedSize = normSize / components;
- const C = xShape[1];
- const x = inputVariable('x', inputs[0].dataType, [xShape[0], xShape[1], normPackedSize], components);
- const scale = inputVariable('scale', inputs[1].dataType, inputs[1].dims);
- const bias = inputVariable('bias', inputs[2].dataType, inputs[2].dims);
- const output = outputVariable('output', inputs[0].dataType, [xShape[0], xShape[1], normPackedSize], components);
- const variables = [x, scale, bias, output];
- const dataType = x.type.value;
- const f32Type = components === 1 ? 'f32' : `vec${components}`;
- const workgroupSize = 64;
- const getShaderSource = (shaderHelper: ShaderHelper) => `
-
- const C: u32 = ${C};
- const normSize: u32 = ${normSize};
- const epsilon: f32 = ${attributes.epsilon};
+ const inputShape = [xShape[0], xShape[1], normPackedSize];
+ const inputDependencies: ProgramInputTensorInfoDependency[] = ['rank', 'type', 'type'];
+ const programUniforms: ProgramUniform[] =
+ [{type: 'uint32', data: normSize}, {type: 'uint32', data: normPackedSize}];
+ programUniforms.push(...createTensorShapeVariables(inputShape), ...createTensorShapeVariables(inputShape));
+
+ const getShaderSource = (shaderHelper: ShaderHelper) => {
+ const x = inputVariable('x', inputs[0].dataType, inputShape.length, components);
+ const scale = inputVariable('scale', inputs[1].dataType, inputs[1].dims);
+ const bias = inputVariable('bias', inputs[2].dataType, inputs[2].dims);
+ const output = outputVariable('output', inputs[0].dataType, inputShape.length, components);
+ const variables = [x, scale, bias, output];
+ const dataType = x.type.value;
+ const f32Type = components === 1 ? 'f32' : `vec${components}`;
+ const workgroupSize = 64;
+
+ const uniforms: UniformsArrayType = [{name: 'normSize', type: 'u32'}, {name: 'normPackedSize', type: 'u32'}];
+ return `
var meanShared : f32;
var squaredNormShared : f32;
var workgroupShared : array<${f32Type}, ${workgroupSize}>;
const workgroupSize = ${workgroupSize}u;
- ${shaderHelper.declareVariables(...variables)}
+ ${shaderHelper.registerUniforms(uniforms).declareVariables(...variables)}
${shaderHelper.mainStart(workgroupSize)}
let norm = global_idx / workgroupSize;
- let batch = norm / C;
- let channel = norm % C;
+ let batch = norm / uniforms.x_shape[1];
+ let channel = norm % uniforms.x_shape[1];
let localIndex = local_id.x;
// initialize workgroup memory
var initial = ${f32Type}(0);
- for (var h = localIndex; h < ${normPackedSize}; h += workgroupSize) {
+ for (var h = localIndex; h < uniforms.normPackedSize; h += workgroupSize) {
initial = initial + ${f32Type}(${x.get('batch', 'channel', 'h')});
}
workgroupShared[localIndex] = initial;
@@ -69,13 +67,13 @@ const createInstanceNormProgramInfo =
workgroupBarrier();
}
if (localIndex == 0) {
- meanShared = ${sumVector('workgroupShared[0]', components)} / f32(normSize);
+ meanShared = ${sumVector('workgroupShared[0]', components)} / f32(uniforms.normSize);
}
workgroupBarrier();
// reinitialize workgroup memory.
initial = ${f32Type}(0);
- for (var h = localIndex; h < ${normPackedSize}; h += workgroupSize) {
+ for (var h = localIndex; h < uniforms.normPackedSize; h += workgroupSize) {
let deviation = ${f32Type}(${x.get('batch', 'channel', 'h')}) - ${f32Type}(meanShared);
initial = initial + deviation * deviation;
}
@@ -94,23 +92,26 @@ const createInstanceNormProgramInfo =
}
workgroupBarrier();
- let invStdDev = 1 / sqrt(squaredNormShared / f32(normSize) + epsilon);
+ let invStdDev = 1 / sqrt(squaredNormShared / f32(uniforms.normSize) + f32(${attributes.epsilon}));
let channelScale = invStdDev * f32(${scale.getByOffset('channel')});
let channelShift = f32(${bias.getByOffset('channel')}) - meanShared * channelScale;
- for (var h = localIndex; h < ${normPackedSize}; h += workgroupSize) {
+ for (var h = localIndex; h < uniforms.normPackedSize; h += workgroupSize) {
let value = ${x.get('batch', 'channel', 'h')} * ${dataType}(${f32Type}(channelScale)) + ${dataType}(${
- f32Type}(channelShift));
+ f32Type}(channelShift));
${output.set('batch', 'channel', 'h', 'value')};
}
}`;
+ };
return {
- ...metadata,
- shaderCache: {hint: attributes.cacheKey},
+ ...{name: 'InstanceNormalization'},
+ // TODO: use epsilon as uniform. Currently epsilon as uniform fails test_instancenorm_epsilon.
+ shaderCache: {hint: `${attributes.epsilon};${components}`, inputDependencies},
getRunData: () => ({
outputs: [
{dims: outputShape, dataType: inputs[0].dataType},
],
- dispatchGroup: {x: normCount}
+ dispatchGroup: {x: normCount},
+ programUniforms
}),
getShaderSource,
};
@@ -120,10 +121,6 @@ const computeMean =
(context: ComputeContext, input: TensorView, scale: TensorView, bias: TensorView, n: number, h: number, c: number,
epsilon: number) => {
const components = getMaxComponents(c);
- const inputHelper = inputVariable('input', input.dataType, input.dims, components);
- const scaleHelper = inputVariable('scale', scale.dataType, scale.dims, components);
- const biasHelper = inputVariable('bias', bias.dataType, bias.dims, components);
-
const WG = 64;
// we will store channel scale and channel shift in [2, components] matrix
// or in vec2 when components == 1
@@ -133,65 +130,79 @@ const computeMean =
const unitsOfWork = n * c / components;
const wgSize = Math.ceil(h / WG);
- const getMeanShaderSource = (shaderHelper: ShaderHelper) => `
- const H: u32 = ${h};
- const C: u32 = ${c / components};
- const imageSize: u32 = ${h * c / components};
+ const meanInputDependencies: ProgramInputTensorInfoDependency[] = ['type'];
+ const meanProgramUniforms: ProgramUniform[] = [
+ {type: 'uint32', data: wgSize}, {type: 'uint32', data: h}, {type: 'uint32', data: Math.floor(c / components)},
+ {type: 'uint32', data: Math.floor(h * c / components)}
+ ];
+ const getMeanShaderSource = (shaderHelper: ShaderHelper) => {
+ const inputHelper = inputVariable('input', input.dataType, input.dims, components);
+ return `
${shaderHelper.declareVariables(inputHelper)}
@group(0) @binding(1) var output : array<${outputType}>;
+ struct Uniforms {wg_size:u32, H:u32, C:u32, image_size:u32};
+ @group(0) @binding(2) var uniforms: Uniforms;
${shaderHelper.mainStart(WG)}
- let currentImageNumber = global_idx / ${WG} / C;
- let currentChannelNumber = (global_idx / ${WG}) % C;
+ let currentImageNumber = global_idx / ${WG} / uniforms.C;
+ let currentChannelNumber = (global_idx / ${WG}) % uniforms.C;
let wgId = global_idx % ${WG};
- let wgOffset = wgId * ${wgSize};
- if (wgOffset >= H) {
+ let wgOffset = wgId * uniforms.wg_size;
+ if (wgOffset >= uniforms.H) {
return;
}
- let wgMax = min(wgOffset + ${wgSize}, H);
+ let wgMax = min(wgOffset + uniforms.wg_size, uniforms.H);
- let offset = currentImageNumber * imageSize + currentChannelNumber;
+ let offset = currentImageNumber * uniforms.image_size + currentChannelNumber;
var sum = ${fillVector('f32', components)};
var squaredSum = ${fillVector('f32', components)};
for (var i: u32 = wgOffset; i < wgMax; i++) {
- let value = ${sumCastType}(input[offset + i * C]);
+ let value = ${sumCastType}(input[offset + i * uniforms.C]);
sum += value;
squaredSum += value * value;
}
output[global_idx] = ${setOutputValue('sum', 'squaredSum')};
}`;
+ };
const meanValues = context.compute(
{
name: 'InstanceNormComputeMean',
- shaderCache: {hint: JSON.stringify({components, n, h, c})},
+ shaderCache: {hint: `${components}`, inputDependencies: meanInputDependencies},
getRunData: () => ({
outputs: [
{dims: [n, c, WG, 2], dataType: DataType.float},
],
dispatchGroup: {x: n * c / components},
+ programUniforms: meanProgramUniforms
}),
getShaderSource: getMeanShaderSource,
},
{inputs: [input], outputs: [-1]})[0];
- const getShaderSource = (shaderHelper: ShaderHelper) => `
- const H: u32 = ${h};
- const C: u32 = ${c / components};
- const imageSize: u32 = ${WG * c / components};
- const epsilon: f32 = ${epsilon};
+ const programUniforms: ProgramUniform[] = [
+ {type: 'uint32', data: unitsOfWork}, {type: 'uint32', data: h},
+ {type: 'uint32', data: Math.floor(c / components)}, {type: 'uint32', data: Math.floor(WG * c / components)}
+ ];
+ const inputDependencies: ProgramInputTensorInfoDependency[] = ['type', 'type', 'type'];
+ const getShaderSource = (shaderHelper: ShaderHelper) => {
+ const scaleHelper = inputVariable('scale', scale.dataType, scale.dims, components);
+ const biasHelper = inputVariable('bias', bias.dataType, bias.dims, components);
+ return `
@group(0) @binding(0) var input : array<${outputType}>;
@group(0) @binding(1) var scale : array<${scaleHelper.type.storage}>;
@group(0) @binding(2) var bias : array<${biasHelper.type.storage}>;
@group(0) @binding(3) var output : array<${outputType}>;
+ struct Uniforms {units_of_work : u32, H: u32, C : u32, image_size : u32};
+ @group(0) @binding(4) var uniforms: Uniforms;
${shaderHelper.mainStart()}
- ${shaderHelper.guardAgainstOutOfBoundsWorkgroupSizes(unitsOfWork)}
- let currentImageNumber = global_idx / C;
- let currentChannelNumber = global_idx % C;
+ ${shaderHelper.guardAgainstOutOfBoundsWorkgroupSizes('uniforms.units_of_work')}
+ let currentImageNumber = global_idx / uniforms.C;
+ let currentChannelNumber = global_idx % uniforms.C;
- let offset = currentImageNumber * imageSize;
+ let offset = currentImageNumber * uniforms.image_size;
var sum = ${fillVector('f32', components)};
var squaredSum = ${fillVector('f32', components)};
for (var i: u32 = 0; i < ${WG}; i++) {
@@ -199,24 +210,26 @@ const computeMean =
sum += value[0];
squaredSum += value[1];
}
- sum = sum / f32(H);
- squaredSum = squaredSum / f32(H);
- let invStdDev = 1 / sqrt(squaredSum - sum * sum + epsilon);
+ sum = sum / f32(uniforms.H);
+ squaredSum = squaredSum / f32(uniforms.H);
+ let invStdDev = 1 / sqrt(squaredSum - sum * sum + f32(${epsilon}));
let channelScale = invStdDev * ${sumCastType}(scale[currentChannelNumber]);
let channelShift = ${sumCastType}(bias[currentChannelNumber]) - sum * channelScale;
output[global_idx] = ${setOutputValue('channelScale', 'channelShift')};
}`;
-
+ };
return context.compute(
{
name: 'InstanceNormComputeChannelScaleShift',
- shaderCache: {hint: JSON.stringify({components, n, h, c, epsilon})},
+ // TODO: use epsilon as uniform. Currently epsilon as uniform fails test_instancenorm_epsilon.
+ shaderCache: {hint: `${components};${epsilon}`, inputDependencies},
getRunData: () => ({
outputs: [
{dims: [n, c, 2], dataType: DataType.float},
],
dispatchGroup: {x: Math.ceil(unitsOfWork / 64 /* workgroup size */)},
+ programUniforms
}),
getShaderSource,
},
@@ -230,50 +243,51 @@ const createInstanceNormNHWCProgramInfo =
const N = xShape[0];
const C = xShape[xShape.length - 1];
const H = ShapeUtil.sizeFromDimension(xShape, 1) / C;
-
const components = getMaxComponents(C);
const outputSize = ShapeUtil.size(outputShape) / components;
- const inputHelper = inputVariable('input', inputs[0].dataType, inputs[0].dims, components);
- const outputHelper = outputVariable('output', inputs[0].dataType, outputShape, components);
-
- const dataType = tensorTypeToWsglStorageType(inputs[0].dataType);
- const scaleType = components === 1 ? 'vec2f' : `mat2x${components}f`;
- const scaleCastType = components === 1 ? dataType : `vec${components}<${dataType}>`;
+ const programUniforms: ProgramUniform[] =
+ [{type: 'uint32', data: H}, {type: 'uint32', data: Math.floor(C / components)}];
+ const inputDependencies: ProgramInputTensorInfoDependency[] = ['type', 'type'];
// first compute mean
const channelScaleShift = computeMean(context, inputs[0], inputs[1], inputs[2], N, H, C, attributes.epsilon);
+ const getShaderSource = (shaderHelper: ShaderHelper) => {
+ const dataType = tensorTypeToWsglStorageType(inputs[0].dataType);
+ const scaleType = components === 1 ? 'vec2f' : `mat2x${components}f`;
+ const scaleCastType = components === 1 ? dataType : `vec${components}<${dataType}>`;
- const getShaderSource = (shaderHelper: ShaderHelper) => `
- const H: u32 = ${H};
- const C: u32 = ${C / components};
+ const inputHelper = inputVariable('input', inputs[0].dataType, inputs[0].dims, components);
+ const outputHelper = outputVariable('output', inputs[0].dataType, outputShape, components);
+ return `
@group(0) @binding(0) var input : array<${inputHelper.type.storage}>;
@group(0) @binding(1) var scaleInput : array<${scaleType}>;
@group(0) @binding(2) var output : array<${outputHelper.type.storage}>;
+ struct Uniforms {H: u32, C : u32};
+ @group(0) @binding(3) var uniforms: Uniforms;
${shaderHelper.mainStart()}
- let currentImageNumber = global_idx / (C * H);
- let currentChannelNumber = global_idx % C;
+ let currentImageNumber = global_idx / (uniforms.C * uniforms.H);
+ let currentChannelNumber = global_idx % uniforms.C;
- let scaleOffset = currentImageNumber * C + currentChannelNumber;
+ let scaleOffset = currentImageNumber * uniforms.C + currentChannelNumber;
let scale = scaleInput[scaleOffset];
output[global_idx] = fma(input[global_idx], ${scaleCastType}(scale[0]), ${scaleCastType}(scale[1]));
}`;
+ };
context.compute(
{
- name: 'InstanceNormalization',
- shaderCache: {hint: `${attributes.cacheKey}`},
+ name: 'InstanceNormalizationNHWC',
+ shaderCache: {hint: `${components}`, inputDependencies},
getRunData: () => ({
outputs: [{dims: outputShape, dataType: inputs[0].dataType}],
- dispatchGroup: {x: Math.ceil(outputSize / 64 /* workgroup size */)}
+ dispatchGroup: {x: Math.ceil(outputSize / 64 /* workgroup size */)},
+ programUniforms
}),
getShaderSource,
},
{inputs: [inputs[0], channelScaleShift]});
};
-export const parseInstanceNormAttributes = (attributes: InstanceNormAttributes): InstanceNormAttributes =>
- createAttributeWithCacheKey({epsilon: attributes.epsilon, format: attributes.format});
-
export const instanceNorm = (context: ComputeContext, attributes: InstanceNormAttributes): void => {
if (attributes.format === 'NHWC') {
createInstanceNormNHWCProgramInfo(context, context.inputs, attributes);
diff --git a/js/web/lib/wasm/jsep/webgpu/ops/layer-norm.ts b/js/web/lib/wasm/jsep/webgpu/ops/layer-norm.ts
index 8a9eeecf2c68d..bc446079faf8f 100644
--- a/js/web/lib/wasm/jsep/webgpu/ops/layer-norm.ts
+++ b/js/web/lib/wasm/jsep/webgpu/ops/layer-norm.ts
@@ -4,12 +4,11 @@
import {DataType} from '../../../wasm-common';
import {TensorView} from '../../tensor-view';
import {ShapeUtil} from '../../util';
-import {AttributeWithCacheKey, createAttributeWithCacheKey} from '../attribute-with-cache-key';
-import {ComputeContext, ProgramInfo} from '../types';
+import {ComputeContext, ProgramInfo, ProgramInputTensorInfoDependency, ProgramUniform} from '../types';
-import {castToF32, fillVector, getMaxComponents, inputVariable, outputVariable, ShaderHelper, sumVector, tensorTypeToWsglStorageType,} from './common';
+import {castToF32, fillVector, getMaxComponents, inputVariable, outputVariable, ShaderHelper, sumVector, tensorTypeToWsglStorageType, UniformsArrayType,} from './common';
-export interface LayerNormAttributes extends AttributeWithCacheKey {
+interface LayerNormAttributes {
axis: number;
epsilon: number;
}
@@ -39,7 +38,7 @@ const createLayerNormProgramInfo =
Got scale size of ${scaleSize} and bias size of ${biasSize}`);
}
- const meanInvStdDevDim = [];
+ const meanInvStdDevDim: number[] = [];
for (let i = 0; i < xShape.length; ++i) {
if (i < axis) {
meanInvStdDevDim.push(xShape[i]);
@@ -47,50 +46,57 @@ const createLayerNormProgramInfo =
meanInvStdDevDim.push(1);
}
}
-
const components = getMaxComponents(normSize);
- const dataType = tensorTypeToWsglStorageType(inputs[0].dataType);
- const variables = [
- inputVariable('x', inputs[0].dataType, inputs[0].dims, components),
- inputVariable('scale', scale.dataType, scale.dims, components),
+ const inputDependencies: ProgramInputTensorInfoDependency[] = ['type', 'type'];
+ const programUniforms: ProgramUniform[] = [
+ {type: 'uint32', data: normCount}, {type: 'float32', data: normSize},
+ {type: 'uint32', data: Math.floor(normSize / components)}, {type: 'float32', data: attributes.epsilon}
];
if (bias) {
- variables.push(inputVariable('bias', bias.dataType, bias.dims, components));
+ inputDependencies.push('type');
}
- variables.push(outputVariable('output', inputs[0].dataType, outputShape, components));
-
const hasMeanDataOutput = outputCount > 1;
const hasInvStdOutput = outputCount > 2;
- if (hasMeanDataOutput) {
- variables.push(outputVariable('meanDataOutput', DataType.float, meanInvStdDevDim));
- }
- if (hasInvStdOutput) {
- variables.push(outputVariable('invStdOutput', DataType.float, meanInvStdDevDim));
- }
-
- const getShaderSource = (shaderHelper: ShaderHelper) => `
- const normSize: f32 = ${normSize};
- const normSizeVectorized: u32 = ${normSize / components};
- const epsilon: f32 = ${attributes.epsilon};
+ const getShaderSource = (shaderHelper: ShaderHelper) => {
+ const dataType = tensorTypeToWsglStorageType(inputs[0].dataType);
+ const variables = [
+ inputVariable('x', inputs[0].dataType, inputs[0].dims, components),
+ inputVariable('scale', scale.dataType, scale.dims, components),
+ ];
+ if (bias) {
+ variables.push(inputVariable('bias', bias.dataType, bias.dims, components));
+ }
+ variables.push(outputVariable('output', inputs[0].dataType, outputShape, components));
+ if (hasMeanDataOutput) {
+ variables.push(outputVariable('mean_data_output', DataType.float, meanInvStdDevDim));
+ }
+ if (hasInvStdOutput) {
+ variables.push(outputVariable('inv_std_output', DataType.float, meanInvStdDevDim));
+ }
- ${shaderHelper.declareVariables(...variables)}
+ const uniforms: UniformsArrayType = [
+ {name: 'norm_count', type: 'u32'}, {name: 'norm_size', type: 'f32'},
+ {name: 'norm_size_vectorized', type: 'u32'}, {name: 'epsilon', type: 'f32'}
+ ];
+ return `
+ ${shaderHelper.registerUniforms(uniforms).declareVariables(...variables)}
${shaderHelper.mainStart()}
- ${shaderHelper.guardAgainstOutOfBoundsWorkgroupSizes(normCount)}
- let offset = global_idx * normSizeVectorized;
+ ${shaderHelper.guardAgainstOutOfBoundsWorkgroupSizes('uniforms.norm_count')}
+ let offset = global_idx * uniforms.norm_size_vectorized;
var meanVector = ${fillVector('f32', components)};
var meanSquareVector = ${fillVector('f32', components)};
- for (var h: u32 = 0u; h < normSizeVectorized; h++) {
+ for (var h: u32 = 0u; h < uniforms.norm_size_vectorized; h++) {
let value = ${castToF32(dataType, components, 'x[h + offset]')};
meanVector += value;
meanSquareVector += value * value;
}
- let mean = ${sumVector('meanVector', components)} / normSize;
- let meanSquare = sqrt(${sumVector('meanSquareVector', components)}
- / normSize - mean * mean + epsilon);
+ let mean = ${sumVector('meanVector', components)} / uniforms.norm_size;
+ let meanSquare = sqrt(${sumVector('meanSquareVector', components)}
+ / uniforms.norm_size - mean * mean + uniforms.epsilon);
- for (var j: u32 = 0; j < normSizeVectorized; j++) {
+ for (var j: u32 = 0; j < uniforms.norm_size_vectorized; j++) {
let f32input = ${castToF32(dataType, components, 'x[j + offset]')};
let f32scale = ${castToF32(dataType, components, 'scale[j]')};
output[j + offset] = ${variables[0].type.value}((f32input - mean) / meanSquare * f32scale
@@ -98,9 +104,10 @@ const createLayerNormProgramInfo =
);
}
- ${hasMeanDataOutput ? 'meanDataOutput[global_idx] = mean' : ''};
- ${hasInvStdOutput ? 'invStdOutput[global_idx] = 1 / meanSquare' : ''};
+ ${hasMeanDataOutput ? 'mean_data_output[global_idx] = mean' : ''};
+ ${hasInvStdOutput ? 'inv_std_output[global_idx] = 1 / meanSquare' : ''};
}`;
+ };
const outputs = [{dims: outputShape, dataType: inputs[0].dataType}];
if (hasMeanDataOutput) {
outputs.push({dims: meanInvStdDevDim, dataType: DataType.float});
@@ -111,15 +118,13 @@ const createLayerNormProgramInfo =
return {
name: 'LayerNormalization',
- shaderCache: {hint: `${attributes.cacheKey}|${outputCount}|${inputs.length}`},
- getRunData: () => ({outputs, dispatchGroup: {x: Math.ceil(normCount / 64 /* workgroup size */)}}),
+ shaderCache: {hint: `${components};${outputCount}`, inputDependencies},
+ getRunData: () =>
+ ({outputs, dispatchGroup: {x: Math.ceil(normCount / 64 /* workgroup size */)}, programUniforms}),
getShaderSource,
};
};
-export const parseLayerNormAttributes = (attributes: LayerNormAttributes): LayerNormAttributes =>
- createAttributeWithCacheKey({axis: attributes.axis, epsilon: attributes.epsilon});
-
export const layerNorm = (context: ComputeContext, attributes: LayerNormAttributes): void => {
validateInputs(context.inputs);
context.compute(createLayerNormProgramInfo(context.inputs, attributes, context.outputCount));
diff --git a/js/web/lib/wasm/jsep/webgpu/ops/multi-head-attentiion.ts b/js/web/lib/wasm/jsep/webgpu/ops/multi-head-attentiion.ts
index b7726a36bcaad..6d22e3780efd9 100644
--- a/js/web/lib/wasm/jsep/webgpu/ops/multi-head-attentiion.ts
+++ b/js/web/lib/wasm/jsep/webgpu/ops/multi-head-attentiion.ts
@@ -4,10 +4,10 @@
import {TensorView} from '../../tensor-view';
import {ShapeUtil} from '../../util';
import {createAttributeWithCacheKey} from '../attribute-with-cache-key';
-import {ComputeContext, GpuDataType} from '../types';
+import {ComputeContext, GpuDataType, ProgramUniform} from '../types';
import {applyAttention, AttentionAttrs, AttentionMaskType, AttentionParameters, AttentionQkvFormat} from './attention';
-import {ShaderHelper, tensorTypeToWsglStorageType} from './common';
+import {inputVariable, outputVariable, ShaderHelper, UniformsArrayType} from './common';
import {createTransposeProgramInfo, TransposeAttributes} from './transpose';
const validateInputs = (inputs: readonly TensorView[], attributes: AttentionAttrs): AttentionParameters => {
@@ -228,7 +228,6 @@ const validateInputs = (inputs: readonly TensorView[], attributes: AttentionAttr
};
};
-
export const parseMultiHeadAttentionAttributes = (attributes: AttentionAttrs): AttentionAttrs =>
createAttributeWithCacheKey({...attributes});
@@ -239,30 +238,35 @@ const addBiasTranspose =
hiddenSize: number, biasOffset: number) => {
const outputShape = [batchSize, sequenceLength, hiddenSize];
const outputSize = ShapeUtil.size(outputShape);
-
- const dataType = tensorTypeToWsglStorageType(qkv.dataType);
- const getShaderSource = (shaderHelper: ShaderHelper) => `
- const biasOffset = ${biasOffset}u;
- const hiddenSize = ${hiddenSize}u;
-
- @group(0) @binding(0) var qkv: array<${dataType}>;
- @group(0) @binding(1) var bias: array<${dataType}>;
- @group(0) @binding(2) var qkv_with_bias: array<${dataType}>;
-
+ const programUniforms: ProgramUniform[] =
+ [{type: 'uint32', data: outputSize}, {type: 'uint32', data: biasOffset}, {type: 'uint32', data: hiddenSize}];
+
+ const getShaderSource = (shaderHelper: ShaderHelper) => {
+ const output = outputVariable('qkv_with_bias', qkv.dataType, outputShape);
+ const qkvInput = inputVariable('qkv', qkv.dataType, outputShape);
+ const biasInput = inputVariable('bias', bias.dataType, outputShape);
+
+ const uniforms: UniformsArrayType = [
+ {name: 'output_size', type: 'u32'}, {name: 'bias_offset', type: 'u32'}, {name: 'hidden_size', type: 'u32'}
+ ];
+ return `
+ ${shaderHelper.registerUniforms(uniforms).declareVariables(qkvInput, biasInput, output)}
${shaderHelper.mainStart()}
- ${shaderHelper.guardAgainstOutOfBoundsWorkgroupSizes(outputSize)}
- let biasOffsetIdx = (global_idx % hiddenSize) + biasOffset;
+ ${shaderHelper.guardAgainstOutOfBoundsWorkgroupSizes('uniforms.output_size')}
+ let bias_offset_idx = (global_idx % uniforms.hidden_size) + uniforms.bias_offset;
- qkv_with_bias[global_idx] = qkv[global_idx] + bias[biasOffsetIdx];
+ qkv_with_bias[global_idx] = qkv[global_idx] + bias[bias_offset_idx];
}`;
+ };
return context.compute(
{
name: 'MultiHeadAttentionAddBias',
- shaderCache: {hint: JSON.stringify({batchSize, sequenceLength, hiddenSize, biasOffset})},
+ shaderCache: {inputDependencies: ['type', 'type']},
getRunData: () => ({
outputs: [{dims: outputShape, dataType: qkv.dataType, gpuDataType: GpuDataType.default}],
dispatchGroup: {x: Math.ceil(outputSize / 64 /* workgroup size */)},
+ programUniforms
}),
getShaderSource,
},
diff --git a/js/web/lib/wasm/jsep/webgpu/ops/pad.ts b/js/web/lib/wasm/jsep/webgpu/ops/pad.ts
index 18859e253aa02..eca3fa7d944bb 100644
--- a/js/web/lib/wasm/jsep/webgpu/ops/pad.ts
+++ b/js/web/lib/wasm/jsep/webgpu/ops/pad.ts
@@ -1,15 +1,14 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
-import {DataType} from '../../../wasm-common';
+import {DataType, tensorDataTypeEnumToString} from '../../../wasm-common';
import {TensorView} from '../../tensor-view';
import {ShapeUtil} from '../../util';
-import {AttributeWithCacheKey, createAttributeWithCacheKey} from '../attribute-with-cache-key';
-import {ComputeContext, ProgramInfo} from '../types';
+import {ComputeContext, ProgramInfo, ProgramInputTensorInfoDependency, ProgramUniform} from '../types';
-import {IndicesHelper, inputVariable, outputVariable, ShaderHelper} from './common';
+import {createTensorShapeVariables, getElementAt, IndicesHelper, inputVariable, outputVariable, ShaderHelper, UniformDataElementType, UniformsArrayType} from './common';
-export interface PadAttributes extends AttributeWithCacheKey {
+interface PadAttributes {
// 0-constant, 1-reflect, 2-edge, 3-wrap
readonly mode: number;
readonly value: number;
@@ -35,27 +34,23 @@ const validateInputs = (inputs: readonly TensorView[]): void => {
}
};
-const getPadConstant =
- (output: IndicesHelper, inputDims: readonly number[], inputStrides: readonly number[], pads: number[],
- dataType: string, constantValue: number): string => {
- const inputRank = inputDims.length;
-
- let block = '';
- for (let i = inputRank - 1; i >= 0; --i) {
- block += `
- k = i32(${output.indicesGet('indices', i)}) - ${pads[i]};
+const getPadConstant = (output: IndicesHelper, inputRank: number, padsLength: number): string => {
+ let block = '';
+ for (let i = inputRank - 1; i >= 0; --i) {
+ block += `
+ k = i32(${output.indicesGet('indices', i)}) - ${getElementAt('uniforms.pads', i, padsLength)};
if (k < 0) {
break;
}
- if (k >= ${inputDims[i]}) {
+ if (k >= i32(${getElementAt('uniforms.x_shape', i, inputRank)})) {
break;
}
- offset += k * ${inputStrides[i]};
+ offset += k * i32(${getElementAt('uniforms.x_strides', i, inputRank)});
`;
- }
+ }
- return `
- value = ${dataType}(${constantValue});
+ return `
+ value = ${output.type.value}(uniforms.constant_value);
for (var i = 0; i < 1; i++) {
var offset = 0;
var k = 0;
@@ -63,143 +58,143 @@ const getPadConstant =
value = x[offset];
}
`;
- };
-
-const getPadReflect =
- (output: IndicesHelper, inputDims: readonly number[], inputStrides: readonly number[], pads: number[]): string => {
- const inputRank = inputDims.length;
+};
- let block = '';
- for (let i = inputRank - 1; i >= 0; --i) {
- block += `
- k = i32(${output.indicesGet('indices', i)}) - ${pads[i]};
+const getPadReflect = (output: IndicesHelper, inputRank: number, padsLength: number): string => {
+ let block = '';
+ for (let i = inputRank - 1; i >= 0; --i) {
+ block += `
+ k = i32(${output.indicesGet('indices', i)}) - ${getElementAt('uniforms.pads', i, padsLength)};
if (k < 0) {
k = -k;
}
{
- let _2n_1 = ${2 * (inputDims[i] - 1)};
+ let _2n_1 = 2 * (i32(${getElementAt('uniforms.x_shape', i, inputRank)}) - 1);
k = k % _2n_1;
- if(k >= ${inputDims[i]}) {
+ if(k >= i32(${getElementAt('uniforms.x_shape', i, inputRank)})) {
k = _2n_1 - k;
}
}
- offset += k * ${inputStrides[i]};
+ offset += k * i32(${getElementAt('uniforms.x_strides', i, inputRank)});
`;
- }
+ }
- return `
+ return `
var offset = 0;
var k = 0;
${block}
value = x[offset];
`;
- };
-
-const getPadEdge =
- (output: IndicesHelper, inputDims: readonly number[], inputStrides: readonly number[], pads: number[]): string => {
- const inputRank = inputDims.length;
+};
- let block = '';
- for (let i = inputRank - 1; i >= 0; --i) {
- block += `
- k = i32(${output.indicesGet('indices', i)}) - ${pads[i]};
+const getPadEdge = (output: IndicesHelper, inputRank: number, padsLength: number): string => {
+ let block = '';
+ for (let i = inputRank - 1; i >= 0; --i) {
+ block += `
+ k = i32(${output.indicesGet('indices', i)}) - ${getElementAt('uniforms.pads', i, padsLength)};
if (k < 0) {
k = 0;
}
- if (k >= ${inputDims[i]}) {
- k = ${inputDims[i] - 1};
+ if (k >= i32(${getElementAt('uniforms.x_shape', i, inputRank)})) {
+ k = i32(${getElementAt('uniforms.x_shape', i, inputRank)}) - 1;
}
- offset += k * ${inputStrides[i]};
+ offset += k * i32(${getElementAt('uniforms.x_strides', i, inputRank)});
`;
- }
+ }
- return `
+ return `
var offset = 0;
var k = 0;
${block}
value = x[offset];
`;
- };
-
-const getPadWrap =
- (output: IndicesHelper, inputDims: readonly number[], inputStrides: readonly number[], pads: number[]): string => {
- const inputRank = inputDims.length;
+};
- let block = '';
- for (let i = inputRank - 1; i >= 0; --i) {
- block += `
- k = i32(${output.indicesGet('indices', i)}) - ${pads[i]};
+const getPadWrap = (output: IndicesHelper, inputRank: number, padsLength: number): string => {
+ let block = '';
+ for (let i = inputRank - 1; i >= 0; --i) {
+ block += `
+ k = i32(${output.indicesGet('indices', i)}) - ${getElementAt('uniforms.pads', i, padsLength)};
if (k < 0) {
- k += ${inputDims[i]};
+ k += i32(${getElementAt('uniforms.x_shape', i, inputRank)});
}
- if (k >= ${inputDims[i]}) {
- k -= ${inputDims[i]};
+ if (k >= i32(${getElementAt('uniforms.x_shape', i, inputRank)})) {
+ k -= i32(${getElementAt('uniforms.x_shape', i, inputRank)});
}
- offset += k * ${inputStrides[i]};
+ offset += k * i32(${getElementAt('uniforms.x_strides', i, inputRank)});
`;
- }
+ }
- return `
+ return `
var offset = 0;
var k = 0;
${block}
value = x[offset];
`;
- };
-
-const getPadSnippet =
- (output: IndicesHelper, inputDims: readonly number[], inputStrides: readonly number[], attributes: PadAttributes,
- dataType: string): string => {
- switch (attributes.mode) {
- case 0:
- return getPadConstant(output, inputDims, inputStrides, attributes.pads, dataType, attributes.value);
- case 1:
- return getPadReflect(output, inputDims, inputStrides, attributes.pads);
- case 2:
- return getPadEdge(output, inputDims, inputStrides, attributes.pads);
- case 3:
- return getPadWrap(output, inputDims, inputStrides, attributes.pads);
- default:
- throw new Error('Invalid mode');
- }
- };
-
-const generatePadCode =
- (shaderHelper: ShaderHelper, inputs: readonly TensorView[], attributes: PadAttributes, dataType: string):
- string => {
- const inputDims = inputs[0].dims;
- const outputDims = ShapeUtil.padShape(inputDims.slice(), attributes.pads);
- const outputSize = ShapeUtil.size(outputDims);
- const inputStrides = ShapeUtil.computeStrides(inputDims);
-
- const output = outputVariable('output', inputs[0].dataType, outputDims);
- const input = inputVariable('x', inputs[0].dataType, inputDims);
-
- const padSnippet = getPadSnippet(output, inputDims, inputStrides, attributes, dataType);
- const padCode = `
- ${shaderHelper.declareVariables(input, output)}
- ${shaderHelper.mainStart()}
- ${shaderHelper.guardAgainstOutOfBoundsWorkgroupSizes(outputSize)}
-
- let indices = ${output.offsetToIndices('global_idx')};
-
- var value = ${dataType}(0);
- ${padSnippet}
- output[global_idx] = value;
- }`;
- return padCode;
- };
+};
+
+const getPadSnippet = (output: IndicesHelper, inputRank: number, attributes: PadAttributes): string => {
+ switch (attributes.mode) {
+ case 0:
+ return getPadConstant(output, inputRank, attributes.pads.length);
+ case 1:
+ return getPadReflect(output, inputRank, attributes.pads.length);
+ case 2:
+ return getPadEdge(output, inputRank, attributes.pads.length);
+ case 3:
+ return getPadWrap(output, inputRank, attributes.pads.length);
+ default:
+ throw new Error('Invalid mode');
+ }
+};
const createPadProgramInfo = (inputs: readonly TensorView[], attributes: PadAttributes): ProgramInfo => {
const outputShape = ShapeUtil.padShape(inputs[0].dims.slice(), attributes.pads);
+ const inputDims = inputs[0].dims;
+ const outputSize = ShapeUtil.size(outputShape);
+ const programUniforms: ProgramUniform[] =
+ [{type: 'uint32', data: outputSize}, {type: 'uint32', data: attributes.pads}];
+ if (attributes.mode === 0) {
+ const tensorDataType = tensorDataTypeEnumToString(inputs[0].dataType) as ProgramUniform['type'];
+ programUniforms.push({type: tensorDataType, data: attributes.value});
+ }
+
+ programUniforms.push(...createTensorShapeVariables(inputs[0].dims), ...createTensorShapeVariables(outputShape));
+ const inputDependencies: ProgramInputTensorInfoDependency[] = ['rank'];
+
+ const getShaderSource = (shaderHelper: ShaderHelper) => {
+ const output = outputVariable('output', inputs[0].dataType, outputShape.length);
+ const input = inputVariable('x', inputs[0].dataType, inputDims.length);
+ const dataType = input.type.value;
+ const padSnippet = getPadSnippet(output, inputDims.length, attributes);
+ const uniforms: UniformsArrayType =
+ [{name: 'output_size', type: 'u32'}, {name: 'pads', type: 'i32', length: attributes.pads.length}];
+ if (attributes.mode === 0) {
+ uniforms.push({name: 'constant_value', type: dataType as UniformDataElementType});
+ }
+
+ return `
+ ${shaderHelper.registerUniforms(uniforms).declareVariables(input, output)}
+ ${shaderHelper.mainStart()}
+ ${shaderHelper.guardAgainstOutOfBoundsWorkgroupSizes('uniforms.output_size')}
+
+ let indices = ${output.offsetToIndices('global_idx')};
+
+ var value = ${dataType}(0);
+ ${padSnippet}
+ output[global_idx] = value;
+ }`;
+ };
+
return {
name: 'Pad',
- shaderCache: {hint: attributes.cacheKey},
+ shaderCache: {hint: `${attributes.mode}`, inputDependencies},
getRunData: () => ({
outputs: [{dims: outputShape, dataType: inputs[0].dataType}],
- dispatchGroup: {x: Math.ceil(ShapeUtil.size(outputShape) / 64 /* workgroup size */)}
+ dispatchGroup: {x: Math.ceil(ShapeUtil.size(outputShape) / 64 /* workgroup size */)},
+ programUniforms
}),
- getShaderSource: shaderHelper => generatePadCode(shaderHelper, inputs, attributes, 'f32'),
+ getShaderSource,
};
};
@@ -223,7 +218,7 @@ const createPadAttributesFromInputs = (inputs: readonly TensorView[], attributes
const pads: number[] = [];
updatePads.forEach(v => pads.push(v));
- return createAttributeWithCacheKey({mode: attributes.mode, value, pads});
+ return {mode: attributes.mode, value, pads};
} else {
return attributes;
}
@@ -234,10 +229,3 @@ export const pad = (context: ComputeContext, attributes: PadAttributes): void =>
const updatedAttributes = createPadAttributesFromInputs(context.inputs, attributes);
context.compute(createPadProgramInfo(context.inputs, updatedAttributes), {inputs: [0]});
};
-
-export const parsePadAttributes = (attributes: Record): PadAttributes => {
- const mode = attributes.mode as number;
- const value = attributes.value as number;
- const pads = attributes.pads as number[];
- return createAttributeWithCacheKey({mode, value, pads});
-};
diff --git a/js/web/lib/wasm/jsep/webgpu/ops/range.ts b/js/web/lib/wasm/jsep/webgpu/ops/range.ts
index 9cf66111bf707..ed04b0f94bc57 100644
--- a/js/web/lib/wasm/jsep/webgpu/ops/range.ts
+++ b/js/web/lib/wasm/jsep/webgpu/ops/range.ts
@@ -3,10 +3,10 @@
import {env} from 'onnxruntime-common';
-import {DataType} from '../../../wasm-common';
-import {ComputeContext, ProgramInfo} from '../types';
+import {DataType, tensorDataTypeEnumToString} from '../../../wasm-common';
+import {ComputeContext, ProgramInfo, ProgramUniform} from '../types';
-import {outputVariable, ShaderHelper} from './common';
+import {createTensorShapeVariables, outputVariable, ShaderHelper, UniformDataElementType, UniformsArrayType} from './common';
const validateInputsContent = (start: number, limit: number, delta: number): void => {
const sameStartLimit = start === limit;
@@ -22,23 +22,36 @@ const createRangeProgramInfo = (start: number, limit: number, delta: number, dat
const numElements = Math.abs(Math.ceil((limit - start) / delta));
const outputShape: number[] = [numElements];
const outputSize = numElements;
+ const tensorDataType = tensorDataTypeEnumToString(dataType) as ProgramUniform['type'];
+ const programUniforms: ProgramUniform[] = [
+ {type: 'uint32', data: outputSize}, {type: tensorDataType, data: start}, {type: tensorDataType, data: delta},
+ ...createTensorShapeVariables(outputShape)
+ ];
- const output = outputVariable('output', dataType, outputShape);
- const wgslType = output.type.storage;
-
- const getShaderSource = (shaderHelper: ShaderHelper) => `
- ${shaderHelper.declareVariables(output)}
+ const getShaderSource = (shaderHelper: ShaderHelper) => {
+ const output = outputVariable('output', dataType, outputShape.length);
+ const wgslType = output.type.value;
+ const uniforms: UniformsArrayType = [
+ {name: 'outputSize', type: 'u32'}, {name: 'start', type: wgslType as UniformDataElementType},
+ {name: 'delta', type: wgslType as UniformDataElementType}
+ ];
+ return `
+ ${shaderHelper.registerUniforms(uniforms).declareVariables(output)}
${shaderHelper.mainStart()}
- ${shaderHelper.guardAgainstOutOfBoundsWorkgroupSizes(outputSize)}
- output[global_idx] = ${wgslType}(${start}) + ${wgslType}(global_idx) * ${wgslType}(${delta});
+ ${shaderHelper.guardAgainstOutOfBoundsWorkgroupSizes('uniforms.outputSize')}
+ output[global_idx] = uniforms.start + ${wgslType}(global_idx) * uniforms.delta;
}`;
+ };
+
return {
name: 'Range',
- shaderCache: {hint: [start, limit, delta].map(x => x.toString()).join('_')},
+ shaderCache: {hint: `${dataType}`},
getShaderSource,
- getRunData: () => (
- {outputs: [{dims: outputShape, dataType}],
- dispatchGroup: {x: Math.ceil(outputSize / 64 /* workgroup size */)}})
+ getRunData: () => ({
+ outputs: [{dims: outputShape, dataType}],
+ dispatchGroup: {x: Math.ceil(outputSize / 64 /* workgroup size */)},
+ programUniforms
+ })
};
};
diff --git a/js/web/script/build.ts b/js/web/script/build.ts
index 5151f27582c1f..a52ac4454a5c1 100644
--- a/js/web/script/build.ts
+++ b/js/web/script/build.ts
@@ -44,6 +44,7 @@ const SOURCE_ROOT_FOLDER = path.join(__dirname, '../..'); // /js/
const DEFAULT_DEFINE = {
'BUILD_DEFS.DISABLE_WEBGL': 'false',
'BUILD_DEFS.DISABLE_WEBGPU': 'false',
+ 'BUILD_DEFS.DISABLE_WEBNN': 'false',
'BUILD_DEFS.DISABLE_WASM': 'false',
'BUILD_DEFS.DISABLE_WASM_PROXY': 'false',
'BUILD_DEFS.DISABLE_WASM_THREAD': 'false',
@@ -359,6 +360,7 @@ async function main() {
...DEFAULT_DEFINE,
'BUILD_DEFS.DISABLE_WEBGPU': 'true',
'BUILD_DEFS.DISABLE_WEBGL': 'true',
+ 'BUILD_DEFS.DISABLE_WEBNN': 'true',
'BUILD_DEFS.DISABLE_WASM_PROXY': 'true',
'BUILD_DEFS.DISABLE_WASM_THREAD': 'true',
},
@@ -367,10 +369,7 @@ async function main() {
if (BUNDLE_MODE === 'dev') {
// ort.all.js
- await addBuildTask(buildOrt({
- outputBundleName: 'ort.all',
- format: 'iife',
- }));
+ await addBuildTask(buildOrt({outputBundleName: 'ort.all', format: 'iife', define: {...DEFAULT_DEFINE}}));
}
if (BUNDLE_MODE === 'perf') {
@@ -394,7 +393,7 @@ async function main() {
// ort.webgpu[.min].js
await addAllWebBuildTasks({
outputBundleName: 'ort.webgpu',
- define: {...DEFAULT_DEFINE, 'BUILD_DEFS.DISABLE_WEBGL': 'true'},
+ define: {...DEFAULT_DEFINE, 'BUILD_DEFS.DISABLE_WEBGL': 'true', 'BUILD_DEFS.DISABLE_WEBNN': 'true'},
});
// ort.wasm[.min].js
await addAllWebBuildTasks({
@@ -404,7 +403,12 @@ async function main() {
// ort.webgl[.min].js
await addAllWebBuildTasks({
outputBundleName: 'ort.webgl',
- define: {...DEFAULT_DEFINE, 'BUILD_DEFS.DISABLE_WEBGPU': 'true', 'BUILD_DEFS.DISABLE_WASM': 'true'},
+ define: {
+ ...DEFAULT_DEFINE,
+ 'BUILD_DEFS.DISABLE_WEBGPU': 'true',
+ 'BUILD_DEFS.DISABLE_WASM': 'true',
+ 'BUILD_DEFS.DISABLE_WEBNN': 'true',
+ },
});
// ort.wasm-core[.min].js
await addAllWebBuildTasks({
@@ -413,6 +417,7 @@ async function main() {
...DEFAULT_DEFINE,
'BUILD_DEFS.DISABLE_WEBGPU': 'true',
'BUILD_DEFS.DISABLE_WEBGL': 'true',
+ 'BUILD_DEFS.DISABLE_WEBNN': 'true',
'BUILD_DEFS.DISABLE_WASM_PROXY': 'true',
'BUILD_DEFS.DISABLE_WASM_THREAD': 'true',
},
@@ -425,6 +430,7 @@ async function main() {
'BUILD_DEFS.DISABLE_TRAINING': 'false',
'BUILD_DEFS.DISABLE_WEBGPU': 'true',
'BUILD_DEFS.DISABLE_WEBGL': 'true',
+ 'BUILD_DEFS.DISABLE_WEBNN': 'true',
},
});
}
diff --git a/js/web/script/test-runner-cli-args.ts b/js/web/script/test-runner-cli-args.ts
index ee955ec8d4f17..fc74adfed1fee 100644
--- a/js/web/script/test-runner-cli-args.ts
+++ b/js/web/script/test-runner-cli-args.ts
@@ -79,6 +79,7 @@ Options:
--webgl-texture-cache-mode Set the WebGL texture cache mode (initializerOnly/full)
--webgl-texture-pack-mode Set the WebGL texture pack mode (true/false)
--webgpu-profiling-mode Set the WebGPU profiling mode (off/default)
+ --webnn-device-type Set the WebNN device type (cpu/gpu)
*** Browser Options ***
@@ -174,6 +175,7 @@ export interface TestRunnerCliArgs {
cudaFlags?: Record;
wasmOptions?: InferenceSession.WebAssemblyExecutionProviderOption;
webglOptions?: InferenceSession.WebGLExecutionProviderOption;
+ webnnOptions?: InferenceSession.WebNNExecutionProviderOption;
globalEnvFlags?: Test.Options['globalEnvFlags'];
noSandbox?: boolean;
chromiumFlags: string[];
@@ -335,6 +337,14 @@ function parseWebgpuFlags(args: minimist.ParsedArgs): Partial {
return {profilingMode, validateInputContent};
}
+function parseWebNNOptions(args: minimist.ParsedArgs): InferenceSession.WebNNExecutionProviderOption {
+ const deviceType = args['webnn-device-type'];
+ if (deviceType !== undefined && deviceType !== 'cpu' && deviceType !== 'gpu') {
+ throw new Error('Flag "webnn-device-type" is invalid');
+ }
+ return {name: 'webnn', deviceType};
+}
+
function parseGlobalEnvFlags(args: minimist.ParsedArgs): NonNullable {
const wasm = parseWasmFlags(args);
const webgl = parseWebglFlags(args);
@@ -449,6 +459,7 @@ export function parseTestRunnerCliArgs(cmdlineArgs: string[]): TestRunnerCliArgs
const wasmOptions = parseWasmOptions(args);
const webglOptions = parseWebglOptions(args);
+ const webnnOptions = parseWebNNOptions(args);
// Option: --no-sandbox
const noSandbox = !!args['no-sandbox'];
@@ -487,6 +498,7 @@ export function parseTestRunnerCliArgs(cmdlineArgs: string[]): TestRunnerCliArgs
fileCache,
cpuOptions,
webglOptions,
+ webnnOptions,
wasmOptions,
globalEnvFlags,
noSandbox,
diff --git a/js/web/script/test-runner-cli.ts b/js/web/script/test-runner-cli.ts
index 74a03290332a8..d56792c6e3595 100644
--- a/js/web/script/test-runner-cli.ts
+++ b/js/web/script/test-runner-cli.ts
@@ -165,6 +165,7 @@ async function main() {
debug: args.debug,
cpuOptions: args.cpuOptions,
webglOptions: args.webglOptions,
+ webnnOptions: args.webnnOptions,
wasmOptions: args.wasmOptions,
globalEnvFlags: args.globalEnvFlags
}
@@ -499,7 +500,7 @@ async function main() {
args.bundleMode === 'perf' ? 'perf' :
args.debug ? 'debug' :
'test',
- webgpu, webnn);
+ webgpu);
const karmaArgs = ['karma', 'start', `--browsers ${browser}`];
const chromiumFlags = ['--enable-features=SharedArrayBuffer', ...args.chromiumFlags];
if (args.debug) {
@@ -614,11 +615,10 @@ async function main() {
fs.writeJSONSync(path.join(TEST_ROOT, './testdata-config.json'), config);
}
- function getBrowserNameFromEnv(
- env: TestRunnerCliArgs['env'], mode: 'debug'|'perf'|'test', webgpu: boolean, webnn: boolean) {
+ function getBrowserNameFromEnv(env: TestRunnerCliArgs['env'], mode: 'debug'|'perf'|'test', webgpu: boolean) {
switch (env) {
case 'chrome':
- return selectChromeBrowser(mode, webgpu, webnn);
+ return selectChromeBrowser(mode, webgpu);
case 'edge':
return 'EdgeTest';
case 'firefox':
@@ -634,10 +634,8 @@ async function main() {
}
}
- function selectChromeBrowser(mode: 'debug'|'perf'|'test', webgpu: boolean, webnn: boolean) {
- if (webnn) {
- return 'ChromeCanaryTest';
- } else if (webgpu) {
+ function selectChromeBrowser(mode: 'debug'|'perf'|'test', webgpu: boolean) {
+ if (webgpu) {
return 'ChromeTest';
} else {
switch (mode) {
diff --git a/js/web/test/data/ops/conv.jsonc b/js/web/test/data/ops/conv.jsonc
index 2e8eaaba191d0..cc10df5864233 100644
--- a/js/web/test/data/ops/conv.jsonc
+++ b/js/web/test/data/ops/conv.jsonc
@@ -298,7 +298,157 @@
}
]
},
-
+ {
+ "name": "conv - vectorize group - A",
+ "operator": "Conv",
+ "inputShapeDefinitions": "rankOnly",
+ "opset": { "domain": "", "version": 17 },
+ "attributes": [
+ { "name": "kernel_shape", "data": [1, 1], "type": "ints" },
+ { "name": "group", "data": 2, "type": "int" }
+ ],
+ "cases": [
+ {
+ "name": "T[0]",
+ "inputs": [
+ {
+ "data": [0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0],
+ "dims": [1, 2, 3, 3],
+ "type": "float32"
+ },
+ {
+ "data": [1.0, 2.0],
+ "dims": [2, 1, 1, 1],
+ "type": "float32"
+ }
+ ],
+ "outputs": [
+ {
+ "data": [0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 18.0, 20.0, 22.0, 24.0, 26.0, 28.0, 30.0, 32.0, 34.0],
+ "dims": [1, 2, 3, 3],
+ "type": "float32"
+ }
+ ]
+ }
+ ]
+ },
+ {
+ "name": "conv - vectorize group - B",
+ "operator": "Conv",
+ "inputShapeDefinitions": "rankOnly",
+ "opset": { "domain": "", "version": 17 },
+ "attributes": [
+ { "name": "kernel_shape", "data": [2, 2], "type": "ints" },
+ { "name": "group", "data": 3, "type": "int" }
+ ],
+ "cases": [
+ {
+ "name": "T[0]",
+ "inputs": [
+ {
+ "data": [
+ 0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0, 18.0,
+ 19.0, 20.0, 21.0, 22.0, 23.0, 0, 0, 0
+ ],
+ "dims": [1, 3, 3, 3],
+ "type": "float32"
+ },
+ {
+ "data": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0],
+ "dims": [3, 1, 2, 2],
+ "type": "float32"
+ },
+ {
+ "data": [0.1, 0.2, 0.3],
+ "dims": [3],
+ "type": "float32"
+ }
+ ],
+ "outputs": [
+ {
+ "data": [27.1, 37.1, 57.1, 67.1, 293.2, 319.2, 371.2, 397.2, 847.3, 889.3, 409.3, 428.3],
+ "dims": [1, 3, 2, 2],
+ "type": "float32"
+ }
+ ]
+ }
+ ]
+ },
+ {
+ "name": "conv - vectorize group - C",
+ "operator": "Conv",
+ "inputShapeDefinitions": "rankOnly",
+ "opset": { "domain": "", "version": 17 },
+ "attributes": [
+ { "name": "kernel_shape", "data": [2, 2], "type": "ints" },
+ { "name": "group", "data": 3, "type": "int" }
+ ],
+ "cases": [
+ {
+ "name": "T[0]",
+ "inputs": [
+ {
+ "data": [
+ 0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0, 18.0,
+ 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, 31.0, 32.0, 33.0, 34.0, 35.0
+ ],
+ "dims": [1, 3, 3, 4],
+ "type": "float32"
+ },
+ {
+ "data": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0],
+ "dims": [3, 1, 2, 2],
+ "type": "float32"
+ }
+ ],
+ "outputs": [
+ {
+ "data": [34, 44, 54, 74, 84, 94, 386, 412, 438, 490, 516, 542, 1122, 1164, 1206, 1290, 1332, 1374],
+ "dims": [1, 3, 2, 3],
+ "type": "float32"
+ }
+ ]
+ }
+ ]
+ },
+ {
+ "name": "conv - vectorize group - D",
+ "operator": "Conv",
+ "inputShapeDefinitions": "rankOnly",
+ "opset": { "domain": "", "version": 17 },
+ "attributes": [
+ { "name": "kernel_shape", "data": [2, 2], "type": "ints" },
+ { "name": "group", "data": 3, "type": "int" },
+ { "name": "strides", "data": [2, 2], "type": "ints" }
+ ],
+ "cases": [
+ {
+ "name": "T[0] strides = [2, 2]",
+ "inputs": [
+ {
+ "data": [
+ 0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0, 18.0,
+ 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, 31.0, 32.0, 33.0, 34.0, 35.0
+ ],
+ "dims": [1, 3, 3, 4],
+ "type": "float32"
+ },
+ {
+ "data": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0],
+ "dims": [3, 1, 2, 2],
+ "type": "float32"
+ }
+ ],
+ "outputs": [
+ {
+ "data": [34, 54, 386, 438, 1122, 1206],
+ "dims": [1, 3, 1, 2],
+ "type": "float32"
+ }
+ ]
+ }
+ ]
+ },
{
"name": "conv - pointwise",
"operator": "Conv",
diff --git a/js/web/test/data/ops/instance-norm.jsonc b/js/web/test/data/ops/instance-norm.jsonc
index 6a4e6912405ee..e89ac2da3795f 100644
--- a/js/web/test/data/ops/instance-norm.jsonc
+++ b/js/web/test/data/ops/instance-norm.jsonc
@@ -38,6 +38,79 @@
}
]
},
+ {
+ "name": "Simple test with NHWC, components 1",
+ "operator": "InstanceNormalization",
+ "inputShapeDefinitions": "rankOnly",
+ "opset": { "domain": "", "version": 17 },
+ "cases": [
+ {
+ "name": "Simple test",
+ "inputs": [
+ {
+ "data": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 9, 8, 7, 6, 5],
+ "dims": [1, 5, 3, 1],
+ "type": "float32"
+ },
+ {
+ "data": [1, 2, 3, 4, 5],
+ "dims": [5],
+ "type": "float32"
+ },
+ {
+ "data": [4, 5, 6, 7, 8],
+ "dims": [5],
+ "type": "float32"
+ }
+ ],
+ "outputs": [
+ {
+ "data": [
+ 2.775264263153076, 4, 5.224735260009766, 2.5505285263061523, 5, 7.449470520019531, 2.325794219970703, 6,
+ 9.674205780029297, 11.898944854736328, 7, 2.1010589599609375, 14.123676300048828, 8, 1.876321792602539
+ ],
+ "dims": [1, 5, 3, 1],
+ "type": "float32"
+ }
+ ]
+ }
+ ]
+ },
+ {
+ "name": "Simple test with NHWC, components 2",
+ "operator": "InstanceNormalization",
+ "inputShapeDefinitions": "rankOnly",
+ "opset": { "domain": "", "version": 17 },
+ "cases": [
+ {
+ "name": "Simple test",
+ "inputs": [
+ {
+ "data": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 9, 8],
+ "dims": [2, 6, 1, 1],
+ "type": "float32"
+ },
+ {
+ "data": [1, 2, 3, 4, 5, 6],
+ "dims": [6],
+ "type": "float32"
+ },
+ {
+ "data": [4, 5, 6, 7, 8, 9],
+ "dims": [6],
+ "type": "float32"
+ }
+ ],
+ "outputs": [
+ {
+ "data": [4, 5, 6, 7, 8, 9, 4, 5, 6, 7, 8, 9],
+ "dims": [2, 6, 1, 1],
+ "type": "float32"
+ }
+ ]
+ }
+ ]
+ },
{
"name": "Simple test with NCHW",
"operator": "InstanceNormalization",
@@ -75,5 +148,81 @@
]
}
]
+ },
+ {
+ "name": "Simple test with NCHW, components 1",
+ "operator": "InstanceNormalization",
+ "opset": { "domain": "", "version": 17 },
+ "cases": [
+ {
+ "name": "Simple test",
+ "inputs": [
+ {
+ "data": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 9, 8, 7, 6, 5],
+ "dims": [1, 5, 3, 1],
+ "type": "float32"
+ },
+ {
+ "data": [1, 2, 3, 4, 5],
+ "dims": [5],
+ "type": "float32"
+ },
+ {
+ "data": [4, 5, 6, 7, 8],
+ "dims": [5],
+ "type": "float32"
+ }
+ ],
+ "outputs": [
+ {
+ "data": [
+ 2.775264263153076, 4, 5.224735260009766, 2.5505285263061523, 5, 7.449470520019531, 2.325794219970703, 6,
+ 9.674205780029297, 11.898944854736328, 7, 2.1010589599609375, 14.123676300048828, 8, 1.876321792602539
+ ],
+ "dims": [1, 5, 3, 1],
+ "type": "float32"
+ }
+ ]
+ }
+ ]
+ },
+ {
+ "name": "Simple test with NCHW, components 2",
+ "operator": "InstanceNormalization",
+ "opset": { "domain": "", "version": 17 },
+ "cases": [
+ {
+ "name": "Simple test",
+ "inputs": [
+ {
+ "data": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 9, 8, 7, 6, 5, 4, 3, 2],
+ "dims": [1, 3, 6, 1],
+ "type": "float32"
+ },
+ {
+ "data": [1, 2, 3],
+ "dims": [3],
+ "type": "float32"
+ },
+ {
+ "data": [4, 5, 6],
+ "dims": [3],
+ "type": "float32"
+ }
+ ],
+ "outputs": [
+ {
+ "data": [
+ 2.5361523628234863, 3.1216912269592285, 3.70723032951355, 4.292769432067871, 4.878308296203613,
+ 5.4638471603393555, 1.8666191101074219, 3.9555397033691406, 6.044460296630859, 8.133380889892578,
+ 6.044460296630859, 3.9555397033691406, 10.3915433883667, 8.634925842285156, 6.878308296203613,
+ 5.121691703796387, 3.365074634552002, 1.6084575653076172
+ ],
+ "dims": [1, 3, 6, 1],
+ "type": "float32"
+ }
+ ]
+ }
+ ]
}
]
diff --git a/js/web/test/suite-test-list.jsonc b/js/web/test/suite-test-list.jsonc
index 594ce9feed31e..79f42e36bf390 100644
--- a/js/web/test/suite-test-list.jsonc
+++ b/js/web/test/suite-test-list.jsonc
@@ -1501,85 +1501,424 @@
"webnn": {
"onnx": ["resnet50", "squeezenet", "tiny_yolov2", "emotion_ferplus"],
"node": [
- // Check in node tests that have native Wasm implementations.
- // (i.e.) not tests that rely on the fallback cpu implementations.
- // Use the 'cpu' level of node tests to test those implementations.
+ "test_abs",
+ "test_acos_example",
+ "test_acos",
+ "test_acosh_example",
+ "test_acosh",
+ // // "test_adagrad_multiple",
+ // // "test_adagrad",
+ // // "test_adam_multiple",
+ // // "test_adam",
"test_add_bcast",
+ // "test_add_uint8",
"test_add",
- "test_sub_bcast",
- "test_sub_example",
- "test_sub",
- "test_mul_bcast",
- "test_mul_example",
- "test_mul",
- "test_div_bcast",
- "test_div_example",
- "test_div",
- "test_xor_bcast3v1d",
- "test_xor_bcast3v2d",
- "test_xor_bcast4v2d",
- "test_xor_bcast4v3d",
- "test_xor_bcast4v4d",
- "test_xor2d",
- "test_xor3d",
- "test_xor4d",
- "test_or_bcast3v1d",
- "test_or_bcast3v2d",
- "test_or_bcast4v2d",
- "test_or_bcast4v3d",
- "test_or_bcast4v4d",
- "test_and_bcast3v1d",
- "test_and_bcast3v2d",
- "test_and_bcast4v2d",
- "test_and_bcast4v3d",
- "test_and_bcast4v4d",
- "test_and2d",
- "test_and3d",
- "test_and4d",
- "test_prelu_broadcast",
- "test_prelu_example",
+ // "test_and_bcast3v1d",
+ // "test_and_bcast3v2d",
+ // "test_and_bcast4v2d",
+ // "test_and_bcast4v3d",
+ // "test_and_bcast4v4d",
+ // "test_and2d",
+ // "test_and3d",
+ // "test_and4d",
+ // "test_argmax_default_axis_example_select_last_index",
+ // "test_argmax_default_axis_example",
+ // "test_argmax_default_axis_random_select_last_index",
+ // "test_argmax_default_axis_random",
+ // "test_argmax_keepdims_example_select_last_index",
+ // "test_argmax_keepdims_example",
+ // "test_argmax_keepdims_random_select_last_index",
+ // "test_argmax_keepdims_random",
+ // "test_argmax_negative_axis_keepdims_example_select_last_index",
+ // "test_argmax_negative_axis_keepdims_example",
+ // "test_argmax_negative_axis_keepdims_random_select_last_index",
+ // "test_argmax_negative_axis_keepdims_random",
+ // "test_argmax_no_keepdims_example_select_last_index",
+ // "test_argmax_no_keepdims_example",
+ // "test_argmax_no_keepdims_random_select_last_index",
+ // "test_argmax_no_keepdims_random",
+ // "test_argmin_default_axis_example_select_last_index",
+ // "test_argmin_default_axis_example",
+ // "test_argmin_default_axis_random_select_last_index",
+ // "test_argmin_default_axis_random",
+ // "test_argmin_keepdims_example_select_last_index",
+ // "test_argmin_keepdims_example",
+ // "test_argmin_keepdims_random_select_last_index",
+ // "test_argmin_keepdims_random",
+ // "test_argmin_negative_axis_keepdims_example_select_last_index",
+ // "test_argmin_negative_axis_keepdims_example",
+ // "test_argmin_negative_axis_keepdims_random_select_last_index",
+ // "test_argmin_negative_axis_keepdims_random",
+ // "test_argmin_no_keepdims_example_select_last_index",
+ // "test_argmin_no_keepdims_example",
+ // "test_argmin_no_keepdims_random_select_last_index",
+ // "test_argmin_no_keepdims_random",
+ // "test_asin_example",
+ // "test_asin",
+ // "test_asinh_example",
+ // "test_asinh",
+ // "test_atan_example",
+ // "test_atan",
+ // "test_atanh_example",
+ // "test_atanh",
+ // "test_averagepool_1d_default",
+ // "test_averagepool_2d_ceil",
+ "test_averagepool_2d_default",
+ "test_averagepool_2d_pads_count_include_pad",
+ "test_averagepool_2d_pads",
+ "test_averagepool_2d_precomputed_pads_count_include_pad",
+ "test_averagepool_2d_precomputed_pads",
+ "test_averagepool_2d_precomputed_same_upper",
+ "test_averagepool_2d_precomputed_strides",
+ "test_averagepool_2d_same_lower",
+ "test_averagepool_2d_same_upper",
+ "test_averagepool_2d_strides",
+ // "test_averagepool_3d_default",
"test_basic_conv_with_padding",
"test_basic_conv_without_padding",
- "test_batchnorm_epsilon",
- "test_batchnorm_example",
- "opset{10,11,12}/test_cast_STRING_to_FLOAT",
- "test_clip_splitbounds",
- "test_clip_outbounds",
- "test_clip_inbounds",
- "test_clip_example",
- "test_clip_default_min",
- "test_clip_default_max",
+ // "test_basic_convinteger",
+ // "test_batchnorm_epsilon_training_mode",
+ // "test_batchnorm_epsilon",
+ // "test_batchnorm_example_training_mode",
+ // "test_batchnorm_example",
+ // // "test_bernoulli_double_expanded",
+ // // "test_bernoulli_double",
+ // // "test_bernoulli_expanded",
+ // // "test_bernoulli_seed_expanded",
+ // // "test_bernoulli_seed",
+ // // "test_bernoulli",
+ // // "test_bitshift_left_uint16",
+ // // "test_bitshift_left_uint32",
+ // // "test_bitshift_left_uint64",
+ // // "test_bitshift_left_uint8",
+ // // "test_bitshift_right_uint16",
+ // // "test_bitshift_right_uint32",
+ // // "test_bitshift_right_uint64",
+ // // "test_bitshift_right_uint8",
+ // // "test_blackmanwindow_expanded",
+ // // "test_blackmanwindow_symmetric_expanded",
+ // // "test_blackmanwindow_symmetric",
+ // // "test_blackmanwindow",
+ // // "test_cast_BFLOAT16_to_FLOAT",
+ // // "test_cast_DOUBLE_to_FLOAT",
+ // // "test_cast_DOUBLE_to_FLOAT16",
+ // // "test_cast_FLOAT_to_BFLOAT16",
+ // // "test_cast_FLOAT_to_DOUBLE",
+ // // "test_cast_FLOAT_to_FLOAT16",
+ // // "test_cast_FLOAT_to_STRING",
+ // // "test_cast_FLOAT16_to_DOUBLE",
+ // // "test_cast_FLOAT16_to_FLOAT",
+ // // "test_cast_STRING_to_FLOAT",
+ // // "test_castlike_BFLOAT16_to_FLOAT_expanded",
+ // // "test_castlike_BFLOAT16_to_FLOAT",
+ // // "test_castlike_DOUBLE_to_FLOAT_expanded",
+ // // "test_castlike_DOUBLE_to_FLOAT",
+ // // "test_castlike_DOUBLE_to_FLOAT16_expanded",
+ // // "test_castlike_DOUBLE_to_FLOAT16",
+ // // "test_castlike_FLOAT_to_BFLOAT16_expanded",
+ // // "test_castlike_FLOAT_to_BFLOAT16",
+ // // "test_castlike_FLOAT_to_DOUBLE_expanded",
+ // // "test_castlike_FLOAT_to_DOUBLE",
+ // // "test_castlike_FLOAT_to_FLOAT16_expanded",
+ // // "test_castlike_FLOAT_to_FLOAT16",
+ // // "test_castlike_FLOAT_to_STRING_expanded",
+ // // "test_castlike_FLOAT_to_STRING",
+ // // "test_castlike_FLOAT16_to_DOUBLE_expanded",
+ // // "test_castlike_FLOAT16_to_DOUBLE",
+ // // "test_castlike_FLOAT16_to_FLOAT_expanded",
+ // // "test_castlike_FLOAT16_to_FLOAT",
+ // // "test_castlike_STRING_to_FLOAT_expanded",
+ // // "test_castlike_STRING_to_FLOAT",
+ "test_ceil_example",
+ "test_ceil",
+ // "test_celu_expanded",
+ // "test_celu",
"test_clip_default_inbounds",
+ "test_clip_default_int8_inbounds",
+ "test_clip_default_int8_max",
+ "test_clip_default_int8_min",
+ "test_clip_default_max",
+ "test_clip_default_min",
+ "test_clip_example",
+ "test_clip_inbounds",
+ "test_clip_outbounds",
+ "test_clip_splitbounds",
"test_clip",
+ // // "test_compress_0",
+ // // "test_compress_1",
+ // // "test_compress_default_axis",
+ // // "test_compress_negative_axis",
+ "test_concat_1d_axis_0",
+ "test_concat_1d_axis_negative_1",
+ "test_concat_2d_axis_0",
+ "test_concat_2d_axis_1",
+ "test_concat_2d_axis_negative_1",
+ "test_concat_2d_axis_negative_2",
+ "test_concat_3d_axis_0",
+ "test_concat_3d_axis_1",
+ "test_concat_3d_axis_2",
+ "test_concat_3d_axis_negative_1",
+ "test_concat_3d_axis_negative_2",
+ "test_concat_3d_axis_negative_3",
+ "test_conv_with_autopad_same",
"test_conv_with_strides_and_asymmetric_padding",
"test_conv_with_strides_no_padding",
"test_conv_with_strides_padding",
- "test_gemm_nobroadcast",
+ // // "test_convinteger_with_padding",
+ // // "test_convinteger_without_padding",
+ // "test_convtranspose_1d",
+ // // "test_convtranspose_3d",
+ // "test_convtranspose_autopad_same",
+ "test_convtranspose_dilations",
+ "test_convtranspose_kernel_shape",
+ "opset{9,17}/test_convtranspose_output_shape",
+ "test_convtranspose_pad",
+ "test_convtranspose_pads",
+ "test_convtranspose_with_kernel",
+ "test_convtranspose",
+ "test_cos_example",
+ "test_cos",
+ // "test_cosh_example",
+ // "test_cosh",
+ // "test_cumsum_1d_exclusive",
+ // "test_cumsum_1d_reverse_exclusive",
+ // "test_cumsum_1d_reverse",
+ // "test_cumsum_1d",
+ // "test_cumsum_2d_axis_0",
+ // "test_cumsum_2d_axis_1",
+ // "test_cumsum_2d_negative_axis",
+ // "test_depthtospace_crd_mode_example",
+ // "test_depthtospace_crd_mode",
+ // "test_depthtospace_dcr_mode",
+ // "test_depthtospace_example",
+ // "test_depthtospace",
+ // // "test_dequantizelinear_axis",
+ // // "test_dequantizelinear",
+ // // "test_det_2d",
+ // // "test_det_nd",
+ // // "test_dft_axis",
+ // // "test_dft_inverse",
+ // // "test_dft",
+ "test_div_bcast",
+ "test_div_example",
+ // "test_div_uint8",
+ "test_div",
+ // // "test_dropout_default_mask_ratio",
+ // // "test_dropout_default_mask",
+ // // "test_dropout_default_old",
+ // // "test_dropout_default_ratio",
+ // // "test_dropout_default",
+ // // "test_dropout_random_old",
+ // // "test_dropout_random",
+ // // "test_dynamic_slice_default_axes",
+ // // "test_dynamic_slice_end_out_of_bounds",
+ // // "test_dynamic_slice_neg",
+ // // "test_dynamic_slice_start_out_of_bounds",
+ // // "test_dynamic_slice",
+ // // "test_dynamicquantizelinear_expanded",
+ // // "test_dynamicquantizelinear_max_adjusted_expanded",
+ // // "test_dynamicquantizelinear_max_adjusted",
+ // // "test_dynamicquantizelinear_min_adjusted_expanded",
+ // // "test_dynamicquantizelinear_min_adjusted",
+ // // "test_dynamicquantizelinear",
+ // "test_edge_pad",
+ // "test_einsum_batch_diagonal",
+ // "test_einsum_batch_matmul",
+ // "test_einsum_inner_prod",
+ // "test_einsum_sum",
+ // "test_einsum_transpose",
+ "test_elu_default",
+ "test_elu_example",
+ "test_elu",
+ // "test_equal_bcast",
+ // "test_equal",
+ // "test_erf",
+ "test_exp_example",
+ "test_exp",
+ // "test_expand_dim_changed",
+ // "test_expand_dim_unchanged",
+ // "test_eyelike_populate_off_main_diagonal",
+ // "test_eyelike_with_dtype",
+ // "test_eyelike_without_dtype",
+ "test_flatten_axis0",
+ "test_flatten_axis1",
+ "test_flatten_axis2",
+ "test_flatten_axis3",
+ "test_flatten_default_axis",
+ "test_flatten_negative_axis1",
+ "test_flatten_negative_axis2",
+ "test_flatten_negative_axis3",
+ "test_flatten_negative_axis4",
+ "test_floor_example",
+ "test_floor",
+ // "test_gather_0",
+ // "test_gather_1",
+ // "test_gather_2d_indices",
+ // "test_gather_negative_indices",
+ // "test_gather_elements_0",
+ // "test_gather_elements_1",
+ // "test_gather_elements_negative_indices",
+ // "test_gather_negative_indices",
+ // "test_gathernd_example_float32",
+ // "test_gathernd_example_int32_batch_dim1",
+ // "test_gathernd_example_int32",
+ "test_gemm_all_attributes",
+ "test_gemm_alpha",
+ "test_gemm_beta",
"test_gemm_broadcast",
- "test_matmul_2d",
- "test_matmul_3d",
- "test_matmul_4d",
- "test_softmax_axis_0",
- "test_softmax_axis_1",
- "test_softmax_axis_2",
- "test_softmax_default_axis",
- "test_softmax_example",
- "test_softmax_large_number",
- "test_sum_example",
- "test_sum_one_input",
- "test_sum_two_inputs",
- "test_averagepool_1d_default",
- "test_averagepool_2d_default",
- "test_averagepool_2d_pads",
- "test_averagepool_2d_precomputed_pads",
- "test_averagepool_2d_precomputed_same_upper",
- "test_averagepool_2d_precomputed_strides",
- "test_averagepool_2d_same_upper",
- "test_averagepool_2d_same_lower",
- "test_averagepool_2d_strides",
- "test_averagepool_3d_default",
- "test_maxpool_1d_default",
+ "test_gemm_default_matrix_bias",
+ "test_gemm_default_no_bias",
+ // "test_gemm_default_scalar_bias",
+ "test_gemm_default_single_elem_vector_bias",
+ "test_gemm_default_vector_bias",
+ "test_gemm_default_zero_bias",
+ "test_gemm_nobroadcast",
+ "test_gemm_transposeA",
+ "test_gemm_transposeB",
+ "test_globalaveragepool_precomputed",
+ "test_globalaveragepool",
+ // "test_globalmaxpool_precomputed",
+ // "test_globalmaxpool",
+ // "test_greater_bcast",
+ // "test_greater_equal_bcast_expanded",
+ // "test_greater_equal_bcast",
+ // "test_greater_equal_expanded",
+ // "test_greater_equal",
+ // "test_greater",
+ // // "test_gridsample_aligncorners_true",
+ // // "test_gridsample_bicubic",
+ // // "test_gridsample_bilinear",
+ // // "test_gridsample_border_padding",
+ // // "test_gridsample_nearest",
+ // // "test_gridsample_reflection_padding",
+ // // "test_gridsample_zeros_padding",
+ // // "test_gridsample",
+ // // "test_gru_batchwise",
+ // // "test_gru_defaults",
+ // // "test_gru_seq_length",
+ // // "test_gru_with_initial_bias",
+ // // "test_hammingwindow_expanded",
+ // // "test_hammingwindow_symmetric_expanded",
+ // // "test_hammingwindow_symmetric",
+ // // "test_hammingwindow",
+ // // "test_hannwindow_expanded",
+ // // "test_hannwindow_symmetric_expanded",
+ // // "test_hannwindow_symmetric",
+ // // "test_hannwindow",
+ // // "test_hardmax_axis_0",
+ // // "test_hardmax_axis_1",
+ // // "test_hardmax_axis_2",
+ // // "test_hardmax_default_axis",
+ // // "test_hardmax_example",
+ // // "test_hardmax_negative_axis",
+ // // "test_hardmax_one_hot",
+ // // "test_hardsigmoid_default",
+ // // "test_hardsigmoid_example",
+ // // "test_hardsigmoid",
+ // "test_hardswish_expanded",
+ "test_hardswish",
+ // "test_if",
+ // TODO: Uncomment 'test_if_seq' and 'test_if_opt' once the test infra
+ // supports Sequence and Optional types
+ // "test_if_seq",
+ // "test_if_opt",
+ "test_instancenorm_epsilon",
+ // "test_instancenorm_example",
+ // "test_isinf_negative",
+ // "test_isinf_positive",
+ // "test_isinf",
+ // "test_isnan",
+ // "test_layer_normalization_2d_axis_negative_1_expanded",
+ // "test_layer_normalization_2d_axis_negative_1",
+ // "test_layer_normalization_2d_axis_negative_2_expanded",
+ // "test_layer_normalization_2d_axis_negative_2",
+ // "test_layer_normalization_2d_axis0_expanded",
+ // "test_layer_normalization_2d_axis0",
+ // "test_layer_normalization_2d_axis1_expanded",
+ // "test_layer_normalization_2d_axis1",
+ // // "test_layer_normalization_3d_axis_negative_1_epsilon_expanded",
+ // "test_layer_normalization_3d_axis_negative_1_epsilon",
+ // // "test_layer_normalization_3d_axis_negative_2_epsilon_expanded",
+ // "test_layer_normalization_3d_axis_negative_2_epsilon",
+ // // "test_layer_normalization_3d_axis_negative_3_epsilon_expanded",
+ // "test_layer_normalization_3d_axis_negative_3_epsilon",
+ // // "test_layer_normalization_3d_axis0_epsilon_expanded",
+ // "test_layer_normalization_3d_axis0_epsilon",
+ // "test_layer_normalization_3d_axis1_epsilon_expanded",
+ // "test_layer_normalization_3d_axis1_epsilon",
+ // // "test_layer_normalization_3d_axis2_epsilon_expanded",
+ // "test_layer_normalization_3d_axis2_epsilon",
+ // "test_layer_normalization_4d_axis_negative_1_expanded",
+ // "test_layer_normalization_4d_axis_negative_1",
+ // // "test_layer_normalization_4d_axis_negative_2_expanded",
+ // "test_layer_normalization_4d_axis_negative_2",
+ // "test_layer_normalization_4d_axis_negative_3_expanded",
+ // "test_layer_normalization_4d_axis_negative_3",
+ // "test_layer_normalization_4d_axis_negative_4_expanded",
+ // "test_layer_normalization_4d_axis_negative_4",
+ // "test_layer_normalization_4d_axis0_expanded",
+ // "test_layer_normalization_4d_axis0",
+ // "test_layer_normalization_4d_axis1_expanded",
+ // "test_layer_normalization_4d_axis1",
+ // "test_layer_normalization_4d_axis2_expanded",
+ // "test_layer_normalization_4d_axis2",
+ // "test_layer_normalization_4d_axis3_expanded",
+ // "test_layer_normalization_4d_axis3",
+ // "test_layer_normalization_default_axis_expanded",
+ // "test_layer_normalization_default_axis",
+ "test_leakyrelu_default",
+ "test_leakyrelu_example",
+ "test_leakyrelu",
+ // "test_less_bcast",
+ // "test_less_equal_bcast_expanded",
+ // "test_less_equal_bcast",
+ // "test_less_equal_expanded",
+ // "test_less_equal",
+ // "test_less",
+ "test_log_example",
+ "test_log",
+ // // "test_logsoftmax_axis_0_expanded",
+ // // "test_logsoftmax_axis_0",
+ // // "test_logsoftmax_axis_1_expanded",
+ // // "test_logsoftmax_axis_1",
+ // // "test_logsoftmax_axis_2_expanded",
+ // // "test_logsoftmax_axis_2",
+ // // "test_logsoftmax_default_axis_expanded",
+ // // "test_logsoftmax_default_axis",
+ // // "test_logsoftmax_example_1_expanded",
+ // // "test_logsoftmax_example_1",
+ // // "test_logsoftmax_large_number_expanded",
+ // // "test_logsoftmax_large_number",
+ // // "test_logsoftmax_negative_axis_expanded",
+ // // "test_logsoftmax_negative_axis",
+ // "test_lrn_default",
+ // "test_lrn",
+ // // "test_lstm_batchwise",
+ // // "test_lstm_defaults",
+ // // "test_lstm_with_initial_bias",
+ // // "test_lstm_with_peepholes",
+ // "test_matmul_2d",
+ // "test_matmul_3d",
+ // "test_matmul_4d",
+ // // "test_matmulinteger",
+ // "test_max_example",
+ // "test_max_float16",
+ // "test_max_float32",
+ // "test_max_float64",
+ // "test_max_int16",
+ // "test_max_int32",
+ // "test_max_int64",
+ // "test_max_int8",
+ // "test_max_one_input",
+ // "test_max_two_inputs",
+ // "test_max_uint16",
+ // "test_max_uint32",
+ // "test_max_uint64",
+ // "test_max_uint8",
+ // "test_maxpool_1d_default",
+ // "test_maxpool_2d_ceil",
"test_maxpool_2d_default",
+ // "test_maxpool_2d_dilations",
"test_maxpool_2d_pads",
"test_maxpool_2d_precomputed_pads",
"test_maxpool_2d_precomputed_same_upper",
@@ -1587,13 +1926,622 @@
"test_maxpool_2d_same_lower",
"test_maxpool_2d_same_upper",
"test_maxpool_2d_strides",
- "test_maxpool_3d_default",
- "test_globalaveragepool_precomputed",
- "test_globalaveragepool",
- "test_globalmaxpool_precomputed",
- "test_globalmaxpool",
- "test_instancenorm_epsilon",
- "test_instancenorm_example"
+ // "test_maxpool_2d_uint8",
+ // "test_maxpool_3d_default",
+ // "test_maxpool_with_argmax_2d_precomputed_pads",
+ // "test_maxpool_with_argmax_2d_precomputed_strides",
+ // // "test_maxunpool_export_with_output_shape",
+ // // "test_maxunpool_export_without_output_shape",
+ // // "test_mean_example",
+ // // "test_mean_one_input",
+ // // "test_mean_two_inputs",
+ // // "test_melweightmatrix",
+ // "test_min_example",
+ // "test_min_float16",
+ // "test_min_float32",
+ // "test_min_float64",
+ // "test_min_int16",
+ // "test_min_int32",
+ // "test_min_int64",
+ // "test_min_int8",
+ // "test_min_one_input",
+ // "test_min_two_inputs",
+ // "test_min_uint16",
+ // "test_min_uint32",
+ // "test_min_uint64",
+ // "test_min_uint8",
+ // "test_mod_bcast",
+ // "test_mod_broadcast",
+ // "test_mod_float_mixed_sign_example",
+ // "test_mod_fmod_mixed_sign_example",
+ // "test_mod_int64_fmod",
+ // "test_mod_int64_mixed_sign_example",
+ // "test_mod_mixed_sign_float16",
+ // "test_mod_mixed_sign_float32",
+ // "test_mod_mixed_sign_float64",
+ // "test_mod_mixed_sign_int16",
+ // "test_mod_mixed_sign_int32",
+ // "test_mod_mixed_sign_int64",
+ // "test_mod_mixed_sign_int8",
+ // "test_mod_uint16",
+ // "test_mod_uint32",
+ // "test_mod_uint64",
+ // "test_mod_uint8",
+ // // "test_momentum_multiple",
+ // // "test_momentum",
+ "test_mul_bcast",
+ "test_mul_example",
+ // "test_mul_uint8",
+ "test_mul",
+ // "test_mvn_expanded",
+ // "test_mvn",
+ "test_neg_example",
+ "test_neg",
+ // // "test_negative_log_likelihood_loss_iinput_shape_is_NCd1_weight_ignore_index_expanded",
+ // // "test_negative_log_likelihood_loss_iinput_shape_is_NCd1_weight_ignore_index",
+ // // "test_negative_log_likelihood_loss_input_shape_is_NC_expanded",
+ // // "test_negative_log_likelihood_loss_input_shape_is_NC",
+ // // "test_negative_log_likelihood_loss_input_shape_is_NCd1_expanded",
+ // // "test_negative_log_likelihood_loss_input_shape_is_NCd1_ignore_index_expanded",
+ // // "test_negative_log_likelihood_loss_input_shape_is_NCd1_ignore_index",
+ // // "test_negative_log_likelihood_loss_input_shape_is_NCd1_mean_weight_negative_ignore_index_expanded",
+ // // "test_negative_log_likelihood_loss_input_shape_is_NCd1_mean_weight_negative_ignore_index",
+ // // "test_negative_log_likelihood_loss_input_shape_is_NCd1_weight_expanded",
+ // // "test_negative_log_likelihood_loss_input_shape_is_NCd1_weight",
+ // // "test_negative_log_likelihood_loss_input_shape_is_NCd1",
+ // // "test_negative_log_likelihood_loss_input_shape_is_NCd1d2_expanded",
+ // // "test_negative_log_likelihood_loss_input_shape_is_NCd1d2_no_weight_reduction_mean_ignore_index_expanded",
+ // // "test_negative_log_likelihood_loss_input_shape_is_NCd1d2_no_weight_reduction_mean_ignore_index",
+ // // "test_negative_log_likelihood_loss_input_shape_is_NCd1d2_reduction_mean_expanded",
+ // // "test_negative_log_likelihood_loss_input_shape_is_NCd1d2_reduction_mean",
+ // // "test_negative_log_likelihood_loss_input_shape_is_NCd1d2_reduction_sum_expanded",
+ // // "test_negative_log_likelihood_loss_input_shape_is_NCd1d2_reduction_sum",
+ // // "test_negative_log_likelihood_loss_input_shape_is_NCd1d2_with_weight_expanded",
+ // // "test_negative_log_likelihood_loss_input_shape_is_NCd1d2_with_weight_reduction_mean_expanded",
+ // // "test_negative_log_likelihood_loss_input_shape_is_NCd1d2_with_weight_reduction_mean",
+ // // "test_negative_log_likelihood_loss_input_shape_is_NCd1d2_with_weight_reduction_sum_expanded",
+ // // "test_negative_log_likelihood_loss_input_shape_is_NCd1d2_with_weight_reduction_sum_ignore_index_expanded",
+ // // "test_negative_log_likelihood_loss_input_shape_is_NCd1d2_with_weight_reduction_sum_ignore_index",
+ // // "test_negative_log_likelihood_loss_input_shape_is_NCd1d2_with_weight_reduction_sum",
+ // // "test_negative_log_likelihood_loss_input_shape_is_NCd1d2_with_weight",
+ // // "test_negative_log_likelihood_loss_input_shape_is_NCd1d2",
+ // // "test_negative_log_likelihood_loss_input_shape_is_NCd1d2d3_none_no_weight_negative_ignore_index_expanded",
+ // // "test_negative_log_likelihood_loss_input_shape_is_NCd1d2d3_none_no_weight_negative_ignore_index",
+ // // "test_negative_log_likelihood_loss_input_shape_is_NCd1d2d3_sum_weight_high_ignore_index_expanded",
+ // // "test_negative_log_likelihood_loss_input_shape_is_NCd1d2d3_sum_weight_high_ignore_index",
+ // // "test_negative_log_likelihood_loss_input_shape_is_NCd1d2d3d4d5_mean_weight_expanded",
+ // // "test_negative_log_likelihood_loss_input_shape_is_NCd1d2d3d4d5_mean_weight",
+ // // "test_negative_log_likelihood_loss_input_shape_is_NCd1d2d3d4d5_none_no_weight_expanded",
+ // // "test_negative_log_likelihood_loss_input_shape_is_NCd1d2d3d4d5_none_no_weight",
+ // // "test_nesterov_momentum",
+ // // "test_nllloss_NC_expanded",
+ // // "test_nllloss_NC",
+ // // "test_nllloss_NCd1_expanded",
+ // // "test_nllloss_NCd1_ii_expanded",
+ // // "test_nllloss_NCd1_ii",
+ // // "test_nllloss_NCd1_mean_weight_negative_ii_expanded",
+ // // "test_nllloss_NCd1_mean_weight_negative_ii",
+ // // "test_nllloss_NCd1_weight_expanded",
+ // // "test_nllloss_NCd1_weight_ii_expanded",
+ // // "test_nllloss_NCd1_weight_ii",
+ // // "test_nllloss_NCd1_weight",
+ // // "test_nllloss_NCd1",
+ // // "test_nllloss_NCd1d2_expanded",
+ // // "test_nllloss_NCd1d2_no_weight_reduction_mean_ii_expanded",
+ // // "test_nllloss_NCd1d2_no_weight_reduction_mean_ii",
+ // // "test_nllloss_NCd1d2_reduction_mean_expanded",
+ // // "test_nllloss_NCd1d2_reduction_mean",
+ // // "test_nllloss_NCd1d2_reduction_sum_expanded",
+ // // "test_nllloss_NCd1d2_reduction_sum",
+ // // "test_nllloss_NCd1d2_with_weight_expanded",
+ // // "test_nllloss_NCd1d2_with_weight_reduction_mean_expanded",
+ // // "test_nllloss_NCd1d2_with_weight_reduction_mean",
+ // // "test_nllloss_NCd1d2_with_weight_reduction_sum_expanded",
+ // // "test_nllloss_NCd1d2_with_weight_reduction_sum_ii_expanded",
+ // // "test_nllloss_NCd1d2_with_weight_reduction_sum_ii",
+ // // "test_nllloss_NCd1d2_with_weight_reduction_sum",
+ // // "test_nllloss_NCd1d2_with_weight",
+ // // "test_nllloss_NCd1d2",
+ // // "test_nllloss_NCd1d2d3_none_no_weight_negative_ii_expanded",
+ // // "test_nllloss_NCd1d2d3_none_no_weight_negative_ii",
+ // // "test_nllloss_NCd1d2d3_sum_weight_high_ii_expanded",
+ // // "test_nllloss_NCd1d2d3_sum_weight_high_ii",
+ // // "test_nllloss_NCd1d2d3d4d5_mean_weight_expanded",
+ // // "test_nllloss_NCd1d2d3d4d5_mean_weight",
+ // // "test_nllloss_NCd1d2d3d4d5_none_no_weight_expanded",
+ // // "test_nllloss_NCd1d2d3d4d5_none_no_weight",
+ // "test_nonmaxsuppression_center_point_box_format",
+ // "test_nonmaxsuppression_flipped_coordinates",
+ // "test_nonmaxsuppression_identical_boxes",
+ // "test_nonmaxsuppression_limit_output_size",
+ // "test_nonmaxsuppression_single_box",
+ // "test_nonmaxsuppression_suppress_by_IOU_and_scores",
+ // "test_nonmaxsuppression_suppress_by_IOU",
+ // "test_nonmaxsuppression_two_batches",
+ // "test_nonmaxsuppression_two_classes",
+ // "test_nonzero_example",
+ // "test_not_2d",
+ // "test_not_3d",
+ // "test_not_4d",
+ // // "test_onehot_negative_indices",
+ // // "test_onehot_with_axis",
+ // // "test_onehot_with_negative_axis",
+ // // "test_onehot_without_axis",
+ // // "test_optional_get_element_sequence",
+ // // "test_optional_get_element",
+ // // "test_optional_has_element_empty",
+ // // "test_optional_has_element",
+ // "test_or_bcast3v1d",
+ // "test_or_bcast3v2d",
+ // "test_or_bcast4v2d",
+ // "test_or_bcast4v3d",
+ // "test_or_bcast4v4d",
+ // "test_or2d",
+ // "test_or3d",
+ // "test_or4d",
+ "test_pow_bcast_array",
+ "test_pow_bcast_scalar",
+ "test_pow_example",
+ // "test_pow_types_float",
+ // "test_pow_types_float32_int32",
+ // "test_pow_types_float32_int64",
+ // "test_pow_types_float32_uint32",
+ // "test_pow_types_float32_uint64",
+ // "test_pow_types_int",
+ // "test_pow_types_int32_float32",
+ // "test_pow_types_int32_int32",
+ // "test_pow_types_int64_float32",
+ // "test_pow_types_int64_int64",
+ "test_pow",
+ // "test_prelu_broadcast",
+ // "test_prelu_example",
+ // // "test_qlinearconv",
+ // // "test_qlinearmatmul_2D",
+ // // "test_qlinearmatmul_3D",
+ // // "test_quantizelinear_axis",
+ // // "test_quantizelinear",
+ // "test_range_float_type_positive_delta_expanded",
+ // "test_range_float_type_positive_delta",
+ // "test_range_int32_type_negative_delta_expanded",
+ // "test_range_int32_type_negative_delta",
+ // "test_reciprocal_example",
+ // "test_reciprocal",
+ // "test_reduce_l1_default_axes_keepdims_example",
+ // "test_reduce_l1_default_axes_keepdims_random",
+ // "test_reduce_l1_do_not_keepdims_example",
+ // "test_reduce_l1_do_not_keepdims_random",
+ // "test_reduce_l1_keep_dims_example",
+ // "test_reduce_l1_keep_dims_random",
+ // "test_reduce_l1_negative_axes_keep_dims_example",
+ // "test_reduce_l1_negative_axes_keep_dims_random",
+ // "test_reduce_l2_default_axes_keepdims_example",
+ // "test_reduce_l2_default_axes_keepdims_random",
+ // "test_reduce_l2_do_not_keepdims_example",
+ // "test_reduce_l2_do_not_keepdims_random",
+ // "test_reduce_l2_keep_dims_example",
+ // "test_reduce_l2_keep_dims_random",
+ // "test_reduce_l2_negative_axes_keep_dims_example",
+ // "test_reduce_l2_negative_axes_keep_dims_random",
+ // "test_reduce_log_sum_asc_axes",
+ // "test_reduce_log_sum_default",
+ // "test_reduce_log_sum_desc_axes",
+ // tests "test_reduce_log_sum_exp_*" on opset17/opset18 are excluded because they use float64.
+ // "opset{7,8,9}/test_reduce_log_sum_exp_default_axes_keepdims_example",
+ // "opset{7,8,9}/test_reduce_log_sum_exp_default_axes_keepdims_random",
+ // "opset{7,8,9}/test_reduce_log_sum_exp_do_not_keepdims_example",
+ // "opset{7,8,9}/test_reduce_log_sum_exp_do_not_keepdims_random",
+ // "opset{7,8,9}/test_reduce_log_sum_exp_keepdims_example",
+ // "opset{7,8,9}/test_reduce_log_sum_exp_keepdims_random",
+ // "opset11/test_reduce_log_sum_exp_negative_axes_keepdims_example",
+ // "opset11/test_reduce_log_sum_exp_negative_axes_keepdims_random",
+ // "test_reduce_log_sum_negative_axes",
+ // "test_reduce_log_sum",
+ // "test_reduce_max_default_axes_keepdim_example",
+ // "test_reduce_max_default_axes_keepdims_random",
+ // "test_reduce_max_do_not_keepdims_example",
+ // "test_reduce_max_do_not_keepdims_random",
+ // "test_reduce_max_keepdims_example",
+ // "test_reduce_max_keepdims_random",
+ // "test_reduce_max_negative_axes_keepdims_example",
+ // "test_reduce_max_negative_axes_keepdims_random",
+ // "test_reduce_mean_default_axes_keepdims_example",
+ // "test_reduce_mean_default_axes_keepdims_random",
+ // "test_reduce_mean_do_not_keepdims_example",
+ // "test_reduce_mean_do_not_keepdims_random",
+ // "test_reduce_mean_keepdims_example",
+ // "test_reduce_mean_keepdims_random",
+ // "test_reduce_mean_negative_axes_keepdims_example",
+ // "test_reduce_mean_negative_axes_keepdims_random",
+ // "test_reduce_min_default_axes_keepdims_example",
+ // "test_reduce_min_default_axes_keepdims_random",
+ // "test_reduce_min_do_not_keepdims_example",
+ // "test_reduce_min_do_not_keepdims_random",
+ // "test_reduce_min_keepdims_example",
+ // "test_reduce_min_keepdims_random",
+ // "test_reduce_min_negative_axes_keepdims_example",
+ // "test_reduce_min_negative_axes_keepdims_random",
+ // "test_reduce_prod_default_axes_keepdims_example",
+ // "test_reduce_prod_default_axes_keepdims_random",
+ // "test_reduce_prod_do_not_keepdims_example",
+ // "test_reduce_prod_do_not_keepdims_random",
+ // "test_reduce_prod_keepdims_example",
+ // "test_reduce_prod_keepdims_random",
+ // "test_reduce_prod_negative_axes_keepdims_example",
+ // "test_reduce_prod_negative_axes_keepdims_random",
+ // "test_reduce_sum_default_axes_keepdims_example",
+ // "test_reduce_sum_default_axes_keepdims_random",
+ // "test_reduce_sum_do_not_keepdims_example",
+ // "test_reduce_sum_do_not_keepdims_random",
+ // "test_reduce_sum_empty_axes_input_noop_example",
+ // "test_reduce_sum_empty_axes_input_noop_random",
+ // "test_reduce_sum_keepdims_example",
+ // "test_reduce_sum_keepdims_random",
+ // "test_reduce_sum_negative_axes_keepdims_example",
+ // "test_reduce_sum_negative_axes_keepdims_random",
+ // "test_reduce_sum_square_default_axes_keepdims_example",
+ // "test_reduce_sum_square_default_axes_keepdims_random",
+ // "test_reduce_sum_square_do_not_keepdims_example",
+ // "test_reduce_sum_square_do_not_keepdims_random",
+ // "test_reduce_sum_square_keepdims_example",
+ // "test_reduce_sum_square_keepdims_random",
+ // "test_reduce_sum_square_negative_axes_keepdims_example",
+ // "test_reduce_sum_square_negative_axes_keepdims_random",
+ // "test_reflect_pad",
+ "test_relu",
+ // "test_reshape_allowzero_reordered",
+ // "test_reshape_extended_dims",
+ // "test_reshape_negative_dim",
+ // "test_reshape_negative_extended_dims",
+ // "test_reshape_one_dim",
+ // "test_reshape_reduced_dims",
+ // "test_reshape_reordered_all_dims",
+ // "test_reshape_reordered_dims",
+ // "test_reshape_reordered_last_dims",
+ // "test_reshape_zero_and_negative_dim",
+ // "test_reshape_zero_dim",
+ // "test_resize_downsample_linear",
+ // "test_resize_downsample_nearest",
+ // "test_resize_downsample_scales_cubic_A_n0p5_exclude_outside",
+ // "test_resize_downsample_scales_cubic_align_corners",
+ // "test_resize_downsample_scales_cubic",
+ // "test_resize_downsample_scales_linear_align_corners",
+ // "test_resize_downsample_scales_linear",
+ // "test_resize_downsample_scales_nearest",
+ // "test_resize_downsample_sizes_cubic",
+ // "test_resize_downsample_sizes_linear_pytorch_half_pixel",
+ // "test_resize_downsample_sizes_nearest_tf_half_pixel_for_nn",
+ // "test_resize_downsample_sizes_nearest",
+ // "test_resize_nearest",
+ // "test_resize_tf_crop_and_resize",
+ // "test_resize_upsample_linear",
+ // "test_resize_upsample_nearest",
+ // "test_resize_upsample_scales_cubic_A_n0p5_exclude_outside",
+ // "test_resize_upsample_scales_cubic_align_corners",
+ // "test_resize_upsample_scales_cubic_asymmetric",
+ // "test_resize_upsample_scales_cubic",
+ // "test_resize_upsample_scales_linear_align_corners",
+ // "test_resize_upsample_scales_linear",
+ // "test_resize_upsample_scales_nearest",
+ // "test_resize_upsample_sizes_cubic",
+ // "opset{12,13,17,18}/test_resize_upsample_sizes_nearest_ceil_half_pixel",
+ // "opset{12,13,17,18}/test_resize_upsample_sizes_nearest_floor_align_corners",
+ // "opset{12,13,17,18}/test_resize_upsample_sizes_nearest_round_prefer_ceil_asymmetric",
+ // "test_resize_upsample_sizes_nearest",
+ // // "test_reversesequence_batch",
+ // // "test_reversesequence_time",
+ // // "test_rnn_seq_length",
+ // // "test_roialign_aligned_false",
+ // // "test_roialign_aligned_true",
+ // // "test_roialign",
+ // // "test_round",
+ // // "test_scan_sum",
+ // // "test_scan9_sum",
+ // // "test_scatter_elements_with_axis",
+ // // "test_scatter_elements_with_duplicate_indices",
+ // // "test_scatter_elements_with_negative_indices",
+ // // "test_scatter_elements_without_axis",
+ // // "test_scatter_with_axis",
+ // // "test_scatter_without_axis",
+ // // "test_scatternd_add",
+ // // "test_scatternd_multiply",
+ // // "test_scatternd",
+ // // "test_sce_mean_3d_expanded",
+ // // "test_sce_mean_3d_log_prob_expanded",
+ // // "test_sce_mean_3d_log_prob",
+ // // "test_sce_mean_3d",
+ // // "test_sce_mean_expanded",
+ // // "test_sce_mean_log_prob_expanded",
+ // // "test_sce_mean_log_prob",
+ // // "test_sce_mean_no_weight_ii_3d_expanded",
+ // // "test_sce_mean_no_weight_ii_3d_log_prob_expanded",
+ // // "test_sce_mean_no_weight_ii_3d_log_prob",
+ // // "test_sce_mean_no_weight_ii_3d",
+ // // "test_sce_mean_no_weight_ii_4d_expanded",
+ // // "test_sce_mean_no_weight_ii_4d_log_prob_expanded",
+ // // "test_sce_mean_no_weight_ii_4d_log_prob",
+ // // "test_sce_mean_no_weight_ii_4d",
+ // // "test_sce_mean_no_weight_ii_expanded",
+ // // "test_sce_mean_no_weight_ii_log_prob_expanded",
+ // // "test_sce_mean_no_weight_ii_log_prob",
+ // // "test_sce_mean_no_weight_ii",
+ // // "test_sce_mean_weight_expanded",
+ // // "test_sce_mean_weight_ii_3d_expanded",
+ // // "test_sce_mean_weight_ii_3d_log_prob_expanded",
+ // // "test_sce_mean_weight_ii_3d_log_prob",
+ // // "test_sce_mean_weight_ii_3d",
+ // // "test_sce_mean_weight_ii_4d_expanded",
+ // // "test_sce_mean_weight_ii_4d_log_prob_expanded",
+ // // "test_sce_mean_weight_ii_4d_log_prob",
+ // // "test_sce_mean_weight_ii_4d",
+ // // "test_sce_mean_weight_ii_expanded",
+ // // "test_sce_mean_weight_ii_log_prob_expanded",
+ // // "test_sce_mean_weight_ii_log_prob",
+ // // "test_sce_mean_weight_ii",
+ // // "test_sce_mean_weight_log_prob_expanded",
+ // // "test_sce_mean_weight_log_prob",
+ // // "test_sce_mean_weight",
+ // // "test_sce_mean",
+ // // "test_sce_NCd1_mean_weight_negative_ii_expanded",
+ // // "test_sce_NCd1_mean_weight_negative_ii_log_prob_expanded",
+ // // "test_sce_NCd1_mean_weight_negative_ii_log_prob",
+ // // "test_sce_NCd1_mean_weight_negative_ii",
+ // // "test_sce_NCd1d2d3_none_no_weight_negative_ii_expanded",
+ // // "test_sce_NCd1d2d3_none_no_weight_negative_ii_log_prob_expanded",
+ // // "test_sce_NCd1d2d3_none_no_weight_negative_ii_log_prob",
+ // // "test_sce_NCd1d2d3_none_no_weight_negative_ii",
+ // // "test_sce_NCd1d2d3_sum_weight_high_ii_expanded",
+ // // "test_sce_NCd1d2d3_sum_weight_high_ii_log_prob_expanded",
+ // // "test_sce_NCd1d2d3_sum_weight_high_ii_log_prob",
+ // // "test_sce_NCd1d2d3_sum_weight_high_ii",
+ // // "test_sce_NCd1d2d3d4d5_mean_weight_expanded",
+ // // "test_sce_NCd1d2d3d4d5_mean_weight_log_prob_expanded",
+ // // "test_sce_NCd1d2d3d4d5_mean_weight_log_prob",
+ // // "test_sce_NCd1d2d3d4d5_mean_weight",
+ // // "test_sce_NCd1d2d3d4d5_none_no_weight_expanded",
+ // // "test_sce_NCd1d2d3d4d5_none_no_weight_log_prob_expanded",
+ // // "test_sce_NCd1d2d3d4d5_none_no_weight_log_prob",
+ // // "test_sce_NCd1d2d3d4d5_none_no_weight",
+ // // "test_sce_none_expanded",
+ // // "test_sce_none_log_prob_expanded",
+ // // "test_sce_none_log_prob",
+ // // "test_sce_none_weights_expanded",
+ // // "test_sce_none_weights_log_prob_expanded",
+ // // "test_sce_none_weights_log_prob",
+ // // "test_sce_none_weights",
+ // // "test_sce_none",
+ // // "test_sce_sum_expanded",
+ // // "test_sce_sum_log_prob_expanded",
+ // // "test_sce_sum_log_prob",
+ // // "test_sce_sum",
+ // "test_selu_default",
+ // "test_selu_example",
+ // "test_selu",
+ // // "test_sequence_insert_at_back",
+ // // "test_sequence_insert_at_front",
+ // // "test_sequence_map_add_1_sequence_1_tensor_expanded",
+ // // "test_sequence_map_add_1_sequence_1_tensor",
+ // // "test_sequence_map_add_2_sequences_expanded",
+ // // "test_sequence_map_add_2_sequences",
+ // // "test_sequence_map_extract_shapes_expanded",
+ // // "test_sequence_map_extract_shapes",
+ // // "test_sequence_map_identity_1_sequence_1_tensor_expanded",
+ // // "test_sequence_map_identity_1_sequence_1_tensor",
+ // // "test_sequence_map_identity_1_sequence_expanded",
+ // // "test_sequence_map_identity_1_sequence",
+ // // "test_sequence_map_identity_2_sequences_expanded",
+ // // "test_sequence_map_identity_2_sequences",
+ // "test_shrink_hard",
+ // "test_shrink_soft",
+ "test_sigmoid_example",
+ "test_sigmoid",
+ // "test_sign",
+ // "test_simple_rnn_batchwise",
+ // "test_simple_rnn_defaults",
+ // "test_simple_rnn_with_initial_bias",
+ "test_sin_example",
+ "test_sin",
+ // "test_sinh_example",
+ // "test_sinh",
+ // // "test_size_example",
+ // // "test_size",
+ // "test_slice_default_axes",
+ // "test_slice_default_steps",
+ // "test_slice_end_out_of_bounds",
+ // "test_slice_neg_steps",
+ // "test_slice_neg",
+ // "test_slice_negative_axes",
+ // "test_slice_start_out_of_bounds",
+ // "test_slice",
+ // "test_softmax_axis_0_expanded",
+ "test_softmax_axis_0",
+ // "test_softmax_axis_1_expanded",
+ "test_softmax_axis_1",
+ // "test_softmax_axis_2_expanded",
+ "test_softmax_axis_2",
+ // "test_softmax_cross_entropy_input_shape_is_NCd1_mean_weight_negative_ignore_index_expanded",
+ // "test_softmax_cross_entropy_input_shape_is_NCd1_mean_weight_negative_ignore_index_log_prob_expanded",
+ // "test_softmax_cross_entropy_input_shape_is_NCd1_mean_weight_negative_ignore_index_log_prob",
+ // "test_softmax_cross_entropy_input_shape_is_NCd1_mean_weight_negative_ignore_index",
+ // "test_softmax_cross_entropy_input_shape_is_NCd1d2d3_none_no_weight_negative_ignore_index_expanded",
+ // "test_softmax_cross_entropy_input_shape_is_NCd1d2d3_none_no_weight_negative_ignore_index_log_prob_expanded",
+ // "test_softmax_cross_entropy_input_shape_is_NCd1d2d3_none_no_weight_negative_ignore_index_log_prob",
+ // "test_softmax_cross_entropy_input_shape_is_NCd1d2d3_none_no_weight_negative_ignore_index",
+ // "test_softmax_cross_entropy_input_shape_is_NCd1d2d3_sum_weight_high_ignore_index_expanded",
+ // "test_softmax_cross_entropy_input_shape_is_NCd1d2d3_sum_weight_high_ignore_index_log_prob_expanded",
+ // "test_softmax_cross_entropy_input_shape_is_NCd1d2d3_sum_weight_high_ignore_index_log_prob",
+ // "test_softmax_cross_entropy_input_shape_is_NCd1d2d3_sum_weight_high_ignore_index",
+ // "test_softmax_cross_entropy_input_shape_is_NCd1d2d3d4d5_mean_weight_expanded",
+ // "test_softmax_cross_entropy_input_shape_is_NCd1d2d3d4d5_mean_weight_log_prob_expanded",
+ // "test_softmax_cross_entropy_input_shape_is_NCd1d2d3d4d5_mean_weight_log_prob",
+ // "test_softmax_cross_entropy_input_shape_is_NCd1d2d3d4d5_mean_weight",
+ // "test_softmax_cross_entropy_input_shape_is_NCd1d2d3d4d5_none_no_weight_expanded",
+ // "test_softmax_cross_entropy_input_shape_is_NCd1d2d3d4d5_none_no_weight_log_prob_expanded",
+ // "test_softmax_cross_entropy_input_shape_is_NCd1d2d3d4d5_none_no_weight_log_prob",
+ // "test_softmax_cross_entropy_input_shape_is_NCd1d2d3d4d5_none_no_weight",
+ // "test_softmax_cross_entropy_mean_3d_expanded",
+ // "test_softmax_cross_entropy_mean_3d_log_prob_expanded",
+ // "test_softmax_cross_entropy_mean_3d_log_prob",
+ // "test_softmax_cross_entropy_mean_3d",
+ // "test_softmax_cross_entropy_mean_expanded",
+ // "test_softmax_cross_entropy_mean_log_prob_expanded",
+ // "test_softmax_cross_entropy_mean_log_prob",
+ // "test_softmax_cross_entropy_mean_no_weight_ignore_index_3d_expanded",
+ // "test_softmax_cross_entropy_mean_no_weight_ignore_index_3d_log_prob_expanded",
+ // "test_softmax_cross_entropy_mean_no_weight_ignore_index_3d_log_prob",
+ // "test_softmax_cross_entropy_mean_no_weight_ignore_index_3d",
+ // "test_softmax_cross_entropy_mean_no_weight_ignore_index_4d_expanded",
+ // "test_softmax_cross_entropy_mean_no_weight_ignore_index_4d_log_prob_expanded",
+ // "test_softmax_cross_entropy_mean_no_weight_ignore_index_4d_log_prob",
+ // "test_softmax_cross_entropy_mean_no_weight_ignore_index_4d",
+ // "test_softmax_cross_entropy_mean_no_weight_ignore_index_expanded",
+ // "test_softmax_cross_entropy_mean_no_weight_ignore_index_log_prob_expanded",
+ // "test_softmax_cross_entropy_mean_no_weight_ignore_index_log_prob",
+ // "test_softmax_cross_entropy_mean_no_weight_ignore_index",
+ // "test_softmax_cross_entropy_mean_weight_expanded",
+ // "test_softmax_cross_entropy_mean_weight_ignore_index_3d_expanded",
+ // "test_softmax_cross_entropy_mean_weight_ignore_index_3d_log_prob_expanded",
+ // "test_softmax_cross_entropy_mean_weight_ignore_index_3d_log_prob",
+ // "test_softmax_cross_entropy_mean_weight_ignore_index_3d",
+ // "test_softmax_cross_entropy_mean_weight_ignore_index_4d_expanded",
+ // "test_softmax_cross_entropy_mean_weight_ignore_index_4d_log_prob_expanded",
+ // "test_softmax_cross_entropy_mean_weight_ignore_index_4d_log_prob",
+ // "test_softmax_cross_entropy_mean_weight_ignore_index_4d",
+ // "test_softmax_cross_entropy_mean_weight_ignore_index_expanded",
+ // "test_softmax_cross_entropy_mean_weight_ignore_index_log_prob_expanded",
+ // "test_softmax_cross_entropy_mean_weight_ignore_index_log_prob",
+ // "test_softmax_cross_entropy_mean_weight_ignore_index",
+ // "test_softmax_cross_entropy_mean_weight_log_prob_expanded",
+ // "test_softmax_cross_entropy_mean_weight_log_prob",
+ // "test_softmax_cross_entropy_mean_weight",
+ // "test_softmax_cross_entropy_mean",
+ // "test_softmax_cross_entropy_none_expanded",
+ // "test_softmax_cross_entropy_none_log_prob_expanded",
+ // "test_softmax_cross_entropy_none_log_prob",
+ // "test_softmax_cross_entropy_none_weights_expanded",
+ // "test_softmax_cross_entropy_none_weights_log_prob_expanded",
+ // "test_softmax_cross_entropy_none_weights_log_prob",
+ // "test_softmax_cross_entropy_none_weights",
+ // "test_softmax_cross_entropy_none",
+ // "test_softmax_cross_entropy_sum_expanded",
+ // "test_softmax_cross_entropy_sum_log_prob_expanded",
+ // "test_softmax_cross_entropy_sum_log_prob",
+ // "test_softmax_cross_entropy_sum",
+ // "opset13/test_softmax_default_axis_expanded",
+ "opset13/test_softmax_default_axis",
+ // "test_softmax_example_expanded",
+ "test_softmax_example",
+ // "test_softmax_large_number_expanded",
+ "test_softmax_large_number",
+ // "test_softmax_negative_axis_expanded",
+ "test_softmax_negative_axis",
+ // // "test_softplus_example",
+ // // "test_softplus",
+ // // "test_softsign_example",
+ // // "test_softsign",
+ // "test_spacetodepth_example",
+ // "test_spacetodepth",
+ // "test_split_equal_parts_1d",
+ // "test_split_equal_parts_2d",
+ // "test_split_equal_parts_default_axis",
+ // "test_split_variable_parts_1d",
+ // "test_split_variable_parts_2d",
+ // "test_split_variable_parts_default_axis",
+ // "test_split_zero_size_splits",
+ "test_sqrt_example",
+ "test_sqrt",
+ // "test_squeeze_negative_axes",
+ // "test_squeeze",
+ // // "test_stft_with_window",
+ // // "test_stft",
+ // // "test_strnormalizer_export_monday_casesensintive_lower",
+ // // "test_strnormalizer_export_monday_casesensintive_nochangecase",
+ // // "test_strnormalizer_export_monday_casesensintive_upper",
+ // // "test_strnormalizer_export_monday_empty_output",
+ // // "test_strnormalizer_export_monday_insensintive_upper_twodim",
+ // // "test_strnormalizer_nostopwords_nochangecase",
+ "test_sub_bcast",
+ "test_sub_example",
+ // "test_sub_uint8",
+ "test_sub",
+ // "test_sum_example",
+ // "test_sum_one_input",
+ // "test_sum_two_inputs",
+ "test_tan_example",
+ "test_tan",
+ "test_tanh_example",
+ "test_tanh",
+ // // "test_tfidfvectorizer_tf_batch_onlybigrams_skip0",
+ // // "test_tfidfvectorizer_tf_batch_onlybigrams_skip5",
+ // // "test_tfidfvectorizer_tf_batch_uniandbigrams_skip5",
+ // // "test_tfidfvectorizer_tf_only_bigrams_skip0",
+ // // "test_tfidfvectorizer_tf_onlybigrams_levelempty",
+ // // "test_tfidfvectorizer_tf_onlybigrams_skip5",
+ // // "test_tfidfvectorizer_tf_uniandbigrams_skip5",
+ // "test_thresholdedrelu_default",
+ // "test_thresholdedrelu_example",
+ // "test_thresholdedrelu",
+ // "test_tile_precomputed",
+ // "test_tile",
+ // // "test_top_k_negative_axis",
+ // // "test_top_k_smallest",
+ // // "test_top_k",
+ // // "test_training_dropout_default_mask",
+ // // "test_training_dropout_default",
+ // // "test_training_dropout_mask",
+ // // "test_training_dropout_zero_ratio_mask",
+ // // "test_training_dropout_zero_ratio",
+ // // "test_training_dropout",
+ "test_transpose_all_permutations_0",
+ "test_transpose_all_permutations_1",
+ "test_transpose_all_permutations_2",
+ "test_transpose_all_permutations_3",
+ "test_transpose_all_permutations_4",
+ "test_transpose_all_permutations_5",
+ "test_transpose_default"
+ // "test_tril_neg",
+ // "test_tril_one_row_neg",
+ // "test_tril_out_neg",
+ // "test_tril_out_pos",
+ // "test_tril_pos",
+ // "test_tril_square_neg",
+ // "test_tril_square",
+ // "test_tril_zero",
+ // "test_tril",
+ // "test_triu_neg",
+ // "test_triu_one_row",
+ // "test_triu_out_neg_out",
+ // "test_triu_out_pos",
+ // "test_triu_pos",
+ // "test_triu_square_neg",
+ // "test_triu_square",
+ // "test_triu_zero",
+ // "test_triu",
+ // // "test_unique_not_sorted_without_axis",
+ // // "test_unique_sorted_with_axis_3d",
+ // // "test_unique_sorted_with_axis",
+ // // "test_unique_sorted_with_negative_axis",
+ // // "test_unique_sorted_without_axis",
+ // "test_unsqueeze_axis_0",
+ // "test_unsqueeze_axis_1",
+ // "test_unsqueeze_axis_2",
+ // "test_unsqueeze_axis_3",
+ // "test_unsqueeze_negative_axes",
+ // "test_unsqueeze_three_axes",
+ // "test_unsqueeze_two_axes",
+ // "test_unsqueeze_unsorted_axes",
+ // "test_unsqueeze",
+ // "test_wrap_pad"
+ // "test_upsample_nearest",
+ // "test_where_example",
+ // "test_where_long_example",
+ // "test_xor_bcast3v1d",
+ // "test_xor_bcast3v2d",
+ // "test_xor_bcast4v2d",
+ // "test_xor_bcast4v3d",
+ // "test_xor_bcast4v4d",
+ // "test_xor2d",
+ // "test_xor3d",
+ // "test_xor4d"
],
"ops": []
}
diff --git a/js/web/test/test-main.ts b/js/web/test/test-main.ts
index 9bd0ec1425f95..2d83ce1e095ce 100644
--- a/js/web/test/test-main.ts
+++ b/js/web/test/test-main.ts
@@ -110,8 +110,7 @@ for (const group of ORT_WEB_TEST_CONFIG.model) {
let context: ModelTestContext;
before('prepare session', async () => {
- context = await ModelTestContext.create(
- test, ORT_WEB_TEST_CONFIG.profile, ORT_WEB_TEST_CONFIG.options.sessionOptions);
+ context = await ModelTestContext.create(test, ORT_WEB_TEST_CONFIG.profile, ORT_WEB_TEST_CONFIG.options);
});
after('release session', async () => {
diff --git a/js/web/test/test-runner.ts b/js/web/test/test-runner.ts
index 5e9b0910a2c68..6d5951be7b1e6 100644
--- a/js/web/test/test-runner.ts
+++ b/js/web/test/test-runner.ts
@@ -137,8 +137,9 @@ async function loadTensors(
}
async function initializeSession(
- modelFilePath: string, backendHint: string, ioBindingMode: Test.IOBindingMode, profile: boolean,
- sessionOptions: ort.InferenceSession.SessionOptions, fileCache?: FileCacheBuffer): Promise {
+ modelFilePath: string, backendHint: ort.InferenceSession.ExecutionProviderConfig, ioBindingMode: Test.IOBindingMode,
+ profile: boolean, sessionOptions: ort.InferenceSession.SessionOptions,
+ fileCache?: FileCacheBuffer): Promise {
const preloadModelData: Uint8Array|undefined =
fileCache && fileCache[modelFilePath] ? fileCache[modelFilePath] : undefined;
Logger.verbose(
@@ -232,9 +233,8 @@ export class ModelTestContext {
/**
* create a ModelTestContext object that used in every test cases in the given ModelTest.
*/
- static async create(
- modelTest: Test.ModelTest, profile: boolean,
- sessionOptions?: ort.InferenceSession.SessionOptions): Promise {
+ static async create(modelTest: Test.ModelTest, profile: boolean, testOptions?: Test.Options):
+ Promise {
if (this.initializing) {
throw new Error('cannot create a ModelTestContext object when the previous creation is not done');
}
@@ -243,8 +243,12 @@ export class ModelTestContext {
this.initializing = true;
const initStart = now();
+ const executionProviderConfig =
+ modelTest.backend === 'webnn' ? (testOptions?.webnnOptions || 'webnn') : modelTest.backend!;
const session = await initializeSession(
- modelTest.modelUrl, modelTest.backend!, modelTest.ioBinding, profile, sessionOptions || {}, this.cache);
+ modelTest.modelUrl, executionProviderConfig, modelTest.ioBinding, profile, testOptions?.sessionOptions || {},
+ this.cache);
+
const initEnd = now();
for (const testCase of modelTest.cases) {
diff --git a/js/web/test/test-types.ts b/js/web/test/test-types.ts
index 5bdc8d84cc7a5..cd008e82e570b 100644
--- a/js/web/test/test-types.ts
+++ b/js/web/test/test-types.ts
@@ -143,6 +143,7 @@ export declare namespace Test {
cudaFlags?: Record;
wasmOptions?: InferenceSession.WebAssemblyExecutionProviderOption;
webglOptions?: InferenceSession.WebGLExecutionProviderOption;
+ webnnOptions?: InferenceSession.WebNNExecutionProviderOption;
globalEnvFlags?: EnvOptions;
}
diff --git a/onnxruntime/contrib_ops/cpu/bert/attention_common.h b/onnxruntime/contrib_ops/cpu/bert/attention_common.h
index c9ed23895b60c..da489a6901512 100644
--- a/onnxruntime/contrib_ops/cpu/bert/attention_common.h
+++ b/onnxruntime/contrib_ops/cpu/bert/attention_common.h
@@ -133,6 +133,10 @@ constexpr const char* kMinSeqLenForFlashAttentionPackedQKV = "ORT_MIN_SEQ_LEN_FL
// Default value for the above setting.
constexpr int kDefaultMinSeqLenForFlashAttentionPackedQKV = 513;
+// Environment variable to enable loading more KV data in flight in
+// DecoderMaskedMultiHeadAttention/DecoderMaskedSelfAttention kernels
+constexpr const char* kDecoderMaskedAttentionLoadKVDataInFlight = "ORT_DECODER_MASKED_ATTENTION_LOAD_KV_DATA_IN_FLIGHT";
+
} // namespace attention
} // namespace contrib
diff --git a/onnxruntime/contrib_ops/cuda/bert/decoder_masked_multihead_attention.cc b/onnxruntime/contrib_ops/cuda/bert/decoder_masked_multihead_attention.cc
index 54aad9cbaf387..a9b60da0c96ca 100644
--- a/onnxruntime/contrib_ops/cuda/bert/decoder_masked_multihead_attention.cc
+++ b/onnxruntime/contrib_ops/cuda/bert/decoder_masked_multihead_attention.cc
@@ -70,6 +70,10 @@ Status DecoderMaskedMultiHeadAttention::ComputeInternal(OpKernelContext*
auto& device_prop = GetDeviceProp();
DecoderMaskedMultiHeadAttentionParams parameters;
+
+ parameters.kv_data_in_flight = ParseEnvironmentVariableWithDefault(
+ attention::kDecoderMaskedAttentionLoadKVDataInFlight, false);
+
bool is_dmmha_packing = (key == nullptr && value == nullptr);
ORT_RETURN_IF_ERROR(multihead_attention_helper::CheckInputs(query,
key,
diff --git a/onnxruntime/contrib_ops/cuda/bert/decoder_masked_self_attention.cc b/onnxruntime/contrib_ops/cuda/bert/decoder_masked_self_attention.cc
index 69ed07101e647..72ede2e22b557 100644
--- a/onnxruntime/contrib_ops/cuda/bert/decoder_masked_self_attention.cc
+++ b/onnxruntime/contrib_ops/cuda/bert/decoder_masked_self_attention.cc
@@ -52,6 +52,10 @@ Status DecoderMaskedSelfAttention::ComputeInternal(OpKernelContext* cont
auto& device_prop = GetDeviceProp();
DecoderMaskedMultiHeadAttentionParams parameters;
+
+ parameters.kv_data_in_flight = ParseEnvironmentVariableWithDefault(
+ attention::kDecoderMaskedAttentionLoadKVDataInFlight, false);
+
ORT_RETURN_IF_ERROR(CheckInputs(input->Shape(),
weights->Shape(),
bias->Shape(),
diff --git a/onnxruntime/contrib_ops/cuda/bert/fastertransformer_decoder_attention/decoder_masked_multihead_attention_impl.cu b/onnxruntime/contrib_ops/cuda/bert/fastertransformer_decoder_attention/decoder_masked_multihead_attention_impl.cu
index 33e7a33494778..9efb6f08e8e99 100644
--- a/onnxruntime/contrib_ops/cuda/bert/fastertransformer_decoder_attention/decoder_masked_multihead_attention_impl.cu
+++ b/onnxruntime/contrib_ops/cuda/bert/fastertransformer_decoder_attention/decoder_masked_multihead_attention_impl.cu
@@ -344,52 +344,148 @@ __global__ void masked_multihead_attention_kernel(DecoderMaskedMultiHeadAttentio
bool has_beams = params.cache_indir != nullptr && !params.is_cross_attention;
const int* beam_indices = has_beams ? ¶ms.cache_indir[bi_max_seq_length] : nullptr;
- for (int ti = ko; ti < ti_end; ti += K_PER_ITER) {
- bool is_masked = (params.mask != nullptr) && (params.mask[bi_total_seq_length + ti] == 0);
+ if (!params.kv_data_in_flight) {
+ for (int ti = ko; ti < ti_end; ti += K_PER_ITER) {
+ bool is_masked = (params.mask != nullptr) && (params.mask[bi_total_seq_length + ti] == 0);
- // The keys loaded from the key cache.
- K_vec_k k_vec[K_VECS_PER_THREAD];
- if (ti < tlength) {
- if (has_beams) {
- const int beam_offset = beam_indices[ti] * params.num_heads * params.max_sequence_length * head_size;
+ // The keys loaded from the key cache.
+ K_vec_k k_vec[K_VECS_PER_THREAD];
+ if (ti < tlength) {
+ if (has_beams) {
+ const int beam_offset = beam_indices[ti] * params.num_heads * params.max_sequence_length * head_size;
#pragma unroll
- for (int ii = 0; ii < K_VECS_PER_THREAD; ++ii) {
- int jj = ii * params.max_sequence_length + ti;
+ for (int ii = 0; ii < K_VECS_PER_THREAD; ++ii) {
+ int jj = ii * params.max_sequence_length + ti;
- k_vec[ii] = vec_conversion(
- (*reinterpret_cast(&k_cache_batch[beam_offset + jj * QK_ELTS_IN_16B])));
- }
- } else {
+ k_vec[ii] = vec_conversion(
+ (*reinterpret_cast(&k_cache_batch[beam_offset + jj * QK_ELTS_IN_16B])));
+ }
+ } else {
#pragma unroll
- for (int ii = 0; ii < K_VECS_PER_THREAD; ++ii) {
- int jj = ii * params.max_sequence_length + ti;
+ for (int ii = 0; ii < K_VECS_PER_THREAD; ++ii) {
+ int jj = ii * params.max_sequence_length + ti;
- k_vec[ii] = vec_conversion(
- (*reinterpret_cast(&k_cache_batch[jj * QK_ELTS_IN_16B])));
+ k_vec[ii] = vec_conversion(
+ (*reinterpret_cast(&k_cache_batch[jj * QK_ELTS_IN_16B])));
+ }
}
}
- }
- // Perform the dot product and normalize qk.
- // WARNING: ALL THE THREADS OF A WARP MUST ENTER!!!
- float qk = Qk_dot::dot(q_vec, k_vec) * inv_sqrt_dh;
+ // Perform the dot product and normalize qk.
+ // WARNING: ALL THE THREADS OF A WARP MUST ENTER!!!
+ float qk = Qk_dot::dot(q_vec, k_vec) * inv_sqrt_dh;
- // This is a deviation from FasterTransformer kernel implementation
- // but this aligns with ORT's other Attention kernels which strives to
- // mimic PyTorch when dealing with mask filter values
- if (is_masked) {
- qk += params.mask_filter_value;
+ // This is a deviation from FasterTransformer kernel implementation
+ // but this aligns with ORT's other Attention kernels which strives to
+ // mimic PyTorch when dealing with mask filter values
+ if (is_masked) {
+ qk += params.mask_filter_value;
+ }
+
+ // Store the product to shared memory. There's one qk value per timestep. Update the max.
+ if (ti < tlength && tidx % THREADS_PER_KEY == 0) {
+ if (params.relative_attention_bias != nullptr) {
+ qk = add_vec(qk,
+ reinterpret_cast(params.relative_attention_bias)[hi * params.sequence_length * params.total_sequence_length + ti]);
+ }
+ qk_max = fmaxf(qk_max, qk);
+ qk_smem[ti] = qk;
+ }
}
+ } else {
+ // TODO(hasesh): Tune this value for different workloads. Currently, it is tuned for Whisper model
+ // Also tune it for different architectures. This works best for Whisper on 80GB A100.
+ constexpr int K_CACHE_DATA_LOAD_UNROLL = 4;
- // Store the product to shared memory. There's one qk value per timestep. Update the max.
- if (ti < tlength && tidx % THREADS_PER_KEY == 0) {
- if (params.relative_attention_bias != nullptr) {
- qk = add_vec(qk,
- reinterpret_cast(params.relative_attention_bias)[hi * params.sequence_length * params.total_sequence_length + ti]);
+ for (int ti = ko; ti < ti_end; ti += (K_CACHE_DATA_LOAD_UNROLL * K_PER_ITER)) {
+ int is_masked[K_CACHE_DATA_LOAD_UNROLL];
+ int beam_offset[K_CACHE_DATA_LOAD_UNROLL];
+ int time_step[K_CACHE_DATA_LOAD_UNROLL];
+ bool time_bounds_cond[K_CACHE_DATA_LOAD_UNROLL];
+
+#pragma unroll
+ for (int k_unroll = 0; k_unroll < K_CACHE_DATA_LOAD_UNROLL; ++k_unroll) {
+ is_masked[k_unroll] = 1;
+ beam_offset[k_unroll] = 0;
+ time_step[k_unroll] = ti + k_unroll * K_PER_ITER;
+ time_bounds_cond[k_unroll] = (time_step[k_unroll] < tlength);
+ }
+
+#pragma unroll
+ for (int k_unroll = 0; k_unroll < K_CACHE_DATA_LOAD_UNROLL; ++k_unroll) {
+ if (time_bounds_cond[k_unroll] && params.mask != nullptr) {
+ is_masked[k_unroll] = params.mask[bi_total_seq_length + time_step[k_unroll]];
+ }
+ }
+
+ if (has_beams) {
+ int head_maxlength_headsize_prod = params.num_heads * params.max_sequence_length * head_size;
+
+#pragma unroll
+ for (int k_unroll = 0; k_unroll < K_CACHE_DATA_LOAD_UNROLL; ++k_unroll) {
+ if (time_bounds_cond[k_unroll]) {
+ beam_offset[k_unroll] = beam_indices[time_step[k_unroll]] * head_maxlength_headsize_prod;
+ }
+ }
+ }
+
+ // The keys loaded from the key cache.
+ K_vec_k k_vec[K_CACHE_DATA_LOAD_UNROLL][K_VECS_PER_THREAD];
+
+#pragma unroll
+ for (int k_unroll = 0; k_unroll < K_CACHE_DATA_LOAD_UNROLL; ++k_unroll) {
+ if (time_bounds_cond[k_unroll]) {
+ if (has_beams) {
+#pragma unroll
+ for (int ii = 0; ii < K_VECS_PER_THREAD; ++ii) {
+ int jj = ii * params.max_sequence_length + time_step[k_unroll];
+
+ k_vec[k_unroll][ii] = vec_conversion(
+ (*reinterpret_cast(&k_cache_batch[beam_offset[k_unroll] + jj * QK_ELTS_IN_16B])));
+ }
+ } else {
+#pragma unroll
+ for (int ii = 0; ii < K_VECS_PER_THREAD; ++ii) {
+ int jj = ii * params.max_sequence_length + time_step[k_unroll];
+
+ k_vec[k_unroll][ii] = vec_conversion(
+ (*reinterpret_cast(&k_cache_batch[jj * QK_ELTS_IN_16B])));
+ }
+ }
+ }
+ }
+
+ // Perform the dot product and normalize qk.
+ // WARNING: ALL THE THREADS OF A WARP MUST ENTER!!!
+ float qk[K_CACHE_DATA_LOAD_UNROLL];
+#pragma unroll
+ for (int k_unroll = 0; k_unroll < K_CACHE_DATA_LOAD_UNROLL; ++k_unroll) {
+ qk[k_unroll] = Qk_dot::dot(q_vec, k_vec[k_unroll]) * inv_sqrt_dh;
+ }
+
+// This is a deviation from FasterTransformer kernel implementation
+// but this aligns with ORT's other Attention kernels which strives to
+// mimic PyTorch when dealing with mask filter values
+#pragma unroll
+ for (int k_unroll = 0; k_unroll < K_CACHE_DATA_LOAD_UNROLL; ++k_unroll) {
+ if (time_bounds_cond[k_unroll] && is_masked[k_unroll] == 0) {
+ qk[k_unroll] += params.mask_filter_value;
+ }
+ }
+
+// Store the product to shared memory. There's one qk value per timestep. Update the max.
+#pragma unroll
+ for (int k_unroll = 0; k_unroll < K_CACHE_DATA_LOAD_UNROLL; ++k_unroll) {
+ if (time_bounds_cond[k_unroll] && (tidx % THREADS_PER_KEY == 0)) {
+ if (params.relative_attention_bias != nullptr) {
+ qk[k_unroll] = add_vec(qk[k_unroll],
+ reinterpret_cast(params.relative_attention_bias)[hi * params.sequence_length * params.total_sequence_length + time_step[k_unroll]]);
+ }
+ qk_max = fmaxf(qk_max, qk[k_unroll]);
+ qk_smem[time_step[k_unroll]] = qk[k_unroll];
+ }
}
- qk_max = fmaxf(qk_max, qk);
- qk_smem[ti] = qk;
}
}
@@ -504,18 +600,80 @@ __global__ void masked_multihead_attention_kernel(DecoderMaskedMultiHeadAttentio
V_vec_acum out;
zero(out);
- // Loop over the timesteps to compute the partial outputs.
- for (int ti = vo; ti < tlength; ti += V_PER_ITER) {
- // Fetch offset based on cache_indir when beam sampling
- const int beam_src = has_beams ? params.cache_indir[bi_max_seq_length + ti] : 0;
- const int beam_offset = has_beams ? beam_src * params.num_heads * params.max_sequence_length * head_size : 0;
+ if (!params.kv_data_in_flight) {
+ // Loop over the timesteps to compute the partial outputs.
+ for (int ti = vo; ti < tlength; ti += V_PER_ITER) {
+ // Fetch offset based on cache_indir when beam sampling
+ const int beam_src = has_beams ? params.cache_indir[bi_max_seq_length + ti] : 0;
+ const int beam_offset = has_beams ? beam_src * params.num_heads * params.max_sequence_length * head_size : 0;
+
+ // Load the values from the cache.
+ V_vec_k v = vec_conversion(*reinterpret_cast(&v_cache_batch[beam_offset + ti * head_size]));
+
+ // Load the logits from shared memory.
+ T logit = logits_smem[ti];
+ out = fma(logit, v, out);
+ }
+ } else {
+ // Loop over the timesteps to compute the partial outputs.
+
+ // TODO(hasesh): Tune this value for different workloads. Currently, it is tuned for Whisper model
+ // Also tune it for different architectures. This works best for Whisper on 80GB A100.
+ constexpr int V_CACHE_DATA_LOAD_UNROLL = 8;
+
+ for (int ti = vo; ti < tlength; ti += V_CACHE_DATA_LOAD_UNROLL * V_PER_ITER) {
+ int beam_src[V_CACHE_DATA_LOAD_UNROLL];
+ int beam_offset[V_CACHE_DATA_LOAD_UNROLL];
+ int time_step[V_CACHE_DATA_LOAD_UNROLL];
+ bool time_bounds_cond[V_CACHE_DATA_LOAD_UNROLL];
+
+#pragma unroll
+ for (int v_unroll = 0; v_unroll < V_CACHE_DATA_LOAD_UNROLL; ++v_unroll) {
+ beam_src[v_unroll] = 0;
+ beam_offset[v_unroll] = 0;
+ time_step[v_unroll] = ti + v_unroll * V_PER_ITER;
+ time_bounds_cond[v_unroll] = (time_step[v_unroll] < tlength);
+ }
+
+ int head_maxlength_headsize_prod = params.num_heads * params.max_sequence_length * head_size;
+
+ if (has_beams) {
+// Do the global memory read and corresponding compute in separate unrolled loops
+#pragma unroll
+ for (int v_unroll = 0; v_unroll < V_CACHE_DATA_LOAD_UNROLL; ++v_unroll) {
+ if (time_bounds_cond[v_unroll]) {
+ beam_src[v_unroll] = params.cache_indir[bi_max_seq_length + time_step[v_unroll]];
+ }
+ }
+
+#pragma unroll
+ for (int v_unroll = 0; v_unroll < V_CACHE_DATA_LOAD_UNROLL; ++v_unroll) {
+ if (time_bounds_cond[v_unroll]) {
+ beam_offset[v_unroll] = beam_src[v_unroll] * head_maxlength_headsize_prod;
+ }
+ }
+ }
- // Load the values from the cache.
- V_vec_k v = vec_conversion(*reinterpret_cast(&v_cache_batch[beam_offset + ti * head_size]));
+ // Load the values from the V-cache and logits from shared memory.
+ V_vec_k v[V_CACHE_DATA_LOAD_UNROLL];
+ T logits[V_CACHE_DATA_LOAD_UNROLL];
- // Load the logits from shared memory.
- T logit = logits_smem[ti];
- out = fma(logit, v, out);
+// Do the global memory read and compute in separate unrolled loops
+#pragma unroll
+ for (int v_unroll = 0; v_unroll < V_CACHE_DATA_LOAD_UNROLL; ++v_unroll) {
+ if (time_bounds_cond[v_unroll]) {
+ v[v_unroll] = vec_conversion(*reinterpret_cast(&v_cache_batch[beam_offset[v_unroll] + time_step[v_unroll] * head_size]));
+ logits[v_unroll] = logits_smem[time_step[v_unroll]];
+ }
+ }
+
+#pragma unroll
+ for (int v_unroll = 0; v_unroll < V_CACHE_DATA_LOAD_UNROLL; ++v_unroll) {
+ if (time_bounds_cond[v_unroll]) {
+ out = fma(logits[v_unroll], v[v_unroll], out);
+ }
+ }
+ }
}
// One group of threads computes the product(s) for the current timestep.
diff --git a/onnxruntime/contrib_ops/cuda/bert/fastertransformer_decoder_attention/decoder_masked_multihead_attention_impl.h b/onnxruntime/contrib_ops/cuda/bert/fastertransformer_decoder_attention/decoder_masked_multihead_attention_impl.h
index 4b408dafa2d81..1a17757d1ec2d 100644
--- a/onnxruntime/contrib_ops/cuda/bert/fastertransformer_decoder_attention/decoder_masked_multihead_attention_impl.h
+++ b/onnxruntime/contrib_ops/cuda/bert/fastertransformer_decoder_attention/decoder_masked_multihead_attention_impl.h
@@ -22,6 +22,12 @@ struct DecoderMaskedMultiHeadAttentionParams : AttentionParameters {
bool is_cross_attention = false;
bool is_packed_qkv = false;
+ // Useful to better use global memory bandwidth on certain CUDA architectures.
+ // Turned off by default for now until we fully understand performance implications
+ // for all types of workloads.
+ // Can be turned on by appropriate environment variable (see attention_common.h).
+ bool kv_data_in_flight = false;
+
void* q = nullptr;
void* q_bias = nullptr;
@@ -62,4 +68,4 @@ void mmha_launch_kernel(const DecoderMaskedMultiHeadAttentionParams& params, cud
} // namespace cuda
} // namespace contrib
-} // namespace onnxruntime
+} // namespace onnxruntime
\ No newline at end of file
diff --git a/onnxruntime/contrib_ops/rocm/fused_conv.cc b/onnxruntime/contrib_ops/rocm/fused_conv.cc
index d597e0d57fbcb..63804f79a32fb 100644
--- a/onnxruntime/contrib_ops/rocm/fused_conv.cc
+++ b/onnxruntime/contrib_ops/rocm/fused_conv.cc
@@ -76,7 +76,12 @@ struct FNVHash {
void HashConvolutionDescriptor(miopenConvolutionDescriptor_t cdesc) {
int spatial_dim = 1;
#if ROCM_VERSION >= 50500
- miopenGetConvolutionSpatialDim(cdesc, &spatial_dim);
+ MIOPEN_CALL(miopenGetConvolutionSpatialDim(cdesc, &spatial_dim));
+ std::vector pads{spatial_dim};
+ std::vector strides{spatial_dim};
+ std::vector dilations{spatial_dim};
+ miopenConvolutionMode_t mode;
+ MIOPEN_CALL(miopenGetConvolutionNdDescriptor(cdesc, spatial_dim, &spatial_dim, pads.data(), strides.data(), dilations.data(), &mode));
#else
// Previous versions of MIOpen doesn't provide API to probe the dimension of a
// miopenConvolutionDescriptor_t, so we have to guess.
@@ -100,11 +105,12 @@ struct FNVHash {
pads.resize(spatial_dim);
strides.resize(spatial_dim);
dilations.resize(spatial_dim);
+#endif
(*this) << spatial_dim;
(*this) << pads;
(*this) << strides;
(*this) << dilations;
-#endif
+ (*this) << mode;
}
private:
@@ -313,6 +319,8 @@ class FusedConv : public onnxruntime::rocm::Conv {
auto ret = miopenCompileFusionPlan(handle, fusion->plan);
if (miopenStatusSuccess == ret) {
fusion->compiled_on.insert(handle);
+ } else {
+ return ret;
}
return miopenStatusSuccess;
}
diff --git a/onnxruntime/core/framework/session_options.h b/onnxruntime/core/framework/session_options.h
index 40c59cfcf699d..796a018ac0f68 100644
--- a/onnxruntime/core/framework/session_options.h
+++ b/onnxruntime/core/framework/session_options.h
@@ -65,6 +65,11 @@ struct FreeDimensionOverride {
* Configuration information for a session.
*/
struct SessionOptions {
+#if defined(__wasm__) && defined(__EMSCRIPTEN_PTHREADS__)
+ static constexpr bool DEFAULT_USE_PER_SESSION_THREADS = false;
+#else
+ static constexpr bool DEFAULT_USE_PER_SESSION_THREADS = true;
+#endif
ExecutionMode execution_mode = ExecutionMode::ORT_SEQUENTIAL;
// set the execution order of the graph
@@ -129,7 +134,8 @@ struct SessionOptions {
// By default the session uses its own set of threadpools, unless this is set to false.
// Use this in conjunction with the CreateEnvWithGlobalThreadPools API.
- bool use_per_session_threads = true;
+ bool use_per_session_threads = DEFAULT_USE_PER_SESSION_THREADS;
+
bool thread_pool_allow_spinning = true;
// Deterministic compute is likely not as performant. This option is default to false.
diff --git a/onnxruntime/core/mlas/lib/sqnbitgemm.h b/onnxruntime/core/mlas/lib/sqnbitgemm.h
index f8f7dcd43699f..90fdd710e2773 100644
--- a/onnxruntime/core/mlas/lib/sqnbitgemm.h
+++ b/onnxruntime/core/mlas/lib/sqnbitgemm.h
@@ -232,7 +232,7 @@ MlasSQNBitGemmOperation(
size_t RowsRemaining = RangeCountM;
while (RowsRemaining > 0) {
-#if defined(MLAS_TARGET_AMD64_IX86) || defined(MLAS_TARGET_POWER)
+#if defined(MLAS_TARGET_AMD64_IX86) || defined(MLAS_TARGET_POWER) || defined(MLAS_TARGET_LARCH64)
auto RowsHandled = GetMlasPlatform().GemmFloatKernel(
a_row, dequant_b, c_blk, K, RowsRemaining, CountN, lda, ldc, 1.f, true
);
diff --git a/onnxruntime/core/optimizer/qdq_transformer/ensure_unique_dq_for_node_unit.cc b/onnxruntime/core/optimizer/qdq_transformer/ensure_unique_dq_for_node_unit.cc
index cc0f7854791d4..9d53e28921784 100644
--- a/onnxruntime/core/optimizer/qdq_transformer/ensure_unique_dq_for_node_unit.cc
+++ b/onnxruntime/core/optimizer/qdq_transformer/ensure_unique_dq_for_node_unit.cc
@@ -53,7 +53,7 @@ Status DuplicateDQForOutputEdge(const graph_utils::GraphEdge& original_dq_output
MakeString("Added by ", kTransformerName),
dq_inputs,
{&new_dq_output_nodearg},
- nullptr, // attributes
+ &original_dq_node.GetAttributes(),
original_dq_node.Domain());
// set up edges
diff --git a/onnxruntime/core/providers/acl/math/gemm.h b/onnxruntime/core/providers/acl/math/gemm.h
index d2f297e83aedb..f5288d7f231b0 100644
--- a/onnxruntime/core/providers/acl/math/gemm.h
+++ b/onnxruntime/core/providers/acl/math/gemm.h
@@ -49,11 +49,18 @@ class Gemm : public onnxruntime::Gemm {
}
Status Compute(OpKernelContext* context) const override {
+#ifdef ACL_2308
+ if (this->packed_b_) {
+ // Prepacked RHS not supported, defaulting to cpu execution provider
+ return onnxruntime::Gemm::Compute(context);
+ }
+#endif
const auto A = context->Input(0);
const auto B = context->Input(1);
const auto C = context->Input(2);
- GemmHelper helper(A->Shape(), trans_A_ != CblasNoTrans, B->Shape(), trans_B_ != CblasNoTrans, C->Shape());
+ GemmHelper helper(A->Shape(), trans_A_ != CblasNoTrans, B->Shape(), trans_B_ != CblasNoTrans,
+ C != nullptr ? C->Shape() : TensorShape({}));
if (!helper.State().IsOK())
return helper.State();
@@ -70,7 +77,7 @@ class Gemm : public onnxruntime::Gemm {
return onnxruntime::Gemm::Compute(context);
}
- arm_compute::TensorShape cShape = ACLTensorShape(C->Shape());
+ arm_compute::TensorShape cShape = ACLTensorShape(C != nullptr ? C->Shape() : TensorShape({}));
if (useC &&
(cShape.num_dimensions() > 2 ||
(cShape.num_dimensions() == 2 && cShape[0] > 1 && cShape[1] > 1))) { // Multi-dimensional Bias
@@ -89,8 +96,13 @@ class Gemm : public onnxruntime::Gemm {
(cShape[1] == 1 && cShape[0] != (long unsigned int)N)) {
return onnxruntime::Gemm::Compute(context);
}
+#ifdef ACL_2308
+ cShape = arm_compute::TensorShape(N);
+ LOGS_DEFAULT(VERBOSE) << "Bias reshaped to: {" << N << "}";
+#else
cShape = arm_compute::TensorShape(1, N);
LOGS_DEFAULT(VERBOSE) << "Bias reshaped to: {1," << N << "}";
+#endif
}
int64_t K = helper.K();
diff --git a/onnxruntime/core/providers/acl/nn/batch_norm.cc b/onnxruntime/core/providers/acl/nn/batch_norm.cc
index da7fff730c96f..eb6a10074f1db 100755
--- a/onnxruntime/core/providers/acl/nn/batch_norm.cc
+++ b/onnxruntime/core/providers/acl/nn/batch_norm.cc
@@ -44,6 +44,16 @@ Status BatchNorm::Compute(OpKernelContext* context) const {
const Tensor* M = context->Input(3); // mean
const Tensor* V = context->Input(4); // var
+ if (S->Shape().NumDimensions() > 1) {
+ LOGS_DEFAULT(WARNING) << "ACL does not support scale with dimension greater than 1; defaulting to cpu implementation";
+ return onnxruntime::BatchNorm::Compute(context);
+ }
+
+ if (this->is_train_) {
+ LOGS_DEFAULT(WARNING) << "ACL does not have batchnorm training support; defaulting to cpu implementation";
+ return onnxruntime::BatchNorm::Compute(context);
+ }
+
ORT_RETURN_IF_ERROR(BatchNormHelper::ValidateInputs(X, S, B, M, V));
LOGS_DEFAULT(VERBOSE) << "BatchNorm ACL:";
@@ -70,7 +80,23 @@ Status BatchNorm::Compute(OpKernelContext* context) const {
auto layer = std::make_shared();
+#ifdef ACL_2308
+ arm_compute::TensorShape in_x_shape;
+ const TensorShape& x_shape = X->Shape();
+ const auto& dims_vec = x_shape.GetDims();
+ in_x_shape.set(3, onnxruntime::narrow(dims_vec[0])); // N
+ in_x_shape.set(1, 1); // H
+ size_t W = 1;
+ for (size_t i = 2; i < dims_vec.size(); ++i) {
+ W *= narrow(dims_vec[i]);
+ }
+ in_x_shape.set(0, W); // W
+ in_x_shape.set(2, onnxruntime::narrow(dims_vec[1])); // C
+
+ tbatch_norm.in->allocator()->init(arm_compute::TensorInfo(in_x_shape, arm_compute::Format::F32));
+#else
tbatch_norm.in->allocator()->init(arm_compute::TensorInfo(ACLTensorShape(X->Shape()), arm_compute::Format::F32));
+#endif
tbatch_norm.out->allocator()->init(arm_compute::TensorInfo(tbatch_norm.in->info()->tensor_shape(), arm_compute::Format::F32));
tbatch_norm.scale->allocator()->init(arm_compute::TensorInfo(ACLTensorShape(S->Shape()), arm_compute::Format::F32));
@@ -132,11 +158,7 @@ ONNX_OPERATOR_VERSIONED_KERNEL_EX(
7, 9,
kAclExecutionProvider,
KernelDefBuilder()
- .TypeConstraint("X", DataTypeImpl::GetTensorType())
- .TypeConstraint("scale", DataTypeImpl::GetTensorType())
- .TypeConstraint("B", DataTypeImpl::GetTensorType())
- .TypeConstraint("mean", DataTypeImpl::GetTensorType())
- .TypeConstraint("var", DataTypeImpl::GetTensorType()),
+ .TypeConstraint("T", DataTypeImpl::GetTensorType()),
BatchNorm);
} // namespace acl
diff --git a/onnxruntime/core/providers/acl/nn/batch_norm.h b/onnxruntime/core/providers/acl/nn/batch_norm.h
index c9ec08b67a779..264301976e6dc 100755
--- a/onnxruntime/core/providers/acl/nn/batch_norm.h
+++ b/onnxruntime/core/providers/acl/nn/batch_norm.h
@@ -31,9 +31,9 @@ typedef struct {
typedef std::map::iterator BatchNormLayersIterator;
template
-class BatchNorm final : public OpKernel {
+class BatchNorm : public onnxruntime::BatchNorm {
public:
- explicit BatchNorm(const OpKernelInfo& info) : OpKernel(info) {
+ explicit BatchNorm(const OpKernelInfo& info) : onnxruntime::BatchNorm(info) {
auto st = info.GetAttr("epsilon", &epsilon_);
ORT_ENFORCE(st.IsOK(), st.ErrorMessage());
diff --git a/onnxruntime/core/providers/acl/nn/conv.cc b/onnxruntime/core/providers/acl/nn/conv.cc
index 1613d927d0f74..85bd0cfe96279 100644
--- a/onnxruntime/core/providers/acl/nn/conv.cc
+++ b/onnxruntime/core/providers/acl/nn/conv.cc
@@ -105,7 +105,11 @@ Status Conv::Compute(OpKernelContext* context) const {
TensorShapeVector Y_dims;
Y_dims.insert(Y_dims.begin(), {N, M});
TensorShape input_shape = X->Shape().Slice(2);
+#ifdef ACL_2308
+ ORT_RETURN_IF_ERROR(conv_attrs_.InferPadsAndOutputShape(input_shape, kernel_shape, strides, dilations, pads, Y_dims));
+#else
ORT_RETURN_IF_ERROR(conv_attrs_.InferOutputShape(input_shape, kernel_shape, strides, dilations, pads, Y_dims));
+#endif
Tensor* Y = context->Output(0, TensorShape(Y_dims));
LOGS_DEFAULT(VERBOSE) << "Y " << Y->Shape().ToString().c_str();
@@ -222,6 +226,15 @@ Status Conv::Compute(OpKernelContext* context) const {
1 /* depth multiplier */,
acl_activ_enabled ? arm_compute::ActivationLayerInfo(acl_activ_func, conv_attrs_.alpha) : arm_compute::ActivationLayerInfo(),
arm_compute::Size2D(aclDilation0, dilations[0])));
+#elif defined(ACL_2308)
+ bool optimizable = bool(arm_compute::NEDepthwiseConvolutionLayer::validate(tconv.in->info(),
+ tconv.k->info(),
+ (B != nullptr) ? tconv.b->info() : nullptr,
+ tconv.out->info(),
+ aclPadStride,
+ 1 /* depth multiplier */,
+ acl_activ_enabled ? arm_compute::ActivationLayerInfo(acl_activ_func, conv_attrs_.alpha) : arm_compute::ActivationLayerInfo(),
+ arm_compute::Size2D(aclDilation0, dilations[0])));
#endif
if (optimizable) {
@@ -230,7 +243,7 @@ Status Conv::Compute(OpKernelContext* context) const {
auto layer = std::make_shared();
#elif defined(ACL_1908)
auto layer = std::make_shared();
-#elif defined(ACL_2002)
+#elif defined(ACL_2002) || defined(ACL_2308)
auto layer = std::make_shared();
#endif
@@ -238,7 +251,7 @@ Status Conv::Compute(OpKernelContext* context) const {
layer->configure(tconv.in.get(), tconv.k.get(), (B != nullptr) ? tconv.b.get() : nullptr, tconv.out.get(),
aclPadStride, 1 /* depth multiplier */,
acl_activ_enabled ? arm_compute::ActivationLayerInfo(acl_activ_func, conv_attrs_.alpha) : arm_compute::ActivationLayerInfo());
-#elif defined(ACL_1905) || defined(ACL_1908) || defined(ACL_2002)
+#elif defined(ACL_1905) || defined(ACL_1908) || defined(ACL_2002) || defined(ACL_2308)
layer->configure(tconv.in.get(), tconv.k.get(), (B != nullptr) ? tconv.b.get() : nullptr, tconv.out.get(),
aclPadStride, 1 /* depth multiplier */,
acl_activ_enabled ? arm_compute::ActivationLayerInfo(acl_activ_func, conv_attrs_.alpha) : arm_compute::ActivationLayerInfo(),
diff --git a/onnxruntime/core/providers/acl/nn/conv.h b/onnxruntime/core/providers/acl/nn/conv.h
index ecb11fb3c8f4e..660d47b4172df 100644
--- a/onnxruntime/core/providers/acl/nn/conv.h
+++ b/onnxruntime/core/providers/acl/nn/conv.h
@@ -8,6 +8,9 @@
#include "core/providers/acl/acl_execution_provider.h"
// ACL
+#ifdef ACL_2308
+#include "arm_compute/runtime/Tensor.h"
+#endif
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/runtime/TensorAllocator.h"
#include "arm_compute/runtime/Allocator.h"
diff --git a/onnxruntime/core/providers/acl/nn/pool.cc b/onnxruntime/core/providers/acl/nn/pool.cc
index dc79ae65bf21e..8fbcba3ed87a7 100644
--- a/onnxruntime/core/providers/acl/nn/pool.cc
+++ b/onnxruntime/core/providers/acl/nn/pool.cc
@@ -61,7 +61,14 @@ ACLNEPool PoolOperation(onnxruntime::OpKernelContext* context,
tpool.out->allocator()->init(arm_compute::TensorInfo(ACLTensorShape(Y->Shape(), PREF_DIM), arm_compute::Format::F32));
if (pool_attrs.global_pooling) {
- layer->configure(tpool.in.get(), tpool.out.get(), arm_compute::PoolingLayerInfo(pool_type));
+ layer->configure(tpool.in.get(),
+ tpool.out.get(),
+ arm_compute::PoolingLayerInfo(pool_type
+#ifdef ACL_2308
+ ,
+ arm_compute::DataLayout::NCHW
+#endif
+ ));
} else {
TensorShapeVector aclStrides(2);
aclStrides[0] = (strides.size() == 2) ? strides[1] : 1;
@@ -104,7 +111,13 @@ ACLNEPool PoolOperation(onnxruntime::OpKernelContext* context,
LOGS_DEFAULT(VERBOSE) << "strides: {" << aclStrides[0] << "," << aclStrides[1] << "}";
LOGS_DEFAULT(VERBOSE) << "excludePadding: " << excludePadding;
- arm_compute::PoolingLayerInfo pool_info(pool_type, aclSize, aclPadStride, excludePadding);
+ arm_compute::PoolingLayerInfo pool_info(pool_type,
+ aclSize,
+#ifdef ACL_2308
+ arm_compute::DataLayout::NCHW,
+#endif
+ aclPadStride,
+ excludePadding);
layer->configure(tpool.in.get(), tpool.out.get(), pool_info);
}
diff --git a/onnxruntime/core/providers/acl/tensor/concat.cc b/onnxruntime/core/providers/acl/tensor/concat.cc
index 081472729cfcf..75eedaac80aea 100644
--- a/onnxruntime/core/providers/acl/tensor/concat.cc
+++ b/onnxruntime/core/providers/acl/tensor/concat.cc
@@ -10,6 +10,8 @@
#include "core/providers/acl/acl_common.h"
#include "core/providers/acl/acl_fwd.h"
+#include
+
#define PREF_DIM 4
namespace onnxruntime {
@@ -22,17 +24,27 @@ Status Concat::Compute(OpKernelContext* ctx) const {
return onnxruntime::Concat::Compute(ctx);
}
+ if (axis_ < 0) {
+ LOGS_DEFAULT(WARNING) << "ACL does not have support for negative axis; defaulting to cpu implementation";
+ return onnxruntime::Concat::Compute(ctx);
+ }
+
// Number of input tensors to concatenate
auto input_count = Node().InputArgCount().front();
// Hold pointers to the input tensors to be used in the PrepareForCompute() step
std::vector input_tensors;
- input_tensors.reserve(input_count);
+ int empty_tensors = 0;
for (int i = 0; i < input_count; ++i) {
+ if (ctx->Input(i)->Shape().Size() == 0) {
+ empty_tensors++;
+ continue;
+ }
input_tensors.push_back(ctx->Input(i));
}
+ input_count -= empty_tensors;
- auto output_dims = input_tensors[0]->Shape().AsShapeVector();
+ auto output_dims = ctx->Input(0)->Shape().AsShapeVector();
// 'Concat' mode
if (!is_stack_) {
@@ -64,7 +76,11 @@ Status Concat::Compute(OpKernelContext* ctx) const {
LOGS_DEFAULT(VERBOSE) << "Concat ACL:";
arm_compute::Tensor output;
+#ifdef ACL_2308
+ std::vector inputs_vector;
+#else
std::vector inputs_vector;
+#endif
for (int i = 0; i < input_count; i++) {
arm_compute::Tensor* input = new arm_compute::Tensor();
auto X = input_tensors[i];
@@ -75,7 +91,9 @@ Status Concat::Compute(OpKernelContext* ctx) const {
}
arm_compute::NEConcatenateLayer layer;
- layer.configure(inputs_vector, &output, 3 - axis_);
+ if (input_count > 0) {
+ layer.configure(inputs_vector, &output, 3 - axis_);
+ }
LOGS_DEFAULT(VERBOSE) << "axis: " << axis_;
LOGS_DEFAULT(VERBOSE) << std::endl;
@@ -83,7 +101,11 @@ Status Concat::Compute(OpKernelContext* ctx) const {
for (int i = 0; i < input_count; i++) {
auto X = input_tensors[i];
const T* x_data = X->Data();
+#ifdef ACL_2308
+ arm_compute::Tensor* in = const_cast(static_cast(inputs_vector[i]));
+#else
arm_compute::Tensor* in = static_cast(inputs_vector[i]);
+#endif
if (X->Shape().Size() != 0 && in->info()->has_padding()) {
in->allocator()->allocate();
@@ -101,7 +123,9 @@ Status Concat::Compute(OpKernelContext* ctx) const {
ACLImportMemory(output.allocator(), (void*)y_data, Y->Shape().Size() * 4);
}
- layer.run();
+ if (input_count > 0) {
+ layer.run();
+ }
if (Y->Shape().Size() != 0 && output.info()->has_padding()) {
importDataFromTensor(&output, y_data);
diff --git a/onnxruntime/core/providers/cpu/cpu_execution_provider.cc b/onnxruntime/core/providers/cpu/cpu_execution_provider.cc
index 75f6f8d2eddd5..9cd0b3d0620af 100644
--- a/onnxruntime/core/providers/cpu/cpu_execution_provider.cc
+++ b/onnxruntime/core/providers/cpu/cpu_execution_provider.cc
@@ -989,6 +989,7 @@ class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain,
class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 20, Float8E5M2FNUZ, IsNaN);
#endif
class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 20, IsInf);
+class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 20, StringConcat);
class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 20, RegexFullMatch);
// !!PLEASE READ BELOW!! Following that, add new entries above this comment
@@ -2448,6 +2449,7 @@ Status RegisterOnnxOperatorKernels(KernelRegistry& kernel_registry) {
BuildKernelCreateInfo,
#endif
BuildKernelCreateInfo,
+ BuildKernelCreateInfo,
BuildKernelCreateInfo,
};
diff --git a/onnxruntime/core/providers/cpu/text/string_concat.cc b/onnxruntime/core/providers/cpu/text/string_concat.cc
new file mode 100644
index 0000000000000..bc626f8e055aa
--- /dev/null
+++ b/onnxruntime/core/providers/cpu/text/string_concat.cc
@@ -0,0 +1,60 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
+#include "string_concat.h"
+#include "core/providers/cpu/math/element_wise_ops.h"
+#include "core/common/common.h"
+
+namespace onnxruntime {
+ONNX_CPU_OPERATOR_KERNEL(StringConcat, 20,
+ KernelDefBuilder().TypeConstraint("T", DataTypeImpl::GetTensorType()),
+ StringConcat);
+
+Status StringConcat::Compute(OpKernelContext* context) const {
+ ProcessBroadcastSpanFuncs broadcast_funcs{[](BroadcastHelper& broadcast_helper) {
+ auto x = broadcast_helper.ScalarInput0();
+ auto y = broadcast_helper.SpanInput1();
+ auto y_iter = y.begin();
+ auto output_iter = broadcast_helper.OutputSpan().begin();
+ const auto x_size = x.length();
+ while (y_iter != y.end()) {
+ output_iter->reserve(x_size + y_iter->length());
+ output_iter->append(x);
+ output_iter->append(*y_iter);
+ y_iter++;
+ output_iter++;
+ }
+ },
+ [](BroadcastHelper& broadcast_helper) {
+ auto x = broadcast_helper.SpanInput0();
+ auto x_iter = x.begin();
+ auto y = broadcast_helper.ScalarInput1();
+ auto output_iter = broadcast_helper.OutputSpan().begin();
+ const auto y_size = y.length();
+ while (x_iter != x.end()) {
+ output_iter->reserve(y_size + x_iter->length());
+ output_iter->append(*x_iter);
+ output_iter->append(y);
+ x_iter++;
+ output_iter++;
+ }
+ },
+ [](BroadcastHelper& broadcast_helper) {
+ auto x_iter = broadcast_helper.SpanInput0().begin();
+ auto y_iter = broadcast_helper.SpanInput1().begin();
+ auto output = broadcast_helper.OutputSpan();
+ auto output_iter = output.begin();
+ while (output_iter != output.end()) {
+ output_iter->reserve(x_iter->length() + y_iter->length());
+ output_iter->append(*x_iter);
+ output_iter->append(*y_iter);
+ x_iter++;
+ y_iter++;
+ output_iter++;
+ }
+ }};
+ UntypedBroadcastTwo(*context, broadcast_funcs);
+ return Status::OK();
+}
+
+} // namespace onnxruntime
diff --git a/onnxruntime/core/providers/cpu/text/string_concat.h b/onnxruntime/core/providers/cpu/text/string_concat.h
new file mode 100644
index 0000000000000..63c1ea8a41146
--- /dev/null
+++ b/onnxruntime/core/providers/cpu/text/string_concat.h
@@ -0,0 +1,17 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
+#pragma once
+
+#include "core/framework/op_kernel.h"
+
+namespace onnxruntime {
+
+class StringConcat final : public OpKernel {
+ public:
+ StringConcat(const OpKernelInfo& info) : OpKernel(info) {}
+
+ Status Compute(OpKernelContext* context) const override;
+};
+
+} // namespace onnxruntime
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/GraphDescBuilder.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/GraphDescBuilder.cpp
index adb4fd131119f..c6a15e76f4736 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/GraphDescBuilder.cpp
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/GraphDescBuilder.cpp
@@ -360,7 +360,7 @@ namespace Dml::GraphDescBuilder
// The tensor description's size should be no larger than the constant input unless it was rounded to
// the required alignment.
assert(((constantInput->GetTensorByteSize() + 3) & ~3) >= tensorDesc->totalTensorSizeInBytes);
- size_t minimumConstantSize = std::min(constantInput->GetTensorByteSize(), tensorDesc->totalTensorSizeInBytes);
+ size_t minimumConstantSize = std::min(constantInput->GetTensorByteSize(), gsl::narrow_cast(tensorDesc->totalTensorSizeInBytes));
auto data = static_cast(constantInput->GetData());
std::vector tensorData(data, data + minimumConstantSize);
diff --git a/onnxruntime/core/providers/js/js_kernel.h b/onnxruntime/core/providers/js/js_kernel.h
index 5c2d1f0b881ba..b850bea4bc275 100644
--- a/onnxruntime/core/providers/js/js_kernel.h
+++ b/onnxruntime/core/providers/js/js_kernel.h
@@ -67,6 +67,7 @@ namespace js {
float value; \
ORT_ENFORCE(info.GetAttr(#attr_name, &value));, \
, ({#attr_name : $1}), static_cast(value))
+#define JSEP_HEAP_PTR(ptr) reinterpret_cast(ptr)
// TODO:
// class JsMultiProgramKernel : public OpKernel { /* TBD */ };
diff --git a/onnxruntime/core/providers/js/operators/conv.h b/onnxruntime/core/providers/js/operators/conv.h
index 5c0fbf93a4004..98a530c6b77f6 100644
--- a/onnxruntime/core/providers/js/operators/conv.h
+++ b/onnxruntime/core/providers/js/operators/conv.h
@@ -54,13 +54,13 @@ class ConvBase : public JsKernel {
static_cast(conv_attrs_.group),
static_cast(kernel_shape_0),
static_cast(local_pads.size()),
- reinterpret_cast(local_pads.size() > 0 ? local_pads.data() : nullptr) >> 2,
+ JSEP_HEAP_PTR(local_pads.size() > 0 ? local_pads.data() : nullptr) >> 2,
static_cast(conv_attrs_.strides.size() > 0 ? conv_attrs_.strides[0] : 0),
static_cast(channels_last),
- reinterpret_cast(&w_is_const_),
+ JSEP_HEAP_PTR(&w_is_const_),
conv_attrs_.activation.c_str(),
activation_params.size(),
- reinterpret_cast(activation_params_ptr) >> 2);
+ JSEP_HEAP_PTR(activation_params_ptr) >> 2);
} else {
JSEP_INIT_KERNEL_ATTRIBUTE(Conv, ({
"format" : $11 ? "NHWC" : "NCHW",
@@ -81,14 +81,14 @@ class ConvBase : public JsKernel {
static_cast(kernel_shape_0),
static_cast(kernel_shape_1),
static_cast(local_pads.size()),
- reinterpret_cast(local_pads.size() > 0 ? local_pads.data() : nullptr) >> 2,
+ JSEP_HEAP_PTR(local_pads.size() > 0 ? local_pads.data() : nullptr) >> 2,
static_cast(conv_attrs_.strides.size() > 0 ? conv_attrs_.strides[0] : 0),
static_cast(conv_attrs_.strides.size() > 1 ? conv_attrs_.strides[1] : 0),
static_cast(channels_last),
- reinterpret_cast(&w_is_const_),
+ JSEP_HEAP_PTR(&w_is_const_),
conv_attrs_.activation.c_str(),
activation_params.size(),
- reinterpret_cast(activation_params_ptr) >> 2);
+ JSEP_HEAP_PTR(activation_params_ptr) >> 2);
}
}
diff --git a/onnxruntime/core/providers/js/operators/conv_transpose.h b/onnxruntime/core/providers/js/operators/conv_transpose.h
index 5d30dc851e00f..353a946e95c21 100644
--- a/onnxruntime/core/providers/js/operators/conv_transpose.h
+++ b/onnxruntime/core/providers/js/operators/conv_transpose.h
@@ -64,11 +64,11 @@ class ConvTranspose : public JsKernel {
static_cast(pads_1),
static_cast(strides),
static_cast(channels_last),
- reinterpret_cast(&w_is_const_),
+ JSEP_HEAP_PTR(&w_is_const_),
gsl::narrow_cast(local_output_padding.size()),
- reinterpret_cast(local_output_padding_ptr) >> 2,
+ JSEP_HEAP_PTR(local_output_padding_ptr) >> 2,
gsl::narrow_cast(local_output_shape.size()),
- reinterpret_cast(local_output_shape_ptr) >> 2,
+ JSEP_HEAP_PTR(local_output_shape_ptr) >> 2,
conv_transpose_attrs_.activation.c_str());
} else {
constexpr size_t pads_vec_size = 4;
@@ -114,17 +114,17 @@ class ConvTranspose : public JsKernel {
"activation" : UTF8ToString($13)
}),
static_cast(conv_transpose_attrs_.auto_pad),
- reinterpret_cast(local_dilations.data()) >> 2,
+ JSEP_HEAP_PTR(local_dilations.data()) >> 2,
static_cast(conv_transpose_attrs_.group),
- reinterpret_cast(local_kernel_shape.data()) >> 2,
- reinterpret_cast(local_pads.data()) >> 2,
- reinterpret_cast(local_strides.data()) >> 2,
+ JSEP_HEAP_PTR(local_kernel_shape.data()) >> 2,
+ JSEP_HEAP_PTR(local_pads.data()) >> 2,
+ JSEP_HEAP_PTR(local_strides.data()) >> 2,
static_cast(channels_last),
- reinterpret_cast(&w_is_const_),
+ JSEP_HEAP_PTR(&w_is_const_),
gsl::narrow_cast(local_output_padding.size()),
- reinterpret_cast(local_output_padding_ptr) >> 2,
+ JSEP_HEAP_PTR(local_output_padding_ptr) >> 2,
gsl::narrow_cast(local_output_shape.size()),
- reinterpret_cast(local_output_shape_ptr) >> 2,
+ JSEP_HEAP_PTR(local_output_shape_ptr) >> 2,
conv_transpose_attrs_.activation.c_str());
}
}
diff --git a/onnxruntime/core/providers/js/operators/pad.h b/onnxruntime/core/providers/js/operators/pad.h
index 19168f40b4722..bf808be949cf8 100644
--- a/onnxruntime/core/providers/js/operators/pad.h
+++ b/onnxruntime/core/providers/js/operators/pad.h
@@ -26,7 +26,7 @@ class Pad : public JsKernel, public PadBase {
static_cast(mode_),
static_cast(value_),
gsl::narrow_cast(pads.size()),
- reinterpret_cast((pads.size() > 0) ? pads.data() : nullptr) >> 2);
+ JSEP_HEAP_PTR((pads.size() > 0) ? pads.data() : nullptr) >> 2);
}
};
diff --git a/onnxruntime/core/providers/js/operators/reduce.h b/onnxruntime/core/providers/js/operators/reduce.h
index a5a4aa834c2ca..95c4f2bec230d 100644
--- a/onnxruntime/core/providers/js/operators/reduce.h
+++ b/onnxruntime/core/providers/js/operators/reduce.h
@@ -8,29 +8,29 @@
namespace onnxruntime {
namespace js {
-#define JSEP_DEFINE_REDUCE_KERNEL(ReduceKernel) \
- template \
- class ReduceKernel : public JsKernel, public ReduceKernelBase { \
- public: \
- using ReduceKernelBase::axes_; \
- using ReduceKernelBase::noop_with_empty_axes_; \
- using ReduceKernelBase::keepdims_; \
- ReduceKernel(const OpKernelInfo& info) : JsKernel(info), ReduceKernelBase(info) { \
- std::vector axes(axes_.size()); \
- if (axes_.size() > 0) { \
- std::transform(axes_.begin(), axes_.end(), axes.begin(), \
- [](int64_t axis) { return gsl::narrow_cast(axis); }); \
- } \
- JSEP_INIT_KERNEL_ATTRIBUTE(ReduceKernel, ({ \
- "keepDims" : !!$1, \
- "noopWithEmptyAxes" : !!$2, \
- "axes" : $3 ? (Array.from(HEAP32.subarray($4, $4 + $3))) : [], \
- }), \
- static_cast(keepdims_), \
- static_cast(noop_with_empty_axes_), \
- gsl::narrow_cast(axes.size()), \
- reinterpret_cast((axes.size() > 0) ? axes.data() : nullptr) >> 2); \
- } \
+#define JSEP_DEFINE_REDUCE_KERNEL(ReduceKernel) \
+ template \
+ class ReduceKernel : public JsKernel, public ReduceKernelBase { \
+ public: \
+ using ReduceKernelBase::axes_; \
+ using ReduceKernelBase::noop_with_empty_axes_; \
+ using ReduceKernelBase::keepdims_; \
+ ReduceKernel(const OpKernelInfo& info) : JsKernel(info), ReduceKernelBase(info) { \
+ std::vector axes(axes_.size()); \
+ if (axes_.size() > 0) { \
+ std::transform(axes_.begin(), axes_.end(), axes.begin(), \
+ [](int64_t axis) { return gsl::narrow_cast(axis); }); \
+ } \
+ JSEP_INIT_KERNEL_ATTRIBUTE(ReduceKernel, ({ \
+ "keepDims" : !!$1, \
+ "noopWithEmptyAxes" : !!$2, \
+ "axes" : $3 ? (Array.from(HEAP32.subarray($4, $4 + $3))) : [], \
+ }), \
+ static_cast(keepdims_), \
+ static_cast(noop_with_empty_axes_), \
+ gsl::narrow_cast(axes.size()), \
+ JSEP_HEAP_PTR((axes.size() > 0) ? axes.data() : nullptr) >> 2); \
+ } \
};
JSEP_DEFINE_REDUCE_KERNEL(ReduceMax);
diff --git a/onnxruntime/core/providers/js/operators/resize.h b/onnxruntime/core/providers/js/operators/resize.h
index 65854222ba988..4b1c288ae3015 100644
--- a/onnxruntime/core/providers/js/operators/resize.h
+++ b/onnxruntime/core/providers/js/operators/resize.h
@@ -34,7 +34,7 @@ class Resize : public JsKernel, public UpsampleBase {
}),
static_cast(antialias_),
gsl::narrow_cast(axes.size()),
- reinterpret_cast((axes.size() > 0) ? axes.data() : nullptr) >> 2,
+ JSEP_HEAP_PTR((axes.size() > 0) ? axes.data() : nullptr) >> 2,
resize_coordinate_transformation_mode.c_str(),
static_cast(cubic_coeff_a_),
static_cast(exclude_outside_),
diff --git a/onnxruntime/core/providers/js/operators/slice.h b/onnxruntime/core/providers/js/operators/slice.h
index 6792997025d65..989adabf029a5 100644
--- a/onnxruntime/core/providers/js/operators/slice.h
+++ b/onnxruntime/core/providers/js/operators/slice.h
@@ -24,11 +24,11 @@ class Slice : public JsKernel, public SliceBase {
"ends" : $3 ? Array.from(HEAP32.subarray($4, $4 + $3)) : [],
"axes" : $5 ? Array.from(HEAP32.subarray($6, $6 + $5)) : []}),
gsl::narrow_cast(starts.size()),
- reinterpret_cast