diff --git a/.github/actions/rust-toolchain-setup/action.yml b/.github/actions/rust-toolchain-setup/action.yml deleted file mode 100644 index bf73fede16c7f..0000000000000 --- a/.github/actions/rust-toolchain-setup/action.yml +++ /dev/null @@ -1,44 +0,0 @@ -# yaml-language-server: $schema=https://json.schemastore.org/github-action.json - -name: 'Rust toolchain setup' -description: 'Common setup steps for GitHub workflows for Rust projects' - -runs: - using: composite - steps: - - uses: dtolnay/rust-toolchain@1.71.0 - with: - components: clippy, rustfmt - - uses: extractions/setup-just@v1 - with: - just-version: '1.15.0' # optional semver specification, otherwise latest - - ### - ### Linux setup - ### - - name: rustup - # We need to use the nightly rust tool change to enable registry-auth / to connect to ADO feeds. - if: ${{ (runner.os == 'Linux') }} - run: | - rustup set profile minimal - rustup install - shell: bash - # - name: Cargo login - # if: ${{ (runner.os == 'Linux') }} - # run: just cargo-login-ci - # shell: bash - - ### - ### Windows setup - ### - - name: rustup - # We need to use the nightly rust tool change to enable registry-auth / to connect to ADO feeds. 
- if: ${{ (runner.os == 'Windows') }} - run: | - rustup set profile minimal - rustup install - shell: pwsh - # - name: Cargo login - # if: ${{ (runner.os == 'Windows') }} - # run: just cargo-login-ci-windows - # shell: pwsh diff --git a/.github/workflows/rust-ci.yml b/.github/workflows/rust-ci.yml deleted file mode 100644 index 725c40c2ded53..0000000000000 --- a/.github/workflows/rust-ci.yml +++ /dev/null @@ -1,132 +0,0 @@ -name: Rust - -on: [pull_request] - -env: - CARGO_TERM_COLOR: always - RUST_LOG: onnxruntime=debug,onnxruntime-sys=debug - RUST_BACKTRACE: 1 - MANIFEST_PATH: ${{ github.workspace }}/rust/Cargo.toml - -jobs: - fmt: - name: Rustfmt - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@v4 - - uses: ./.github/actions/rust-toolchain-setup - - name: vendor onnxruntime source - run: just vendor - - name: fmt - run: cargo fmt --all -- --check - - download: - name: Download prebuilt ONNX Runtime archive from build.rs - runs-on: ubuntu-latest - env: - ORT_RUST_STRATEGY: download - steps: - - uses: actions/checkout@v4 - - uses: ./.github/actions/rust-toolchain-setup - - run: rustup target install x86_64-unknown-linux-gnu - - run: rustup target install x86_64-apple-darwin - - run: rustup target install i686-pc-windows-msvc - - run: rustup target install x86_64-pc-windows-msvc - # ****************************************************************** - - name: Download prebuilt archive (CPU, x86_64-unknown-linux-gnu) - run: cargo build --target x86_64-unknown-linux-gnu --manifest-path ${{ env.MANIFEST_PATH }} - - name: Verify prebuilt archive downloaded (CPU, x86_64-unknown-linux-gnu) - run: ls -lh target/x86_64-unknown-linux-gnu/debug/build/onnxruntime-sys-*/out/onnxruntime-linux-x64-1.*.tgz - # ****************************************************************** - - name: Download prebuilt archive (CPU, x86_64-apple-darwin) - run: cargo build --target x86_64-apple-darwin --manifest-path ${{ env.MANIFEST_PATH }} - - name: Verify prebuilt archive downloaded 
(CPU, x86_64-apple-darwin) - run: ls -lh target/x86_64-apple-darwin/debug/build/onnxruntime-sys-*/out/onnxruntime-osx-x64-1.*.tgz - # ****************************************************************** - - name: Download prebuilt archive (CPU, i686-pc-windows-msvc) - run: cargo build --target i686-pc-windows-msvc --manifest-path ${{ env.MANIFEST_PATH }} - - name: Verify prebuilt archive downloaded (CPU, i686-pc-windows-msvc) - run: ls -lh target/i686-pc-windows-msvc/debug/build/onnxruntime-sys-*/out/onnxruntime-win-x86-1.*.zip - # ****************************************************************** - - name: Download prebuilt archive (CPU, x86_64-pc-windows-msvc) - run: cargo build --target x86_64-pc-windows-msvc --manifest-path ${{ env.MANIFEST_PATH }} - - name: Verify prebuilt archive downloaded (CPU, x86_64-pc-windows-msvc) - run: ls -lh target/x86_64-pc-windows-msvc/debug/build/onnxruntime-sys-*/out/onnxruntime-win-x64-1.*.zip - # ****************************************************************** - - name: Download prebuilt archive (GPU, x86_64-unknown-linux-gnu) - env: - ORT_USE_CUDA: "yes" - run: cargo build --target x86_64-unknown-linux-gnu --manifest-path ${{ env.MANIFEST_PATH }} - - name: Verify prebuilt archive downloaded (GPU, x86_64-unknown-linux-gnu) - run: ls -lh target/x86_64-unknown-linux-gnu/debug/build/onnxruntime-sys-*/out/onnxruntime-linux-x64-gpu-1.*.tgz - # ****************************************************************** - - name: Download prebuilt archive (GPU, x86_64-pc-windows-msvc) - env: - ORT_USE_CUDA: "yes" - run: cargo build --target x86_64-pc-windows-msvc --manifest-path ${{ env.MANIFEST_PATH }} - - name: Verify prebuilt archive downloaded (GPU, x86_64-pc-windows-msvc) - run: ls -lh target/x86_64-pc-windows-msvc/debug/build/onnxruntime-sys-*/out/onnxruntime-win-gpu-x64-1.*.zip - - test: - name: Test Suite - runs-on: ${{ matrix.os }} - strategy: - fail-fast: false - matrix: - target: - [ - x86_64-unknown-linux-gnu, - 
x86_64-apple-darwin, - x86_64-pc-windows-msvc, - i686-pc-windows-msvc, - ] - include: - - target: x86_64-unknown-linux-gnu - os: ubuntu-latest - - target: x86_64-apple-darwin - os: macos-latest - - target: x86_64-pc-windows-msvc - os: windows-latest - - target: i686-pc-windows-msvc - os: windows-latest - env: - CARGO_BUILD_TARGET: ${{ matrix.target }} - steps: - - uses: actions/checkout@v4 - - uses: ./.github/actions/rust-toolchain-setup - - name: vendor onnxruntime source - run: just vendor - - run: rustup target install ${{ matrix.target }} - - name: Install additional packages (macOS) - if: contains(matrix.target, 'x86_64-apple-darwin') - run: brew install libomp - - name: Build (cargo build) - run: cargo build --all --manifest-path ${{ env.MANIFEST_PATH }} - - name: Build tests (cargo test) - run: cargo test --no-run --manifest-path ${{ env.MANIFEST_PATH }} - - name: Build onnxruntime with 'model-fetching' feature - run: cargo build --manifest-path ${{ env.MANIFEST_PATH }} --features model-fetching - - name: Test onnxruntime-sys - run: cargo build --package onnxruntime-sys -- --test-threads=1 --nocapture - - name: Test onnxruntime - run: cargo test --manifest-path ${{ env.MANIFEST_PATH }} --features model-fetching -- --test-threads=1 --nocapture - - clippy: - name: Clippy - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@v4 - - uses: ./.github/actions/rust-toolchain-setup - - name: vendor onnxruntime source - run: just vendor - - run: clippy --all-features --manifest-path ${{ env.MANIFEST_PATH }} -- -D warnings - - package-sys: - name: Package onnxruntime-sys - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@v4 - - uses: ./.github/actions/rust-toolchain-setup - - name: vendor onnxruntime source - run: just vendor - - run: cargo package --allow-dirty --package onnxruntime-sys diff --git a/.pipelines/OneBranch.Nuget-WindowsAI-Pipeline.Official.yml b/.pipelines/OneBranch.Nuget-WindowsAI-Pipeline.Official.yml index b9de1b79e1d51..67f9d8b0ce392 
100644 --- a/.pipelines/OneBranch.Nuget-WindowsAI-Pipeline.Official.yml +++ b/.pipelines/OneBranch.Nuget-WindowsAI-Pipeline.Official.yml @@ -53,10 +53,6 @@ extends: BuildArch: x86 PythonPackageName: pythonx86 - - template: .pipelines/windowsai-steps.yml@self - parameters: - BuildArch: arm - - template: .pipelines/windowsai-steps.yml@self parameters: BuildArch: arm64 @@ -72,11 +68,6 @@ extends: PythonPackageName: pythonx86 Runtime: static - - template: .pipelines/windowsai-steps.yml@self - parameters: - BuildArch: arm - Runtime: static - - template: .pipelines/windowsai-steps.yml@self parameters: BuildArch: arm64 @@ -94,11 +85,9 @@ extends: dependsOn: - Windows_Packaging_x64_dynamic - Windows_Packaging_x86_dynamic - - Windows_Packaging_arm_dynamic - Windows_Packaging_arm64_dynamic - Windows_Packaging_x64_static - Windows_Packaging_x86_static - - Windows_Packaging_arm_static - Windows_Packaging_arm64_static condition: succeeded() steps: @@ -120,12 +109,6 @@ extends: artifactName: 'drop_Windows_Build_Windows_Packaging_arm64_dynamic' targetPath: '$(Build.BinariesDirectory)/nuget-artifact-arm64' - - task: DownloadPipelineArtifact@0 - displayName: 'Download Pipeline Artifact - NuGet DirectML arm' - inputs: - artifactName: 'drop_Windows_Build_Windows_Packaging_arm_dynamic' - targetPath: '$(Build.BinariesDirectory)/nuget-artifact-arm' - - task: DownloadPipelineArtifact@0 displayName: 'Download Pipeline Artifact - NuGet DirectML x64 StaticRuntime' inputs: @@ -144,12 +127,6 @@ extends: artifactName: 'drop_Windows_Build_Windows_Packaging_arm64_static' targetPath: '$(Build.BinariesDirectory)/nuget-artifact-arm64-static-runtime' - - task: DownloadPipelineArtifact@0 - displayName: 'Download Pipeline Artifact - NuGet DirectML arm StaticRuntime' - inputs: - artifactName: 'drop_Windows_Build_Windows_Packaging_arm_static' - targetPath: '$(Build.BinariesDirectory)/nuget-artifact-arm-static-runtime' - - task: PowerShell@2 displayName: 'Bundle NuGet and other binaries' inputs: @@ 
-194,17 +171,7 @@ extends: $arm64_static_runtime_nupkg_unzipped_directory = [System.IO.Path]::Combine($arm64_static_runtime_nupkg_unzipped_directory_root, 'binaries', [System.IO.Path]::GetFileNameWithoutExtension($arm64_static_runtime_nuget_package)) [System.IO.Compression.ZipFile]::ExtractToDirectory($arm64_static_runtime_nuget_package, $arm64_static_runtime_nupkg_unzipped_directory) - $nupkgs = (Get-ChildItem ..\nuget-artifact-arm -Filter Microsoft.AI.MachineLearning*.nupkg -Recurse) - $arm_nuget_package = $nupkgs[0].FullName - $arm_nupkg_unzipped_directory_root = $nupkgs[0].Directory.FullName - $arm_nupkg_unzipped_directory = [System.IO.Path]::Combine($arm_nupkg_unzipped_directory_root, 'binaries', [System.IO.Path]::GetFileNameWithoutExtension($arm_nuget_package)) - [System.IO.Compression.ZipFile]::ExtractToDirectory($arm_nuget_package, $arm_nupkg_unzipped_directory) - - $nupkgs = (Get-ChildItem ..\nuget-artifact-arm-static-runtime -Filter Microsoft.AI.MachineLearning*.nupkg -Recurse) - $arm_static_runtime_nuget_package = $nupkgs[0].FullName - $arm_static_runtime_nupkg_unzipped_directory_root = $nupkgs[0].Directory.FullName - $arm_static_runtime_nupkg_unzipped_directory = [System.IO.Path]::Combine($arm_static_runtime_nupkg_unzipped_directory_root, 'binaries', [System.IO.Path]::GetFileNameWithoutExtension($arm_static_runtime_nuget_package)) - [System.IO.Compression.ZipFile]::ExtractToDirectory($arm_static_runtime_nuget_package, $arm_static_runtime_nupkg_unzipped_directory) + $x64_static_runtime_path_old = [System.IO.Path]::Combine($x64_static_runtime_nupkg_unzipped_directory, 'runtimes', 'win-x64', '_native') $x64_static_runtime_path_new = [System.IO.Path]::Combine($x64_nupkg_unzipped_directory, 'runtimes', 'win-x64', '_native', 'static') @@ -216,10 +183,7 @@ extends: $arm64_runtime_path_new = [System.IO.Path]::Combine($x64_nupkg_unzipped_directory, 'runtimes', 'win-arm64', '_native') $arm64_static_runtime_path_old = 
[System.IO.Path]::Combine($arm64_static_runtime_nupkg_unzipped_directory, 'runtimes', 'win-arm64', '_native') $arm64_static_runtime_path_new = [System.IO.Path]::Combine($x64_nupkg_unzipped_directory, 'runtimes', 'win-arm64', '_native', 'static') - $arm_runtime_path_old = [System.IO.Path]::Combine($arm_nupkg_unzipped_directory, 'runtimes', 'win-arm', '_native') - $arm_runtime_path_new = [System.IO.Path]::Combine($x64_nupkg_unzipped_directory, 'runtimes', 'win-arm', '_native') - $arm_static_runtime_path_old = [System.IO.Path]::Combine($arm_static_runtime_nupkg_unzipped_directory, 'runtimes', 'win-arm', '_native') - $arm_static_runtime_path_new = [System.IO.Path]::Combine($x64_nupkg_unzipped_directory, 'runtimes', 'win-arm', '_native', 'static') + $uap_build_path_old = [System.IO.Path]::Combine($x64_static_runtime_nupkg_unzipped_directory, 'build', 'native') $uap_build_path_new = [System.IO.Path]::Combine($x64_nupkg_unzipped_directory, 'build', 'uap10.0') @@ -228,8 +192,6 @@ extends: New-Item -Path $x86_static_runtime_path_new -ItemType Directory New-Item -Path $arm64_runtime_path_new -ItemType Directory New-Item -Path $arm64_static_runtime_path_new -ItemType Directory - New-Item -Path $arm_runtime_path_new -ItemType Directory - New-Item -Path $arm_static_runtime_path_new -ItemType Directory Copy-Item ([System.IO.Path]::Combine($x86_runtime_path_old, 'onnxruntime.dll')) $x86_runtime_path_new Copy-Item ([System.IO.Path]::Combine($x86_runtime_path_old, 'onnxruntime.lib')) $x86_runtime_path_new @@ -241,11 +203,6 @@ extends: Copy-Item ([System.IO.Path]::Combine($arm64_runtime_path_old, 'microsoft.ai.machinelearning.dll')) $arm64_runtime_path_new Copy-Item ([System.IO.Path]::Combine($arm64_runtime_path_old, 'microsoft.ai.machinelearning.lib')) $arm64_runtime_path_new - Copy-Item ([System.IO.Path]::Combine($arm_runtime_path_old, 'onnxruntime.dll')) $arm_runtime_path_new - Copy-Item ([System.IO.Path]::Combine($arm_runtime_path_old, 'onnxruntime.lib')) $arm_runtime_path_new - 
Copy-Item ([System.IO.Path]::Combine($arm_runtime_path_old, 'microsoft.ai.machinelearning.dll')) $arm_runtime_path_new - Copy-Item ([System.IO.Path]::Combine($arm_runtime_path_old, 'microsoft.ai.machinelearning.lib')) $arm_runtime_path_new - Copy-Item ([System.IO.Path]::Combine($x64_static_runtime_path_old, 'onnxruntime.dll')) ([System.IO.Path]::Combine($x64_static_runtime_path_new, 'onnxruntime.dll')) Copy-Item ([System.IO.Path]::Combine($x64_static_runtime_path_old, 'onnxruntime.lib')) ([System.IO.Path]::Combine($x64_static_runtime_path_new, 'onnxruntime.lib')) Copy-Item ([System.IO.Path]::Combine($x64_static_runtime_path_old, 'microsoft.ai.machinelearning.dll')) ([System.IO.Path]::Combine($x64_static_runtime_path_new, 'microsoft.ai.machinelearning.dll')) @@ -261,11 +218,6 @@ extends: Copy-Item ([System.IO.Path]::Combine($arm64_static_runtime_path_old, 'microsoft.ai.machinelearning.dll')) ([System.IO.Path]::Combine($arm64_static_runtime_path_new, 'microsoft.ai.machinelearning.dll')) Copy-Item ([System.IO.Path]::Combine($arm64_static_runtime_path_old, 'microsoft.ai.machinelearning.lib')) ([System.IO.Path]::Combine($arm64_static_runtime_path_new, 'microsoft.ai.machinelearning.lib')) - Copy-Item ([System.IO.Path]::Combine($arm_static_runtime_path_old, 'onnxruntime.dll')) ([System.IO.Path]::Combine($arm_static_runtime_path_new, 'onnxruntime.dll')) - Copy-Item ([System.IO.Path]::Combine($arm_static_runtime_path_old, 'onnxruntime.lib')) ([System.IO.Path]::Combine($arm_static_runtime_path_new, 'onnxruntime.lib')) - Copy-Item ([System.IO.Path]::Combine($arm_static_runtime_path_old, 'microsoft.ai.machinelearning.dll')) ([System.IO.Path]::Combine($arm_static_runtime_path_new, 'microsoft.ai.machinelearning.dll')) - Copy-Item ([System.IO.Path]::Combine($arm_static_runtime_path_old, 'microsoft.ai.machinelearning.lib')) ([System.IO.Path]::Combine($arm_static_runtime_path_new, 'microsoft.ai.machinelearning.lib')) - Copy-Item -Recurse $uap_build_path_old $uap_build_path_new 
$merged_nuget_path = [System.IO.Path]::Combine($Env:BUILD_ARTIFACTSTAGINGDIRECTORY, 'merged') @@ -304,22 +256,13 @@ extends: $arm64_nupkg_unzipped_directory = [System.IO.Path]::Combine($arm64_nupkg_unzipped_directory_root, 'symbols', [System.IO.Path]::GetFileNameWithoutExtension($arm64_nuget_package)) [System.IO.Compression.ZipFile]::ExtractToDirectory($arm64_nuget_package, $arm64_nupkg_unzipped_directory) - $nupkgs = (Get-ChildItem ..\nuget-artifact-arm -Filter Microsoft.AI.MachineLearning*.snupkg -Recurse) - $arm_nuget_package = $nupkgs[0].FullName - $arm_nupkg_unzipped_directory_root = $nupkgs[0].Directory.FullName - $arm_nupkg_unzipped_directory = [System.IO.Path]::Combine($arm_nupkg_unzipped_directory_root, 'symbols', [System.IO.Path]::GetFileNameWithoutExtension($arm_nuget_package)) - [System.IO.Compression.ZipFile]::ExtractToDirectory($arm_nuget_package, $arm_nupkg_unzipped_directory) - $x86_runtime_path_old = [System.IO.Path]::Combine($x86_nupkg_unzipped_directory, 'runtimes', 'win-x86', '_native') $x86_runtime_path_new = [System.IO.Path]::Combine($x64_nupkg_unzipped_directory, 'runtimes', 'win-x86', '_native') $arm64_runtime_path_old = [System.IO.Path]::Combine($arm64_nupkg_unzipped_directory, 'runtimes', 'win-arm64', '_native') $arm64_runtime_path_new = [System.IO.Path]::Combine($x64_nupkg_unzipped_directory, 'runtimes', 'win-arm64', '_native') - $arm_runtime_path_old = [System.IO.Path]::Combine($arm_nupkg_unzipped_directory, 'runtimes', 'win-arm', '_native') - $arm_runtime_path_new = [System.IO.Path]::Combine($x64_nupkg_unzipped_directory, 'runtimes', 'win-arm', '_native') - + New-Item -Path $x86_runtime_path_new -ItemType Directory New-Item -Path $arm64_runtime_path_new -ItemType Directory - New-Item -Path $arm_runtime_path_new -ItemType Directory Copy-Item ([System.IO.Path]::Combine($x86_runtime_path_old, 'onnxruntime.pdb')) $x86_runtime_path_new Copy-Item ([System.IO.Path]::Combine($x86_runtime_path_old, 'microsoft.ai.machinelearning.pdb')) 
$x86_runtime_path_new @@ -327,9 +270,6 @@ extends: Copy-Item ([System.IO.Path]::Combine($arm64_runtime_path_old, 'onnxruntime.pdb')) $arm64_runtime_path_new Copy-Item ([System.IO.Path]::Combine($arm64_runtime_path_old, 'microsoft.ai.machinelearning.pdb')) $arm64_runtime_path_new - Copy-Item ([System.IO.Path]::Combine($arm_runtime_path_old, 'onnxruntime.pdb')) $arm_runtime_path_new - Copy-Item ([System.IO.Path]::Combine($arm_runtime_path_old, 'microsoft.ai.machinelearning.pdb')) $arm_runtime_path_new - $merged_nuget_path = [System.IO.Path]::Combine($Env:BUILD_ARTIFACTSTAGINGDIRECTORY, 'merged') if (!(Test-Path $merged_nuget_path)) { New-Item -Path $merged_nuget_path -ItemType Directory diff --git a/.vscode/settings.json b/.vscode/settings.json index 2f2adc78f6de9..3e2b1f31dd6cf 100644 --- a/.vscode/settings.json +++ b/.vscode/settings.json @@ -11,7 +11,7 @@ // Auto sort imports "editor.formatOnSave": true, "editor.codeActionsOnSave": { - "source.organizeImports": true + "source.organizeImports": "explicit" }, "editor.defaultFormatter": "ms-python.black-formatter" }, diff --git a/cmake/CMakeLists.txt b/cmake/CMakeLists.txt index 34355fb0fd936..1567da90cacfc 100644 --- a/cmake/CMakeLists.txt +++ b/cmake/CMakeLists.txt @@ -131,6 +131,7 @@ option(onnxruntime_USE_ACL_1902 "Build with ACL version 1902 support" OFF) option(onnxruntime_USE_ACL_1905 "Build with ACL version 1905 support" OFF) option(onnxruntime_USE_ACL_1908 "Build with ACL version 1908 support" OFF) option(onnxruntime_USE_ACL_2002 "Build with ACL version 2002 support" OFF) +option(onnxruntime_USE_ACL_2308 "Build with ACL version 2308 support" OFF) option(onnxruntime_USE_ARMNN "Build with ArmNN support" OFF) option(onnxruntime_ARMNN_RELU_USE_CPU "Use the CPU implementation for the Relu operator for the ArmNN EP" ON) option(onnxruntime_ARMNN_BN_USE_CPU "Use the CPU implementation for the Batch Normalization operator for the ArmNN EP" ON) @@ -354,13 +355,7 @@ if (onnxruntime_USE_ROCM) endif() endif() -if (APPLE) 
- if (NOT CMAKE_OSX_ARCHITECTURES) - message("Building ONNX Runtime for ${CMAKE_HOST_SYSTEM_PROCESSOR}") - endif() -elseif (NOT WIN32 AND NOT APPLE) - message("Building ONNX Runtime for ${CMAKE_SYSTEM_PROCESSOR}") -endif() + # Single output director for all binaries set(RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin CACHE PATH "Single output directory for all binaries.") @@ -493,6 +488,14 @@ endif() include(adjust_global_compile_flags.cmake) +if (APPLE) + if (NOT CMAKE_OSX_ARCHITECTURES) + message("Building ONNX Runtime for ${CMAKE_HOST_SYSTEM_PROCESSOR} CPU ARCH") + endif() +elseif (NOT WIN32 AND NOT APPLE) + message("Building ONNX Runtime for ${onnxruntime_target_platform} CPU ARCH") +endif() + # We need to link with libatomic on systems that do not have built-in atomics, or # don't have built-in support for 8 byte atomics # Derived from https://github.com/protocolbuffers/protobuf/blob/master/cmake/CMakeLists.txt @@ -639,7 +642,16 @@ else() check_cxx_compiler_flag(-Wunused-variable HAS_UNUSED_VARIABLE) check_cxx_compiler_flag(-Wuseless-cast HAS_USELESS_CAST) check_function_exists(reallocarray HAS_REALLOCARRAY) - + if (NOT APPLE AND NOT CMAKE_SYSTEM_NAME STREQUAL "Emscripten" AND onnxruntime_target_platform STREQUAL "aarch64") + check_cxx_compiler_flag(-march=armv8.2-a+bf16 HAS_ARM64_BFLOAT16) + if(NOT HAS_ARM64_BFLOAT16) + message(FATAL_ERROR "The compiler doesn't support BFLOAT16!!!") + endif() + check_cxx_compiler_flag(-march=armv8.2-a+fp16 HAS_ARM64_FLOAT16) + if(NOT HAS_ARM64_FLOAT16) + message(FATAL_ERROR "The compiler doesn't support FLOAT16!!!") + endif() + endif() if (HAS_TAUTOLOGICAL_POINTER_COMPARE) #we may have extra null pointer checkings in debug build, it's not an issue list(APPEND ORT_WARNING_FLAGS -Wno-tautological-pointer-compare) @@ -1099,7 +1111,7 @@ function(onnxruntime_add_include_to_target dst_target) endfunction() # ACL -if (onnxruntime_USE_ACL OR onnxruntime_USE_ACL_1902 OR onnxruntime_USE_ACL_1905 OR onnxruntime_USE_ACL_1908 OR 
onnxruntime_USE_ACL_2002) +if (onnxruntime_USE_ACL OR onnxruntime_USE_ACL_1902 OR onnxruntime_USE_ACL_1905 OR onnxruntime_USE_ACL_1908 OR onnxruntime_USE_ACL_2002 OR onnxruntime_USE_ACL_2308) set(onnxruntime_USE_ACL ON) if (onnxruntime_USE_ACL_1902) add_definitions(-DACL_1902=1) @@ -1110,7 +1122,11 @@ if (onnxruntime_USE_ACL OR onnxruntime_USE_ACL_1902 OR onnxruntime_USE_ACL_1905 if (onnxruntime_USE_ACL_2002) add_definitions(-DACL_2002=1) else() - add_definitions(-DACL_1905=1) + if (onnxruntime_USE_ACL_2308) + add_definitions(-DACL_2308=1) + else() + add_definitions(-DACL_1905=1) + endif() endif() endif() endif() diff --git a/cmake/adjust_global_compile_flags.cmake b/cmake/adjust_global_compile_flags.cmake index e825bfeaea952..9f00c873715f4 100644 --- a/cmake/adjust_global_compile_flags.cmake +++ b/cmake/adjust_global_compile_flags.cmake @@ -300,6 +300,31 @@ if (MSVC) endif() else() if (NOT APPLE) + #XXX: Sometimes the value of CMAKE_SYSTEM_PROCESSOR is set but it's wrong. For example, if you run an armv7 docker + #image on an aarch64 machine with an aarch64 Ubuntu host OS, in the docker instance cmake may still report + # CMAKE_SYSTEM_PROCESSOR as aarch64 by default. Given compiling this code may need more than 2GB memory, we do not + # support compiling for ARM32 natively(only support cross-compiling), we will ignore this issue for now. + if(NOT CMAKE_SYSTEM_PROCESSOR) + message(WARNING "CMAKE_SYSTEM_PROCESSOR is not set. 
Please set it in your toolchain cmake file.") + # Try to detect it + if("${CMAKE_C_COMPILER_ID}" STREQUAL "GNU" OR "${CMAKE_C_COMPILER_ID}" STREQUAL "Clang") + execute_process( + COMMAND "${CMAKE_C_COMPILER}" -dumpmachine + OUTPUT_VARIABLE GCC_DUMP_MACHINE_OUT OUTPUT_STRIP_TRAILING_WHITESPACE + ERROR_VARIABLE _err + RESULT_VARIABLE _res + ) + if(NOT _res EQUAL 0) + message(SEND_ERROR "Failed to run 'gcc -dumpmachine':\n ${_res}") + endif() + string(REPLACE "-" ";" GCC_DUMP_MACHINE_OUT_LIST "${GCC_DUMP_MACHINE_OUT}") + list(LENGTH GCC_DUMP_MACHINE_OUT_LIST GCC_TRIPLET_LEN) + if(GCC_TRIPLET_LEN EQUAL 4) + list(GET GCC_DUMP_MACHINE_OUT_LIST 0 CMAKE_SYSTEM_PROCESSOR) + message("Setting CMAKE_SYSTEM_PROCESSOR to ${CMAKE_SYSTEM_PROCESSOR}") + endif() + endif() + endif() set(onnxruntime_target_platform ${CMAKE_SYSTEM_PROCESSOR}) endif() if (onnxruntime_BUILD_FOR_NATIVE_MACHINE) diff --git a/cmake/onnxruntime_optimizer.cmake b/cmake/onnxruntime_optimizer.cmake index 6f09583199ffd..f15d5b8dd6f80 100644 --- a/cmake/onnxruntime_optimizer.cmake +++ b/cmake/onnxruntime_optimizer.cmake @@ -130,3 +130,7 @@ if (NOT onnxruntime_BUILD_SHARED_LIB) RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR} FRAMEWORK DESTINATION ${CMAKE_INSTALL_BINDIR}) endif() + +if (onnxruntime_USE_ROCM) + add_dependencies(onnxruntime_optimizer generate_hipified_files) +endif() diff --git a/docs/OperatorKernels.md b/docs/OperatorKernels.md index b026369e12c80..5e38789b65137 100644 --- a/docs/OperatorKernels.md +++ b/docs/OperatorKernels.md @@ -383,6 +383,7 @@ Do not modify directly.* |Squeeze|*in* data:**T**
*in* axes:**tensor(int64)**
*out* squeezed:**T**

or

*in* data:**T**
*out* squeezed:**T**|13+|**T** = tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(string), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| |||[11, 12]|**T** = tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(string), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| |||[1, 10]|**T** = tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(string), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| +|StringConcat|*in* X:**T**
*in* Y:**T**
*out* Z:**T**|20+|**T** = tensor(string)| |StringNormalizer|*in* X:**tensor(string)**
*out* Y:**tensor(string)**|10+|**X** = tensor(string)| |Sub|*in* A:**T**
*in* B:**T**
*out* C:**T**|14+|**T** = tensor(double), tensor(float), tensor(int32), tensor(int64)| |||13|**T** = tensor(double), tensor(float), tensor(int32), tensor(int64)| diff --git a/include/onnxruntime/core/session/onnxruntime_c_api.h b/include/onnxruntime/core/session/onnxruntime_c_api.h index 06fef6bf72cc9..8cd0d0051d1eb 100644 --- a/include/onnxruntime/core/session/onnxruntime_c_api.h +++ b/include/onnxruntime/core/session/onnxruntime_c_api.h @@ -4528,6 +4528,19 @@ struct OrtApi { * \since Version 1.17. */ ORT_API2_STATUS(SetDeterministicCompute, _Inout_ OrtSessionOptions* options, bool value); + + /** + * Run fn in parallel + * + * \param[in] context + * \param[in] fn Function accepting usr_data and an integer as iterator + * \param[in] total The number of times fn is to be invoked + * \param[in] num_batch Number of batches by which the "total" is to be divided in maximum. When zero, there is no limit + * \param[in] usr_data User data to be passed back to fn + * + * \since Version 1.17. + */ + ORT_API2_STATUS(KernelContext_ParallelFor, _In_ const OrtKernelContext* context, _In_ void (*fn)(void*, size_t), _In_ size_t total, _In_ size_t num_batch, _In_ void* usr_data); }; /* diff --git a/include/onnxruntime/core/session/onnxruntime_cxx_api.h b/include/onnxruntime/core/session/onnxruntime_cxx_api.h index 16d9451624533..3773a01cb65a8 100644 --- a/include/onnxruntime/core/session/onnxruntime_cxx_api.h +++ b/include/onnxruntime/core/session/onnxruntime_cxx_api.h @@ -2057,6 +2057,7 @@ struct KernelContext { Logger GetLogger() const; OrtAllocator* GetAllocator(const OrtMemoryInfo& memory_info) const; OrtKernelContext* GetOrtKernelContext() const { return ctx_; } + void ParallelFor(void (*fn)(void*, size_t), size_t total, size_t num_batch, void* usr_data) const; private: OrtKernelContext* ctx_; diff --git a/include/onnxruntime/core/session/onnxruntime_cxx_inline.h b/include/onnxruntime/core/session/onnxruntime_cxx_inline.h index 63e55603736b6..db4619eeeae62 100644 --- 
a/include/onnxruntime/core/session/onnxruntime_cxx_inline.h +++ b/include/onnxruntime/core/session/onnxruntime_cxx_inline.h @@ -1658,6 +1658,10 @@ inline Logger KernelContext::GetLogger() const { return Logger{out}; } +inline void KernelContext::ParallelFor(void (*fn)(void*, size_t), size_t total, size_t num_batch, void* usr_data) const { + ThrowOnError(GetApi().KernelContext_ParallelFor(ctx_, fn, total, num_batch, usr_data)); +} + inline OpAttr::OpAttr(const char* name, const void* data, int len, OrtOpAttrType type) { Ort::ThrowOnError(GetApi().CreateOpAttr(name, data, len, type, &p_)); } diff --git a/js/node/package-lock.json b/js/node/package-lock.json index c1cf8af4bb80e..542eebe746d59 100644 --- a/js/node/package-lock.json +++ b/js/node/package-lock.json @@ -336,9 +336,9 @@ "dev": true }, "node_modules/follow-redirects": { - "version": "1.15.2", - "resolved": "https://registry.npmjs.org/follow-redirects/-/follow-redirects-1.15.2.tgz", - "integrity": "sha512-VQLG33o04KaQ8uYi2tVNbdrWp1QWxNNea+nmIB4EVM28v0hmP17z7aG1+wAkNzVq4KeXTq3221ye5qTJP91JwA==", + "version": "1.15.4", + "resolved": "https://registry.npmjs.org/follow-redirects/-/follow-redirects-1.15.4.tgz", + "integrity": "sha512-Cr4D/5wlrb0z9dgERpUL3LrmPKVDsETIJhaCMeDfuFYcqa5bldGV6wBsAN6X/vxlXQtFBMrXdXxdL8CbDTGniw==", "dev": true, "funding": [ { @@ -1242,9 +1242,9 @@ "dev": true }, "follow-redirects": { - "version": "1.15.2", - "resolved": "https://registry.npmjs.org/follow-redirects/-/follow-redirects-1.15.2.tgz", - "integrity": "sha512-VQLG33o04KaQ8uYi2tVNbdrWp1QWxNNea+nmIB4EVM28v0hmP17z7aG1+wAkNzVq4KeXTq3221ye5qTJP91JwA==", + "version": "1.15.4", + "resolved": "https://registry.npmjs.org/follow-redirects/-/follow-redirects-1.15.4.tgz", + "integrity": "sha512-Cr4D/5wlrb0z9dgERpUL3LrmPKVDsETIJhaCMeDfuFYcqa5bldGV6wBsAN6X/vxlXQtFBMrXdXxdL8CbDTGniw==", "dev": true }, "form-data": { diff --git a/js/web/lib/build-def.d.ts b/js/web/lib/build-def.d.ts index fb714bf5996f1..b3868871a4753 100644 --- 
a/js/web/lib/build-def.d.ts +++ b/js/web/lib/build-def.d.ts @@ -18,6 +18,10 @@ interface BuildDefinitions { * defines whether to disable the whole WebGpu backend in the build. */ readonly DISABLE_WEBGPU: boolean; + /** + * defines whether to disable the whole WebNN backend in the build. + */ + readonly DISABLE_WEBNN: boolean; /** * defines whether to disable the whole WebAssembly backend in the build. */ diff --git a/js/web/lib/index.ts b/js/web/lib/index.ts index 499327741c82b..4f1a3943de69a 100644 --- a/js/web/lib/index.ts +++ b/js/web/lib/index.ts @@ -28,7 +28,9 @@ if (!BUILD_DEFS.DISABLE_WASM) { registerBackend('wasm', wasmBackend, 10); if (BUILD_DEFS.DISABLE_TRAINING) { registerBackend('xnnpack', wasmBackend, 9); - registerBackend('webnn', wasmBackend, 9); + if (!BUILD_DEFS.DISABLE_WEBNN) { + registerBackend('webnn', wasmBackend, 9); + } } } diff --git a/js/web/lib/wasm/jsep/webgpu/op-resolve-rules.ts b/js/web/lib/wasm/jsep/webgpu/op-resolve-rules.ts index 8e1ec782079be..90e02da986b8f 100644 --- a/js/web/lib/wasm/jsep/webgpu/op-resolve-rules.ts +++ b/js/web/lib/wasm/jsep/webgpu/op-resolve-rules.ts @@ -2,7 +2,7 @@ // Licensed under the MIT License. 
import {argMax, argMin, parseArgMinMaxAttributes} from './ops/argminmax'; -import {attention, parseAttentionAttributes} from './ops/attention'; +import {attention} from './ops/attention'; import {batchNorm} from './ops/batch-norm'; import {biasAdd} from './ops/bias-add'; import {biasSplitGelu} from './ops/bias-split-gelu'; @@ -16,11 +16,11 @@ import {expand} from './ops/expand'; import {gather, parseGatherAttributes} from './ops/gather'; import {gatherElements, parseGatherElementsAttributes} from './ops/gather-elements'; import {gemm, parseGemmAttributes} from './ops/gemm'; -import {instanceNorm, parseInstanceNormAttributes} from './ops/instance-norm'; -import {layerNorm, parseLayerNormAttributes} from './ops/layer-norm'; +import {instanceNorm} from './ops/instance-norm'; +import {layerNorm} from './ops/layer-norm'; import {matMul} from './ops/matmul'; import {multiHeadAttention, parseMultiHeadAttentionAttributes} from './ops/multi-head-attentiion'; -import {pad, parsePadAttributes} from './ops/pad'; +import {pad} from './ops/pad'; import * as pool from './ops/pool'; import {range} from './ops/range'; import {reduceL1, reduceL2, reduceLogSum, reduceLogSumExp, reduceMax, reduceMean, reduceMin, reduceProd, reduceSum, reduceSumSquare} from './ops/reduce'; @@ -50,7 +50,7 @@ export const WEBGPU_OP_RESOLVE_RULES: Map = new ['Asinh', [unaryOps.asinh]], ['Atan', [unaryOps.atan]], ['Atanh', [unaryOps.atanh]], - ['Attention', [attention, parseAttentionAttributes]], + ['Attention', [attention]], // TODO: support new attributes for AveragePool-10 ['AveragePool', [pool.averagePool, pool.parseAveragePoolAttributes]], ['BatchNormalization', [batchNorm]], @@ -82,8 +82,8 @@ export const WEBGPU_OP_RESOLVE_RULES: Map = new ['GlobalMaxPool', [pool.globalMaxPool, pool.parseGlobalMaxPoolAttributes]], ['Greater', [binaryOps.greater]], ['GreaterOrEqual', [binaryOps.greaterOrEqual]], - ['InstanceNormalization', [instanceNorm, parseInstanceNormAttributes]], - ['LayerNormalization', 
[layerNorm, parseLayerNormAttributes]], + ['InstanceNormalization', [instanceNorm]], + ['LayerNormalization', [layerNorm]], ['LeakyRelu', [unaryOps.leakyRelu, unaryOps.parseAlphaAttributes]], ['Less', [binaryOps.less]], ['LessOrEqual', [binaryOps.lessOrEqual]], @@ -95,7 +95,7 @@ export const WEBGPU_OP_RESOLVE_RULES: Map = new ['MultiHeadAttention', [multiHeadAttention, parseMultiHeadAttentionAttributes]], ['Neg', [unaryOps.neg]], ['Not', [unaryOps.not]], - ['Pad', [pad, parsePadAttributes]], + ['Pad', [pad]], ['Pow', [binaryOps.pow]], ['Range', [range]], ['Reciprocal', [unaryOps.reciprocal]], diff --git a/js/web/lib/wasm/jsep/webgpu/ops/attention.ts b/js/web/lib/wasm/jsep/webgpu/ops/attention.ts index e1f2a47301bfb..ef8038dff487e 100644 --- a/js/web/lib/wasm/jsep/webgpu/ops/attention.ts +++ b/js/web/lib/wasm/jsep/webgpu/ops/attention.ts @@ -1,11 +1,11 @@ // Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. +import {tensorDataTypeEnumToString} from '../../../wasm-common'; import {TensorView} from '../../tensor-view'; -import {createAttributeWithCacheKey} from '../attribute-with-cache-key'; -import {ComputeContext, GpuDataType} from '../types'; +import {ComputeContext, GpuDataType, ProgramUniform} from '../types'; -import {castToF32, fillVector, getMaxComponents, inputVariable, outputVariable, ShaderHelper, sumVector, tensorTypeToWsglStorageType} from './common'; +import {castToF32, fillVector, getMaxComponents, inputVariable, outputVariable, ShaderHelper, sumVector, tensorTypeToWsglStorageType, tensorTypeToWsglValueType, UniformDataElementType, UniformsArrayType} from './common'; export const enum AttentionQkvFormat { unknown, // enum value not set, or depends on qkv projection implementation details @@ -231,20 +231,8 @@ const validateAttentionInputs = (inputs: readonly TensorView[], attributes: Atte }; }; -export const parseAttentionAttributes = (attributes: AttentionAttrs): AttentionAttrs => - 
createAttributeWithCacheKey({...attributes}); - export const computeInPlaceSoftmax = (context: ComputeContext, input: TensorView, n: number, d: number) => { const components = getMaxComponents(d); - const inputHelper = outputVariable('x', input.dataType, input.dims, components); - - let threadMaxValue = 'threadMaxVector'; - if (components === 2) { - threadMaxValue = 'max(threadMaxVector.x, threadMaxVector.y)'; - } else if (components === 4) { - threadMaxValue = 'max(max(threadMaxVector.x, threadMaxVector.y), max(threadMaxVector.z, threadMaxVector.w))'; - } - const dataType = tensorTypeToWsglStorageType(input.dataType); let WG = 64; const dComp = d / components; if (dComp < WG) { @@ -253,25 +241,41 @@ export const computeInPlaceSoftmax = (context: ComputeContext, input: TensorView WG = Math.ceil(dComp / 8); } const elementsPerWG = Math.ceil(d / components / WG); + const tensorDataType = tensorDataTypeEnumToString(input.dataType) as ProgramUniform['type']; + const programUniforms: ProgramUniform[] = + [{type: tensorDataType, data: 1 / d}, {type: 'uint32', data: dComp}, {type: 'uint32', data: elementsPerWG}]; + const dataType = tensorTypeToWsglStorageType(input.dataType, components); + + const getShaderSource = (shaderHelper: ShaderHelper) => { + const inputHelper = outputVariable('x', input.dataType, input.dims, components); + let threadMaxValue = 'thread_max_vector'; + if (components === 2) { + threadMaxValue = 'max(thread_max_vector.x, thread_max_vector.y)'; + } else if (components === 4) { + threadMaxValue = + 'max(max(thread_max_vector.x, thread_max_vector.y), max(thread_max_vector.z, thread_max_vector.w))'; + } + const elemValueType = tensorTypeToWsglValueType(input.dataType); + const uniforms: UniformsArrayType = [ + {name: 'd_inv', type: elemValueType as UniformDataElementType}, {name: 'd_comp', type: 'u32'}, + {name: 'elements_per_wg', type: 'u32'} + ]; - const getShaderSource = (shaderHelper: ShaderHelper) => ` - const dInv: ${dataType} = 1 / ${d}; - const 
dComp = ${d / components}; + return ` var wgMax: array; var wgSum: array; - - ${shaderHelper.declareVariables(inputHelper)} - @compute @workgroup_size(${WG}, 1, 1) - fn main(@builtin(workgroup_id) workgroup_id : vec3, - @builtin(local_invocation_index) local_index : u32) { - let localOffset = local_index * ${elementsPerWG}; - let offset: u32 = workgroup_id.x * dComp + localOffset; - - var threadMaxVector = ${fillVector('f32', components, '-3.402823e+38f')}; - for (var i: u32 = 0; i < ${elementsPerWG} && i + localOffset < dComp; i++) { - threadMaxVector = max(${castToF32(dataType, components, 'x[offset + i]')}, threadMaxVector); + ${shaderHelper.registerUniforms(uniforms).declareVariables(inputHelper)} + ${shaderHelper.mainStart([ + WG, 1, 1 + ])} + let localOffset = local_idx * uniforms.elements_per_wg; + let offset: u32 = workgroup_id.x * uniforms.d_comp + localOffset; + + var thread_max_vector = ${fillVector('f32', components, '-3.402823e+38f')}; + for (var i: u32 = 0; i < uniforms.elements_per_wg && i + localOffset < uniforms.d_comp; i++) { + thread_max_vector = max(${castToF32(elemValueType, components, 'x[offset + i]')}, thread_max_vector); } - wgMax[local_index] = ${threadMaxValue}; + wgMax[local_idx] = ${threadMaxValue}; workgroupBarrier(); var maxValue = -3.402823e+38f; @@ -280,10 +284,10 @@ export const computeInPlaceSoftmax = (context: ComputeContext, input: TensorView } var sumVector = ${fillVector('f32', components, '0')}; - for (var i: u32 = 0; i < ${elementsPerWG} && i + localOffset < dComp; i++) { - sumVector += exp(${castToF32(dataType, components, 'x[offset + i]')} - maxValue); + for (var i: u32 = 0; i < uniforms.elements_per_wg && i + localOffset < uniforms.d_comp; i++) { + sumVector += exp(${castToF32(elemValueType, components, 'x[offset + i]')} - maxValue); } - wgSum[local_index] = ${sumVector('sumVector', components)}; + wgSum[local_idx] = ${sumVector('sumVector', components)}; workgroupBarrier(); var sum: f32 = 0; @@ -292,26 +296,24 @@ export 
const computeInPlaceSoftmax = (context: ComputeContext, input: TensorView } if (sum == 0) { - for (var i: u32 = 0; i < ${elementsPerWG} && i + localOffset < dComp; i++) { - x[offset + i] = ${fillVector(dataType, components, 'dInv')}; + for (var i: u32 = 0; i < uniforms.elements_per_wg && i + localOffset < uniforms.d_comp; i++) { + x[offset + i] = ${inputHelper.type.value}(uniforms.d_inv); /* fill in x's own element type: an f32 value cannot be stored into an f16 buffer */ } } else { - for (var i: u32 = 0; i < ${elementsPerWG} && i + localOffset < dComp; i++) { - let f32input = ${castToF32(dataType, components, 'x[offset + i]')}; + for (var i: u32 = 0; i < uniforms.elements_per_wg && i + localOffset < uniforms.d_comp; i++) { + let f32input = ${castToF32(elemValueType, components, 'x[offset + i]')}; x[offset + i] = ${inputHelper.type.value}(exp(f32input - maxValue) / sum); } } }`; + }; context.compute( { name: 'AttentionProbsSoftmax', - shaderCache: {hint: `${d}`}, + shaderCache: {hint: `${WG};${dataType};${components}`}, getShaderSource, - getRunData: () => ({ - outputs: [], - dispatchGroup: {x: n}, - }), + getRunData: () => ({outputs: [], dispatchGroup: {x: n}, programUniforms}), }, {inputs: [input], outputs: []}); }; @@ -326,47 +328,43 @@ const computeAttentionProbs = // TODO: handle mask const alpha = attributes.scale === 0 ? 
1.0 / Math.sqrt(parameters.headSize) : attributes.scale; - - const dataType = tensorTypeToWsglStorageType(q.dataType); - const components = getMaxComponents(parameters.headSize); - const qInput = inputVariable('q', q.dataType, q.dims, components); - const kInput = inputVariable('key', key.dataType, key.dims, components); - const output = outputVariable('output', q.dataType, probsShape); - const vectorizedHeadSize = parameters.headSize / components; - const M = parameters.sequenceLength; - const N = parameters.totalSequenceLength; - const K = vectorizedHeadSize; - const TILE_SIZE = 12; - const dispatch = { x: Math.ceil(parameters.totalSequenceLength / TILE_SIZE), y: Math.ceil(parameters.sequenceLength / TILE_SIZE), z: parameters.batchSize * parameters.numHeads }; + const tensorDataType = tensorDataTypeEnumToString(q.dataType) as ProgramUniform['type']; + const programUniforms: ProgramUniform[] = [ + {type: 'uint32', data: parameters.sequenceLength}, {type: 'uint32', data: vectorizedHeadSize}, + {type: 'uint32', data: parameters.totalSequenceLength}, {type: 'uint32', data: parameters.kvSequenceLength}, + {type: tensorDataType, data: alpha} + ]; const inputs = [q, key]; - const getShaderSource = (shaderHelper: ShaderHelper) => ` - const M: u32 = ${M}u; - const N: u32 = ${N}u; - const K: u32 = ${K}u; - const alpha: ${dataType} = ${alpha}; + + const getShaderSource = (shaderHelper: ShaderHelper) => { + const qInput = inputVariable('q', q.dataType, q.dims, components); + const kInput = inputVariable('key', key.dataType, key.dims, components); + const output = outputVariable('output', q.dataType, probsShape); + const dataType = tensorTypeToWsglStorageType(q.dataType); + + const uniforms: UniformsArrayType = [ + {name: 'M', type: 'u32'}, {name: 'K', type: 'u32'}, {name: 'N', type: 'u32'}, + {name: 'kv_sequence_length', type: 'u32'}, {name: 'alpha', type: dataType as UniformDataElementType} + ]; + return ` const beta: ${dataType} = 1.0; const TILE_SIZE = ${TILE_SIZE}u; var 
tileQ: array<${qInput.type.storage}, ${TILE_SIZE * TILE_SIZE}>; var tileK: array<${qInput.type.storage}, ${TILE_SIZE * TILE_SIZE}>; - - ${shaderHelper.declareVariables(qInput, kInput, output)} - - @compute @workgroup_size(${TILE_SIZE}, ${TILE_SIZE}, 1) - fn main(@builtin(workgroup_id) workgroup_id : vec3, - @builtin(local_invocation_id) local_id : vec3, @builtin(local_invocation_index) local_index : u32) { - let global_idx = (workgroup_id.z * ${dispatch.x * dispatch.y}u + - workgroup_id.y * ${dispatch.x}u + workgroup_id.x) * ${TILE_SIZE * TILE_SIZE}u + local_index; - + ${shaderHelper.registerUniforms(uniforms).declareVariables(qInput, kInput, output)} + ${shaderHelper.mainStart([ + TILE_SIZE, TILE_SIZE, 1 + ])} // x holds the N and y holds the M let headIdx = workgroup_id.z; let m = workgroup_id.y * TILE_SIZE; @@ -374,40 +372,42 @@ const computeAttentionProbs = let lm = m + local_id.y; let ln = n + local_id.x; - let qOffset = ${parameters.sequenceLength * vectorizedHeadSize} * headIdx + m * K; - let kOffset = ${parameters.kvSequenceLength * vectorizedHeadSize} * headIdx + n * K; + let qOffset = uniforms.M * uniforms.K * headIdx + m * uniforms.K; + let kOffset = uniforms.kv_sequence_length * uniforms.K * headIdx + n * uniforms.K; var value = ${fillVector(dataType, components)}; - for (var w: u32 = 0u; w < K; w += TILE_SIZE) { - if (m + local_id.y < M && w + local_id.x < K) { - tileQ[TILE_SIZE * local_id.y + local_id.x] = q[qOffset + local_id.y * K + w + local_id.x]; + for (var w: u32 = 0u; w < uniforms.K; w += TILE_SIZE) { + if (m + local_id.y < uniforms.M && w + local_id.x < uniforms.K) { + tileQ[TILE_SIZE * local_id.y + local_id.x] = q[qOffset + local_id.y * uniforms.K + w + local_id.x]; } - if (n + local_id.y < N && w + local_id.x < K) { - tileK[TILE_SIZE * local_id.y + local_id.x] = key[kOffset + local_id.y * K + w + local_id.x]; + if (n + local_id.y < uniforms.N && w + local_id.x < uniforms.K) { + tileK[TILE_SIZE * local_id.y + local_id.x] = key[kOffset + 
local_id.y * uniforms.K + w + local_id.x]; } workgroupBarrier(); - for (var k: u32 = 0u; k ({ outputs: [{dims: probsShape, dataType: q.dataType, gpuDataType: GpuDataType.default}], dispatchGroup: dispatch, + programUniforms }), getShaderSource, }, @@ -423,78 +423,76 @@ const computeAttentionProbs = const computeVxAttentionScore = (context: ComputeContext, probs: TensorView, v: TensorView, params: AttentionParameters) => { const outputShape = [params.batchSize, params.sequenceLength, params.vHiddenSize]; - - const probsHelper = inputVariable('probs', probs.dataType, probs.dims); - const vHelper = inputVariable('v', v.dataType, v.dims); - const output = outputVariable('output', probs.dataType, outputShape); - - const dataType = tensorTypeToWsglStorageType(probs.dataType); - const TILE_SIZE = 12; const dispatch = { x: Math.ceil(params.vHeadSize / TILE_SIZE), y: Math.ceil(params.sequenceLength / TILE_SIZE), z: params.batchSize * params.numHeads }; + const programUniforms: ProgramUniform[] = [ + {type: 'uint32', data: params.sequenceLength}, {type: 'uint32', data: params.totalSequenceLength}, + {type: 'uint32', data: params.vHeadSize}, {type: 'uint32', data: params.numHeads}, + {type: 'uint32', data: params.vHiddenSize} + ]; - const getShaderSource = (shaderHelper: ShaderHelper) => ` - const M: u32 = ${params.sequenceLength}u; - const N: u32 = ${params.vHeadSize}u; - const K: u32 = ${params.totalSequenceLength}u; - const numHeads: u32 = ${params.numHeads}u; + const getShaderSource = (shaderHelper: ShaderHelper) => { + const probsHelper = inputVariable('probs', probs.dataType, probs.dims); + const vHelper = inputVariable('v', v.dataType, v.dims); + const output = outputVariable('output', probs.dataType, outputShape); + const uniforms: UniformsArrayType = [ + {name: 'M', type: 'u32'}, {name: 'K', type: 'u32'}, {name: 'N', type: 'u32'}, + {name: 'num_heads', type: 'u32'}, {name: 'v_hidden_size', type: 'u32'} + ]; + return ` const TILE_SIZE = ${TILE_SIZE}u; - - var tileQ: 
array<${probsHelper.type.storage}, ${TILE_SIZE * TILE_SIZE}>; - var tileK: array<${probsHelper.type.storage}, ${TILE_SIZE * TILE_SIZE}>; - - ${shaderHelper.declareVariables(probsHelper, vHelper, output)} - - @compute @workgroup_size(${TILE_SIZE}, ${TILE_SIZE}, 1) - fn main(@builtin(workgroup_id) workgroup_id : vec3, - @builtin(local_invocation_id) local_id : vec3, @builtin(local_invocation_index) local_index : u32) { - let global_idx = (workgroup_id.z * ${dispatch.x * dispatch.y}u + - workgroup_id.y * ${dispatch.x}u + workgroup_id.x) * ${TILE_SIZE * TILE_SIZE}u + local_index; - + var tileQ: array<${probsHelper.type.value}, ${TILE_SIZE * TILE_SIZE}>; + var tileK: array<${probsHelper.type.value}, ${TILE_SIZE * TILE_SIZE}>; + ${shaderHelper.registerUniforms(uniforms).declareVariables(probsHelper, vHelper, output)} + ${shaderHelper.mainStart([ + TILE_SIZE, TILE_SIZE, 1 + ])} let headIdx = workgroup_id.z; let m = workgroup_id.y * TILE_SIZE + local_id.y; let n = workgroup_id.x * TILE_SIZE + local_id.x; - let offsetA = headIdx * (M * K) + m * K; - let offsetB = headIdx * (N * K) + n; + let offsetA = headIdx * (uniforms.M * uniforms.K) + m * uniforms.K; + let offsetB = headIdx * (uniforms.N * uniforms.K) + n; - var value = ${dataType}(0); - for (var w: u32 = 0u; w < K; w += TILE_SIZE) { - if (m < M && w + local_id.x < K) { + var value = ${probsHelper.type.storage}(0); + for (var w: u32 = 0u; w < uniforms.K; w += TILE_SIZE) { + if (m < uniforms.M && w + local_id.x < uniforms.K) { tileQ[TILE_SIZE * local_id.y + local_id.x] = probs[offsetA + w + local_id.x]; } - if (n < N && w + local_id.y < K) { - tileK[TILE_SIZE * local_id.y + local_id.x] = v[offsetB + (w + local_id.y) * N]; + if (n < uniforms.N && w + local_id.y < uniforms.K) { + tileK[TILE_SIZE * local_id.y + local_id.x] = v[offsetB + (w + local_id.y) * uniforms.N]; } workgroupBarrier(); - for (var k: u32 = 0u; k ({ outputs: [{dims: outputShape, dataType: probs.dataType, gpuDataType: GpuDataType.default}], dispatchGroup: 
dispatch, + programUniforms }), getShaderSource, }, @@ -517,71 +515,71 @@ const prepare = (context: ComputeContext, parameters: AttentionParameters) => { parameters.sequenceLength, parameters.headSize, ]; - - const dataType = tensorTypeToWsglStorageType(context.inputs[0].dataType); - const M = parameters.sequenceLength; const K = parameters.inputHiddenSize; const N = parameters.headSize; - const TILE_SIZE = 12; const dispatch = { x: Math.ceil(parameters.headSize / TILE_SIZE), y: Math.ceil(parameters.sequenceLength / TILE_SIZE), z: parameters.batchSize * parameters.numHeads }; + const inputs = [context.inputs[0], context.inputs[1], context.inputs[2]]; + const programUniforms: ProgramUniform[] = [ + {type: 'uint32', data: M}, {type: 'uint32', data: K}, {type: 'uint32', data: N}, + {type: 'uint32', data: parameters.numHeads}, {type: 'uint32', data: parameters.headSize}, + {type: 'uint32', data: parameters.hiddenSize}, + {type: 'uint32', data: parameters.hiddenSize + parameters.hiddenSize + parameters.vHiddenSize} + ]; - const getShaderSource = () => ` - const M: u32 = ${M}u; - const K: u32 = ${K}u; - const N: u32 = ${N}u; - const numHeads: u32 = ${parameters.numHeads}; - const ldb = ${parameters.hiddenSize + parameters.hiddenSize + parameters.vHiddenSize}u; + const getShaderSource = (shaderHelper: ShaderHelper) => { + const outputQ = outputVariable('output_q', inputs[0].dataType, outputShape); + const outputK = outputVariable('output_k', inputs[0].dataType, outputShape); + const outputV = outputVariable('output_v', inputs[0].dataType, outputShape); + const input = inputVariable('input', inputs[0].dataType, inputs[0].dims); + const weight = inputVariable('weight', inputs[1].dataType, inputs[1].dims); + const bias = inputVariable('bias', inputs[2].dataType, inputs[2].dims); + const dataType = input.type.storage; + + const uniforms: UniformsArrayType = [ + {name: 'M', type: 'u32'}, {name: 'K', type: 'u32'}, {name: 'N', type: 'u32'}, {name: 'num_heads', type: 'u32'}, + 
{name: 'head_size', type: 'u32'}, {name: 'hidden_size', type: 'u32'}, {name: 'ldb', type: 'u32'} + ]; + return ` const TILE_SIZE = ${TILE_SIZE}u; - var tileInput: array<${dataType}, ${TILE_SIZE * TILE_SIZE}>; var tileWeightQ: array<${dataType}, ${TILE_SIZE * TILE_SIZE}>; var tileWeightK: array<${dataType}, ${TILE_SIZE * TILE_SIZE}>; var tileWeightV: array<${dataType}, ${TILE_SIZE * TILE_SIZE}>; - - @group(0) @binding(0) var input: array<${dataType}>; - @group(0) @binding(1) var weight: array<${dataType}>; - @group(0) @binding(2) var bias: array<${dataType}>; - @group(0) @binding(3) var outputQ: array<${dataType}>; - @group(0) @binding(4) var outputK: array<${dataType}>; - @group(0) @binding(5) var outputV: array<${dataType}>; - - @compute @workgroup_size(${TILE_SIZE}, ${TILE_SIZE}, 1) - fn main(@builtin(workgroup_id) workgroup_id : vec3, - @builtin(local_invocation_id) local_id : vec3, @builtin(local_invocation_index) local_index : u32) { - let global_idx = (workgroup_id.z * ${dispatch.x * dispatch.y}u + - workgroup_id.y * ${dispatch.x}u + workgroup_id.x) * ${TILE_SIZE * TILE_SIZE}u + local_index; - - let batchIndex = workgroup_id.z / ${parameters.numHeads}; - let headNumber = workgroup_id.z % ${parameters.numHeads}; + ${shaderHelper.registerUniforms(uniforms).declareVariables(input, weight, bias, outputQ, outputK, outputV)} + ${shaderHelper.mainStart([ + TILE_SIZE, TILE_SIZE, 1 + ])} + let batchIndex = workgroup_id.z / uniforms.num_heads; + let headNumber = workgroup_id.z % uniforms.num_heads; let m = workgroup_id.y * TILE_SIZE + local_id.y; let n = workgroup_id.x * TILE_SIZE + local_id.x; - let inputOffset = batchIndex * (M * K) + m * K; - let biasOffsetQ = headNumber * ${parameters.headSize}; - let biasOffsetK = ${parameters.hiddenSize} + biasOffsetQ; - let biasOffsetV = ${parameters.hiddenSize} + biasOffsetK; + let inputOffset = batchIndex * (uniforms.M * uniforms.K) + m * uniforms.K; + let biasOffsetQ = headNumber * uniforms.head_size; + let biasOffsetK = 
uniforms.hidden_size + biasOffsetQ; + let biasOffsetV = uniforms.hidden_size + biasOffsetK; var valueQ = ${dataType}(0); var valueK = ${dataType}(0); var valueV = ${dataType}(0); - for (var w: u32 = 0u; w < K; w += TILE_SIZE) { - if (m < M && w + local_id.x < K) { + for (var w: u32 = 0u; w < uniforms.K; w += TILE_SIZE) { + if (m < uniforms.M && w + local_id.x < uniforms.K) { tileInput[TILE_SIZE * local_id.y + local_id.x] = input[inputOffset + w + local_id.x]; } - if (n < N && w + local_id.y < K) { - let offset = n + (w + local_id.y) * ldb; + if (n < uniforms.N && w + local_id.y < uniforms.K) { + let offset = n + (w + local_id.y) * uniforms.ldb; tileWeightQ[TILE_SIZE * local_id.y + local_id.x] = weight[biasOffsetQ + offset]; tileWeightK[TILE_SIZE * local_id.y + local_id.x] = weight[biasOffsetK + offset]; tileWeightV[TILE_SIZE * local_id.y + local_id.x] = weight[biasOffsetV + offset]; } workgroupBarrier(); - for (var k: u32 = 0u; k { workgroupBarrier(); } - let headOffset = (m * N + n) % ${parameters.headSize}; + let headOffset = (m * uniforms.N + n) % uniforms.head_size; valueQ += bias[headOffset + biasOffsetQ]; valueK += bias[headOffset + biasOffsetK]; valueV += bias[headOffset + biasOffsetV]; - let offset = workgroup_id.z * M * N; - if (m < M && n < N) { - let outputIdx = offset + m * N + n; - outputQ[outputIdx] = valueQ; - outputK[outputIdx] = valueK; - outputV[outputIdx] = valueV; + let offset = workgroup_id.z * uniforms.M * uniforms.N; + if (m < uniforms.M && n < uniforms.N) { + let outputIdx = offset + m * uniforms.N + n; + output_q[outputIdx] = valueQ; + output_k[outputIdx] = valueK; + output_v[outputIdx] = valueV; } }`; - - const inputs = [context.inputs[0], context.inputs[1], context.inputs[2]]; + }; return context.compute( { name: 'AttentionPrepare', - shaderCache: {hint: JSON.stringify(parameters)}, + shaderCache: {inputDependencies: ['type', 'type', 'type']}, getRunData: () => ({ outputs: [ {dims: outputShape, dataType: context.inputs[0].dataType, 
gpuDataType: GpuDataType.default}, @@ -619,6 +616,7 @@ const prepare = (context: ComputeContext, parameters: AttentionParameters) => { {dims: outputShape, dataType: context.inputs[0].dataType, gpuDataType: GpuDataType.default}, ], dispatchGroup: dispatch, + programUniforms }), getShaderSource, }, diff --git a/js/web/lib/wasm/jsep/webgpu/ops/common.ts b/js/web/lib/wasm/jsep/webgpu/ops/common.ts index 3ce114c5d3884..bc3265be955f0 100644 --- a/js/web/lib/wasm/jsep/webgpu/ops/common.ts +++ b/js/web/lib/wasm/jsep/webgpu/ops/common.ts @@ -780,8 +780,10 @@ class ShaderHelperImpl implements ShaderHelper { const is1DimensionDispatch = this.normalizedDispatchGroup[1] === 1 && this.normalizedDispatchGroup[2] === 1; const paramList = is1DimensionDispatch ? `@builtin(global_invocation_id) global_id : vec3, + @builtin(workgroup_id) workgroup_id : vec3, @builtin(local_invocation_id) local_id : vec3` : - `@builtin(local_invocation_index) local_idx : u32, + `@builtin(local_invocation_id) local_id : vec3, + @builtin(local_invocation_index) local_idx : u32, @builtin(workgroup_id) workgroup_id : vec3, @builtin(num_workgroups) num_workgroups : vec3`; const globalIdxDefinition = is1DimensionDispatch ? 
diff --git a/js/web/lib/wasm/jsep/webgpu/ops/conv-grouped.ts b/js/web/lib/wasm/jsep/webgpu/ops/conv-grouped.ts index 14482272bad38..21b4953d3f90c 100644 --- a/js/web/lib/wasm/jsep/webgpu/ops/conv-grouped.ts +++ b/js/web/lib/wasm/jsep/webgpu/ops/conv-grouped.ts @@ -3,9 +3,9 @@ import {TensorView} from '../../tensor-view'; import {ShapeUtil} from '../../util'; -import {ProgramInfo} from '../types'; +import {ProgramInfo, ProgramUniform} from '../types'; -import {inputVariable, outputVariable, ShaderHelper} from './common'; +import {createTensorShapeVariables, getMaxComponents, inputVariable, outputVariable, ShaderHelper} from './common'; import {calculateOutputShape, ConvAttributes} from './conv'; import {getActivationSnippet} from './fuse-utils'; @@ -95,3 +95,98 @@ export const createGroupedConvProgramInfo = getShaderSource, }; }; + +export const createGroupedConvVectorizeProgramInfo = + (inputs: readonly TensorView[], attributes: ConvAttributes, outputShape: readonly number[]): ProgramInfo => { + const hasBias = inputs.length > 2; + const components = getMaxComponents(outputShape[3]); + const outputNumber = getMaxComponents(outputShape[2]); + const outputSize = ShapeUtil.size(outputShape) / components / outputNumber; + const xShape = [inputs[0].dims[0], inputs[0].dims[1], inputs[0].dims[2], inputs[0].dims[3] / components]; + const wShape = [inputs[1].dims[0], inputs[1].dims[1], inputs[1].dims[2], inputs[1].dims[3] / components]; + const outputShapeInShader = [outputShape[0], outputShape[1], outputShape[2], outputShape[3] / components]; + + const programUniforms: ProgramUniform[] = [ + {type: 'uint32', data: outputSize}, {type: 'int32', data: attributes.strides}, + {type: 'int32', data: attributes.pads}, ...createTensorShapeVariables(xShape), + ...createTensorShapeVariables(wShape), ...createTensorShapeVariables(outputShapeInShader) + ]; + const xNumber = (outputNumber - 1) * attributes.strides[1] + wShape[1]; + const getShaderSource = (shaderHelper: ShaderHelper) => 
{ + const output = outputVariable('output', inputs[0].dataType, outputShapeInShader.length, components); + const {activationFunction, applyActivation} = getActivationSnippet(attributes, output.type.value); + const x = inputVariable('x', inputs[0].dataType, xShape.length, components); + const w = inputVariable('w', inputs[1].dataType, wShape.length, components); + const inputVars = [x, w]; + if (hasBias) { + inputVars.push(inputVariable('b', inputs[2].dataType, inputs[2].dims, components)); + } + const processBias = hasBias ? 'value += b[output_channel];' : ''; + + return ` + ${ + shaderHelper.registerUniform('output_size', 'u32') + .registerUniform('strides', 'i32', 2) + .registerUniform('pads', 'i32', 2) + .declareVariables(...inputVars, output)} + ${activationFunction} + ${shaderHelper.mainStart()} + ${shaderHelper.guardAgainstOutOfBoundsWorkgroupSizes('uniforms.output_size')} + let width0 = uniforms.output_shape[3]; + let output_channel = global_idx % width0; + var index1 = global_idx / width0; + let width1 = uniforms.output_shape[2] / ${outputNumber}u; + let col = (index1 % width1) * ${outputNumber}u; + index1 = index1 / width1; + let row = index1 % uniforms.output_shape[1]; + let batch = index1 / uniforms.output_shape[1]; + + let x_corner = vec2(i32(row), i32(col)) * uniforms.strides - uniforms.pads; + + var x_vals: array<${x.type.value}, ${xNumber}>; + var values: array<${output.type.value}, ${outputNumber}>; + let input_channel = output_channel; + // Use constant instead of uniform can give better performance for w's height/width. 
+ for (var w_height: u32 = 0u; w_height < ${wShape[0]}; w_height++) { + let x_height = x_corner.x + i32(w_height); + if (x_height >= 0 && u32(x_height) < uniforms.x_shape[1]) { /* both bounds must hold: kernel rows above or below the input are padding and contribute nothing */ + for (var i = 0; i < ${xNumber}; i++) { + let x_width = x_corner.y + i; + if (x_width >= 0 && u32(x_width) < uniforms.x_shape[2]) { + x_vals[i] = ${x.get('batch', 'u32(x_height)', 'u32(x_width)', 'input_channel')}; + } else { + x_vals[i] = ${x.type.value}(0); + } + } + for (var w_width: u32 = 0u; w_width < ${wShape[1]}; w_width++) { + let w_val = ${w.get('w_height', 'w_width', '0', 'output_channel')}; + for (var i = 0u; i < ${outputNumber}u; i++) { + values[i] = fma(x_vals[i * ${attributes.strides[1]}u + w_width], w_val, values[i]); + } + } + } + } + + for (var i = 0u; i < ${outputNumber}u; i++) { + var value = values[i]; + ${processBias} + ${applyActivation} + ${output.set('batch', 'row', 'col + i', 'output_channel', 'value')}; + } + }`; + }; + + return { + name: 'GroupedConv-Vectorize', + shaderCache: { + hint: `${attributes.activationCacheKey};${components};${outputNumber};${xNumber};${wShape[0]};${wShape[1]}`, + inputDependencies: hasBias ? 
['rank', 'rank', 'type'] : ['rank', 'rank'] + }, + getRunData: () => ({ + outputs: [{dims: outputShape, dataType: inputs[0].dataType}], + dispatchGroup: {x: Math.ceil(outputSize / 64 /* workgroup size */)}, + programUniforms + }), + getShaderSource, + }; + }; diff --git a/js/web/lib/wasm/jsep/webgpu/ops/conv.ts b/js/web/lib/wasm/jsep/webgpu/ops/conv.ts index 33a5db7ff6b25..7af2c5db49f40 100644 --- a/js/web/lib/wasm/jsep/webgpu/ops/conv.ts +++ b/js/web/lib/wasm/jsep/webgpu/ops/conv.ts @@ -8,7 +8,7 @@ import {ComputeContext} from '../types'; import {createConv2DMatMulProgramInfo} from './3rd-party/conv2d_mm_webgpu'; import {createMatmulProgramInfo} from './3rd-party/matmul_packed_webgpu'; -import {createGroupedConvProgramInfo} from './conv-grouped'; +import {createGroupedConvProgramInfo, createGroupedConvVectorizeProgramInfo} from './conv-grouped'; import {InternalActivationAttributes, parseInternalActivationAttributes} from './fuse-utils'; import {createNaiveMatmulProgramInfo} from './matmul'; import {createTransposeProgramInfo} from './transpose'; @@ -136,12 +136,36 @@ const conv2d = (context: ComputeContext, inputs: readonly TensorView[], attribut // check attributes // const hasPreluActivationWeights = false; /* TODO: add support for prelu activation weights */ + const isChannelsLast = attributes.format === 'NHWC'; if (attributes.group !== 1) { - context.compute(createGroupedConvProgramInfo(inputs, adjustedAttributes)); + // Temporarily disable createGroupedConvVectorizeProgramInfo path due to bots failures with below two cases: + // [webgpu]Conv - conv - vectorize group - B + // [webgpu]Conv - conv - vectorize group - D + const disableGroupedConvVectorize = true; + if (!disableGroupedConvVectorize && isChannelsLast && inputs[1].dims[0] === attributes.group && + inputs[1].dims[1] === 1 && attributes.dilations[0] === 1 && attributes.dilations[1] === 1) { + const outputShape = calculateOutputShape( + inputs[0].dims, inputs[1].dims, attributes.dilations, 
adjustedAttributes.pads, attributes.strides, + isChannelsLast); + const transposedWeight = (context.kernelCustomData.wT as TensorView | undefined) ?? + context.compute( + createTransposeProgramInfo(inputs[1], weightTransposeAttribute), + {inputs: [1], outputs: [attributes.wIsConst ? -2 : -1]})[0]; + if (attributes.wIsConst && !context.kernelCustomData.wT) { + context.kernelCustomData.wT = transposedWeight; + } + const convInputs = [inputs[0], transposedWeight]; + if (inputs.length === 3) { + convInputs.push(inputs[2]); + } + context.compute( + createGroupedConvVectorizeProgramInfo(convInputs, adjustedAttributes, outputShape), {inputs: convInputs}); + } else { + context.compute(createGroupedConvProgramInfo(inputs, adjustedAttributes)); + } return; } - const isChannelsLast = attributes.format === 'NHWC'; const hasBias = inputs.length === 3; const inputHeight = inputs[0].dims[isChannelsLast ? 1 : 2]; const inputWidth = inputs[0].dims[isChannelsLast ? 2 : 3]; diff --git a/js/web/lib/wasm/jsep/webgpu/ops/gemm.ts b/js/web/lib/wasm/jsep/webgpu/ops/gemm.ts index 1c5d28e4b8e3f..30754c84413b7 100644 --- a/js/web/lib/wasm/jsep/webgpu/ops/gemm.ts +++ b/js/web/lib/wasm/jsep/webgpu/ops/gemm.ts @@ -3,10 +3,10 @@ import {TensorView} from '../../tensor-view'; import {GemmUtil, ShapeUtil} from '../../util'; -import {AttributeWithCacheKey, createAttributeWithCacheKey} from '../attribute-with-cache-key'; -import {ComputeContext, ProgramInfo} from '../types'; +import {AttributeWithCacheKey} from '../attribute-with-cache-key'; +import {ComputeContext, ProgramInfo, ProgramInputTensorInfoDependency, ProgramUniform} from '../types'; -import {ShaderHelper, tensorTypeToWsglStorageType} from './common'; +import {createTensorShapeVariables, IndicesHelper, inputVariable, outputVariable, ShaderHelper, UniformsArrayType} from './common'; const validateInputs = (inputs: readonly TensorView[]): void => { if (!inputs) { @@ -34,25 +34,6 @@ export interface GemmAttributes extends AttributeWithCacheKey 
{ beta: number; } -const offsetC = (m: number, n: number, dims: readonly number[]): string => { - if (dims.length === 0) { - return '0u'; - } - - const broadcastM = (dims.length === 1 && m !== 1) || (dims.length === 2 && dims[0] !== m); - const broadcastN = dims[dims.length - 1] !== n; - - let offset = '0u'; - if (!broadcastM) { - offset += `+ m * ${dims[dims.length - 1]}u`; - } - if (!broadcastN) { - offset += '+n'; - } - - return offset; -}; - const createGemmProgramInfo = (inputs: readonly TensorView[], attributes: GemmAttributes): ProgramInfo => { const aShape = inputs[0].dims.slice(); const bShape = inputs[1].dims.slice(); @@ -63,68 +44,92 @@ const createGemmProgramInfo = (inputs: readonly TensorView[], attributes: GemmAt throw new Error('Can\'t use gemm on the given tensors'); } const outputSize = ShapeUtil.size(outputShape); - let line = ''; - if (attributes.transA && attributes.transB) { - line = 'value += a[k * M + m] * b[n * K + k];'; - } else if (attributes.transA && !attributes.transB) { - line = 'value += a[k * M + m] * b[k * N + n];'; - } else if (!attributes.transA && attributes.transB) { - line = 'value += a[m * K + k] * b[n * K + k];'; - } else if (!attributes.transA && !attributes.transB) { - line = 'value += a[m * K + k] * b[k * N + n];'; - } - - const dataType = tensorTypeToWsglStorageType(inputs[0].dataType); - const calculateAlpha = attributes.alpha === 1 ? '' : 'value *= alpha;'; - const calculateC = inputs.length === 3 ? 
`value += beta * c[${offsetC(M, N, inputs[2].dims)}];` : ''; - const inputStorageBuffersDeclarations = [ - `@group(0) @binding(0) var a : array<${dataType}>;`, - `@group(0) @binding(1) var b : array<${dataType}>;` + const programUniforms: ProgramUniform[] = [ + {type: 'uint32', data: outputSize}, {type: 'uint32', data: M}, {type: 'uint32', data: N}, {type: 'uint32', data: K}, + {type: 'float32', data: attributes.alpha}, {type: 'float32', data: attributes.beta} ]; + const inputDependencies: ProgramInputTensorInfoDependency[] = ['type', 'type']; if (inputs.length === 3) { - inputStorageBuffersDeclarations.push(`@group(0) @binding(2) var c : array<${dataType}>;`); + programUniforms.push(...createTensorShapeVariables(inputs[2].dims)); + inputDependencies.push('rank'); } - const getShaderSource = (shaderHelper: ShaderHelper) => ` - const M: u32 = ${M}u; - const N: u32 = ${N}u; - const K: u32 = ${K}u; - const alpha = ${dataType}(${attributes.alpha}); - const beta = ${dataType}(${attributes.beta}); + programUniforms.push(...createTensorShapeVariables(outputShape)); + + const getShaderSource = (shaderHelper: ShaderHelper) => { + let line = ''; + if (attributes.transA && attributes.transB) { + line = 'value += a[k * uniforms.M + m] * b[n * uniforms.K + k];'; + } else if (attributes.transA && !attributes.transB) { + line = 'value += a[k * uniforms.M + m] * b[k * uniforms.N + n];'; + } else if (!attributes.transA && attributes.transB) { + line = 'value += a[m * uniforms.K + k] * b[n * uniforms.K + k];'; + } else if (!attributes.transA && !attributes.transB) { + line = 'value += a[m * uniforms.K + k] * b[k * uniforms.N + n];'; + } - ${inputStorageBuffersDeclarations.join('\n')} - @group(0) @binding(${inputs.length}) var output : array<${dataType}>; + const calculateAlpha = attributes.alpha === 1 ? 
'' : 'value *= uniforms.alpha;'; + const a = inputVariable('a', inputs[0].dataType, inputs[0].dims); + const b = inputVariable('b', inputs[1].dataType, inputs[1].dims); + const dataType = a.type.value; + let c: IndicesHelper|null = null; + const variables = [a, b]; + if (inputs.length === 3) { + c = inputVariable('c', inputs[2].dataType, inputs[2].dims.length); + variables.push(c); + } + const output = outputVariable('output', inputs[0].dataType, outputShape.length); + variables.push(output); + const uniforms: UniformsArrayType = [ + {name: 'output_size', type: 'u32'}, {name: 'M', type: 'u32'}, {name: 'N', type: 'u32'}, {name: 'K', type: 'u32'}, + {name: 'alpha', type: 'f32'}, {name: 'beta', type: 'f32'} + ]; + return ` + ${shaderHelper.registerUniforms(uniforms).declareVariables(...variables)} ${shaderHelper.mainStart()} - ${shaderHelper.guardAgainstOutOfBoundsWorkgroupSizes(outputSize)} + ${shaderHelper.guardAgainstOutOfBoundsWorkgroupSizes('uniforms.output_size')} - let m = global_idx / N; - let n = global_idx % N; + let m = global_idx / uniforms.N; + let n = global_idx % uniforms.N; var value = ${dataType}(0); - for (var k: u32 = 0u; k<${K}u; k++) { + for (var k: u32 = 0u; k < uniforms.K; k++) { ${line} } ${calculateAlpha} - ${calculateC} + ${(() => { + if (c != null) { + return `let cOffset = ${c.broadcastedIndicesToOffset('vec2(m, n)', output)}; value += uniforms.beta * ${ + c.getByOffset('cOffset')};`; + } + return ''; + })()} output[global_idx] = value; - }`; + }; + return { name: 'Gemm', - shaderCache: {hint: attributes.cacheKey}, + shaderCache: {hint: `${attributes.cacheKey}`, inputDependencies}, getRunData: () => ({ outputs: [{dims: outputShape, dataType: inputs[0].dataType}], - dispatchGroup: {x: Math.ceil(outputSize / 64 /* workgroup size */)} + dispatchGroup: {x: Math.ceil(outputSize / 64 /* workgroup size */)}, + programUniforms }), getShaderSource, }; }; +export const parseGemmAttributes = (attributes: Record): GemmAttributes => { + const transA = 
attributes.transA as boolean; + const transB = attributes.transB as boolean; + const alpha = attributes.alpha as number; + const beta = attributes.beta as number; + return {transA, transB, alpha, beta, cacheKey: `${attributes.transA};${attributes.transB};${attributes.alpha === 1}`}; +}; + export const gemm = (context: ComputeContext, attributes: GemmAttributes): void => { validateInputs(context.inputs); context.compute(createGemmProgramInfo(context.inputs, attributes)); }; - -export const parseGemmAttributes = (attributes: Record): GemmAttributes => - createAttributeWithCacheKey(attributes as Omit); diff --git a/js/web/lib/wasm/jsep/webgpu/ops/instance-norm.ts b/js/web/lib/wasm/jsep/webgpu/ops/instance-norm.ts index 3a84844544c96..056dd54d54591 100644 --- a/js/web/lib/wasm/jsep/webgpu/ops/instance-norm.ts +++ b/js/web/lib/wasm/jsep/webgpu/ops/instance-norm.ts @@ -4,58 +4,56 @@ import {DataType} from '../../../wasm-common'; import {TensorView} from '../../tensor-view'; import {ShapeUtil} from '../../util'; -import {AttributeWithCacheKey, createAttributeWithCacheKey} from '../attribute-with-cache-key'; -import {ComputeContext, ProgramInfo} from '../types'; +import {ComputeContext, ProgramInfo, ProgramInputTensorInfoDependency, ProgramUniform} from '../types'; -import {fillVector, getMaxComponents, inputVariable, outputVariable, ShaderHelper, sumVector, tensorTypeToWsglStorageType} from './common'; +import {createTensorShapeVariables, fillVector, getMaxComponents, inputVariable, outputVariable, ShaderHelper, sumVector, tensorTypeToWsglStorageType, UniformsArrayType} from './common'; -export interface InstanceNormAttributes extends AttributeWithCacheKey { +export interface InstanceNormAttributes { epsilon: number; format: 'NHWC'|'NCHW'; } -const metadata = { - name: 'InstanceNormalization' -}; - const createInstanceNormProgramInfo = (inputs: readonly TensorView[], attributes: InstanceNormAttributes): ProgramInfo => { const xShape = inputs[0].dims; - const outputShape = 
xShape; const axis = 2; const normCount = ShapeUtil.sizeToDimension(xShape, axis); const normSize = ShapeUtil.sizeFromDimension(xShape, axis); const components = getMaxComponents(normSize); const normPackedSize = normSize / components; - const C = xShape[1]; - const x = inputVariable('x', inputs[0].dataType, [xShape[0], xShape[1], normPackedSize], components); - const scale = inputVariable('scale', inputs[1].dataType, inputs[1].dims); - const bias = inputVariable('bias', inputs[2].dataType, inputs[2].dims); - const output = outputVariable('output', inputs[0].dataType, [xShape[0], xShape[1], normPackedSize], components); - const variables = [x, scale, bias, output]; - const dataType = x.type.value; - const f32Type = components === 1 ? 'f32' : `vec${components}`; - const workgroupSize = 64; - const getShaderSource = (shaderHelper: ShaderHelper) => ` - - const C: u32 = ${C}; - const normSize: u32 = ${normSize}; - const epsilon: f32 = ${attributes.epsilon}; + const inputShape = [xShape[0], xShape[1], normPackedSize]; + const inputDependencies: ProgramInputTensorInfoDependency[] = ['rank', 'type', 'type']; + const programUniforms: ProgramUniform[] = + [{type: 'uint32', data: normSize}, {type: 'uint32', data: normPackedSize}]; + programUniforms.push(...createTensorShapeVariables(inputShape), ...createTensorShapeVariables(inputShape)); + + const getShaderSource = (shaderHelper: ShaderHelper) => { + const x = inputVariable('x', inputs[0].dataType, inputShape.length, components); + const scale = inputVariable('scale', inputs[1].dataType, inputs[1].dims); + const bias = inputVariable('bias', inputs[2].dataType, inputs[2].dims); + const output = outputVariable('output', inputs[0].dataType, inputShape.length, components); + const variables = [x, scale, bias, output]; + const dataType = x.type.value; + const f32Type = components === 1 ? 
'f32' : `vec${components}`; + const workgroupSize = 64; + + const uniforms: UniformsArrayType = [{name: 'normSize', type: 'u32'}, {name: 'normPackedSize', type: 'u32'}]; + return ` var meanShared : f32; var squaredNormShared : f32; var workgroupShared : array<${f32Type}, ${workgroupSize}>; const workgroupSize = ${workgroupSize}u; - ${shaderHelper.declareVariables(...variables)} + ${shaderHelper.registerUniforms(uniforms).declareVariables(...variables)} ${shaderHelper.mainStart(workgroupSize)} let norm = global_idx / workgroupSize; - let batch = norm / C; - let channel = norm % C; + let batch = norm / uniforms.x_shape[1]; + let channel = norm % uniforms.x_shape[1]; let localIndex = local_id.x; // initialize workgroup memory var initial = ${f32Type}(0); - for (var h = localIndex; h < ${normPackedSize}; h += workgroupSize) { + for (var h = localIndex; h < uniforms.normPackedSize; h += workgroupSize) { initial = initial + ${f32Type}(${x.get('batch', 'channel', 'h')}); } workgroupShared[localIndex] = initial; @@ -69,13 +67,13 @@ const createInstanceNormProgramInfo = workgroupBarrier(); } if (localIndex == 0) { - meanShared = ${sumVector('workgroupShared[0]', components)} / f32(normSize); + meanShared = ${sumVector('workgroupShared[0]', components)} / f32(uniforms.normSize); } workgroupBarrier(); // reinitialize workgroup memory. 
initial = ${f32Type}(0); - for (var h = localIndex; h < ${normPackedSize}; h += workgroupSize) { + for (var h = localIndex; h < uniforms.normPackedSize; h += workgroupSize) { let deviation = ${f32Type}(${x.get('batch', 'channel', 'h')}) - ${f32Type}(meanShared); initial = initial + deviation * deviation; } @@ -94,23 +92,26 @@ const createInstanceNormProgramInfo = } workgroupBarrier(); - let invStdDev = 1 / sqrt(squaredNormShared / f32(normSize) + epsilon); + let invStdDev = 1 / sqrt(squaredNormShared / f32(uniforms.normSize) + f32(${attributes.epsilon})); let channelScale = invStdDev * f32(${scale.getByOffset('channel')}); let channelShift = f32(${bias.getByOffset('channel')}) - meanShared * channelScale; - for (var h = localIndex; h < ${normPackedSize}; h += workgroupSize) { + for (var h = localIndex; h < uniforms.normPackedSize; h += workgroupSize) { let value = ${x.get('batch', 'channel', 'h')} * ${dataType}(${f32Type}(channelScale)) + ${dataType}(${ - f32Type}(channelShift)); + f32Type}(channelShift)); ${output.set('batch', 'channel', 'h', 'value')}; } }`; + }; return { - ...metadata, - shaderCache: {hint: attributes.cacheKey}, + ...{name: 'InstanceNormalization'}, + // TODO: use epsilon as uniform. Currently epsilon as uniform fails test_instancenorm_epsilon. 
+ shaderCache: {hint: `${attributes.epsilon};${components}`, inputDependencies}, getRunData: () => ({ outputs: [ {dims: outputShape, dataType: inputs[0].dataType}, ], - dispatchGroup: {x: normCount} + dispatchGroup: {x: normCount}, + programUniforms }), getShaderSource, }; @@ -120,10 +121,6 @@ const computeMean = (context: ComputeContext, input: TensorView, scale: TensorView, bias: TensorView, n: number, h: number, c: number, epsilon: number) => { const components = getMaxComponents(c); - const inputHelper = inputVariable('input', input.dataType, input.dims, components); - const scaleHelper = inputVariable('scale', scale.dataType, scale.dims, components); - const biasHelper = inputVariable('bias', bias.dataType, bias.dims, components); - const WG = 64; // we will store channel scale and channel shift in [2, components] matrix // or in vec2 when components == 1 @@ -133,65 +130,79 @@ const computeMean = const unitsOfWork = n * c / components; const wgSize = Math.ceil(h / WG); - const getMeanShaderSource = (shaderHelper: ShaderHelper) => ` - const H: u32 = ${h}; - const C: u32 = ${c / components}; - const imageSize: u32 = ${h * c / components}; + const meanInputDependencies: ProgramInputTensorInfoDependency[] = ['type']; + const meanProgramUniforms: ProgramUniform[] = [ + {type: 'uint32', data: wgSize}, {type: 'uint32', data: h}, {type: 'uint32', data: Math.floor(c / components)}, + {type: 'uint32', data: Math.floor(h * c / components)} + ]; + const getMeanShaderSource = (shaderHelper: ShaderHelper) => { + const inputHelper = inputVariable('input', input.dataType, input.dims, components); + return ` ${shaderHelper.declareVariables(inputHelper)} @group(0) @binding(1) var output : array<${outputType}>; + struct Uniforms {wg_size:u32, H:u32, C:u32, image_size:u32}; + @group(0) @binding(2) var uniforms: Uniforms; ${shaderHelper.mainStart(WG)} - let currentImageNumber = global_idx / ${WG} / C; - let currentChannelNumber = (global_idx / ${WG}) % C; + let currentImageNumber 
= global_idx / ${WG} / uniforms.C; + let currentChannelNumber = (global_idx / ${WG}) % uniforms.C; let wgId = global_idx % ${WG}; - let wgOffset = wgId * ${wgSize}; - if (wgOffset >= H) { + let wgOffset = wgId * uniforms.wg_size; + if (wgOffset >= uniforms.H) { return; } - let wgMax = min(wgOffset + ${wgSize}, H); + let wgMax = min(wgOffset + uniforms.wg_size, uniforms.H); - let offset = currentImageNumber * imageSize + currentChannelNumber; + let offset = currentImageNumber * uniforms.image_size + currentChannelNumber; var sum = ${fillVector('f32', components)}; var squaredSum = ${fillVector('f32', components)}; for (var i: u32 = wgOffset; i < wgMax; i++) { - let value = ${sumCastType}(input[offset + i * C]); + let value = ${sumCastType}(input[offset + i * uniforms.C]); sum += value; squaredSum += value * value; } output[global_idx] = ${setOutputValue('sum', 'squaredSum')}; }`; + }; const meanValues = context.compute( { name: 'InstanceNormComputeMean', - shaderCache: {hint: JSON.stringify({components, n, h, c})}, + shaderCache: {hint: `${components}`, inputDependencies: meanInputDependencies}, getRunData: () => ({ outputs: [ {dims: [n, c, WG, 2], dataType: DataType.float}, ], dispatchGroup: {x: n * c / components}, + programUniforms: meanProgramUniforms }), getShaderSource: getMeanShaderSource, }, {inputs: [input], outputs: [-1]})[0]; - const getShaderSource = (shaderHelper: ShaderHelper) => ` - const H: u32 = ${h}; - const C: u32 = ${c / components}; - const imageSize: u32 = ${WG * c / components}; - const epsilon: f32 = ${epsilon}; + const programUniforms: ProgramUniform[] = [ + {type: 'uint32', data: unitsOfWork}, {type: 'uint32', data: h}, + {type: 'uint32', data: Math.floor(c / components)}, {type: 'uint32', data: Math.floor(WG * c / components)} + ]; + const inputDependencies: ProgramInputTensorInfoDependency[] = ['type', 'type', 'type']; + const getShaderSource = (shaderHelper: ShaderHelper) => { + const scaleHelper = inputVariable('scale', scale.dataType, 
scale.dims, components); + const biasHelper = inputVariable('bias', bias.dataType, bias.dims, components); + return ` @group(0) @binding(0) var input : array<${outputType}>; @group(0) @binding(1) var scale : array<${scaleHelper.type.storage}>; @group(0) @binding(2) var bias : array<${biasHelper.type.storage}>; @group(0) @binding(3) var output : array<${outputType}>; + struct Uniforms {units_of_work : u32, H: u32, C : u32, image_size : u32}; + @group(0) @binding(4) var uniforms: Uniforms; ${shaderHelper.mainStart()} - ${shaderHelper.guardAgainstOutOfBoundsWorkgroupSizes(unitsOfWork)} - let currentImageNumber = global_idx / C; - let currentChannelNumber = global_idx % C; + ${shaderHelper.guardAgainstOutOfBoundsWorkgroupSizes('uniforms.units_of_work')} + let currentImageNumber = global_idx / uniforms.C; + let currentChannelNumber = global_idx % uniforms.C; - let offset = currentImageNumber * imageSize; + let offset = currentImageNumber * uniforms.image_size; var sum = ${fillVector('f32', components)}; var squaredSum = ${fillVector('f32', components)}; for (var i: u32 = 0; i < ${WG}; i++) { @@ -199,24 +210,26 @@ const computeMean = sum += value[0]; squaredSum += value[1]; } - sum = sum / f32(H); - squaredSum = squaredSum / f32(H); - let invStdDev = 1 / sqrt(squaredSum - sum * sum + epsilon); + sum = sum / f32(uniforms.H); + squaredSum = squaredSum / f32(uniforms.H); + let invStdDev = 1 / sqrt(squaredSum - sum * sum + f32(${epsilon})); let channelScale = invStdDev * ${sumCastType}(scale[currentChannelNumber]); let channelShift = ${sumCastType}(bias[currentChannelNumber]) - sum * channelScale; output[global_idx] = ${setOutputValue('channelScale', 'channelShift')}; }`; - + }; return context.compute( { name: 'InstanceNormComputeChannelScaleShift', - shaderCache: {hint: JSON.stringify({components, n, h, c, epsilon})}, + // TODO: use epsilon as uniform. Currently epsilon as uniform fails test_instancenorm_epsilon. 
+ shaderCache: {hint: `${components};${epsilon}`, inputDependencies}, getRunData: () => ({ outputs: [ {dims: [n, c, 2], dataType: DataType.float}, ], dispatchGroup: {x: Math.ceil(unitsOfWork / 64 /* workgroup size */)}, + programUniforms }), getShaderSource, }, @@ -230,50 +243,51 @@ const createInstanceNormNHWCProgramInfo = const N = xShape[0]; const C = xShape[xShape.length - 1]; const H = ShapeUtil.sizeFromDimension(xShape, 1) / C; - const components = getMaxComponents(C); const outputSize = ShapeUtil.size(outputShape) / components; - const inputHelper = inputVariable('input', inputs[0].dataType, inputs[0].dims, components); - const outputHelper = outputVariable('output', inputs[0].dataType, outputShape, components); - - const dataType = tensorTypeToWsglStorageType(inputs[0].dataType); - const scaleType = components === 1 ? 'vec2f' : `mat2x${components}f`; - const scaleCastType = components === 1 ? dataType : `vec${components}<${dataType}>`; + const programUniforms: ProgramUniform[] = + [{type: 'uint32', data: H}, {type: 'uint32', data: Math.floor(C / components)}]; + const inputDependencies: ProgramInputTensorInfoDependency[] = ['type', 'type']; // first compute mean const channelScaleShift = computeMean(context, inputs[0], inputs[1], inputs[2], N, H, C, attributes.epsilon); + const getShaderSource = (shaderHelper: ShaderHelper) => { + const dataType = tensorTypeToWsglStorageType(inputs[0].dataType); + const scaleType = components === 1 ? 'vec2f' : `mat2x${components}f`; + const scaleCastType = components === 1 ? 
dataType : `vec${components}<${dataType}>`; - const getShaderSource = (shaderHelper: ShaderHelper) => ` - const H: u32 = ${H}; - const C: u32 = ${C / components}; + const inputHelper = inputVariable('input', inputs[0].dataType, inputs[0].dims, components); + const outputHelper = outputVariable('output', inputs[0].dataType, outputShape, components); + return ` @group(0) @binding(0) var input : array<${inputHelper.type.storage}>; @group(0) @binding(1) var scaleInput : array<${scaleType}>; @group(0) @binding(2) var output : array<${outputHelper.type.storage}>; + struct Uniforms {H: u32, C : u32}; + @group(0) @binding(3) var uniforms: Uniforms; ${shaderHelper.mainStart()} - let currentImageNumber = global_idx / (C * H); - let currentChannelNumber = global_idx % C; + let currentImageNumber = global_idx / (uniforms.C * uniforms.H); + let currentChannelNumber = global_idx % uniforms.C; - let scaleOffset = currentImageNumber * C + currentChannelNumber; + let scaleOffset = currentImageNumber * uniforms.C + currentChannelNumber; let scale = scaleInput[scaleOffset]; output[global_idx] = fma(input[global_idx], ${scaleCastType}(scale[0]), ${scaleCastType}(scale[1])); }`; + }; context.compute( { - name: 'InstanceNormalization', - shaderCache: {hint: `${attributes.cacheKey}`}, + name: 'InstanceNormalizationNHWC', + shaderCache: {hint: `${components}`, inputDependencies}, getRunData: () => ({ outputs: [{dims: outputShape, dataType: inputs[0].dataType}], - dispatchGroup: {x: Math.ceil(outputSize / 64 /* workgroup size */)} + dispatchGroup: {x: Math.ceil(outputSize / 64 /* workgroup size */)}, + programUniforms }), getShaderSource, }, {inputs: [inputs[0], channelScaleShift]}); }; -export const parseInstanceNormAttributes = (attributes: InstanceNormAttributes): InstanceNormAttributes => - createAttributeWithCacheKey({epsilon: attributes.epsilon, format: attributes.format}); - export const instanceNorm = (context: ComputeContext, attributes: InstanceNormAttributes): void => { if 
(attributes.format === 'NHWC') { createInstanceNormNHWCProgramInfo(context, context.inputs, attributes); diff --git a/js/web/lib/wasm/jsep/webgpu/ops/layer-norm.ts b/js/web/lib/wasm/jsep/webgpu/ops/layer-norm.ts index 8a9eeecf2c68d..bc446079faf8f 100644 --- a/js/web/lib/wasm/jsep/webgpu/ops/layer-norm.ts +++ b/js/web/lib/wasm/jsep/webgpu/ops/layer-norm.ts @@ -4,12 +4,11 @@ import {DataType} from '../../../wasm-common'; import {TensorView} from '../../tensor-view'; import {ShapeUtil} from '../../util'; -import {AttributeWithCacheKey, createAttributeWithCacheKey} from '../attribute-with-cache-key'; -import {ComputeContext, ProgramInfo} from '../types'; +import {ComputeContext, ProgramInfo, ProgramInputTensorInfoDependency, ProgramUniform} from '../types'; -import {castToF32, fillVector, getMaxComponents, inputVariable, outputVariable, ShaderHelper, sumVector, tensorTypeToWsglStorageType,} from './common'; +import {castToF32, fillVector, getMaxComponents, inputVariable, outputVariable, ShaderHelper, sumVector, tensorTypeToWsglStorageType, UniformsArrayType,} from './common'; -export interface LayerNormAttributes extends AttributeWithCacheKey { +interface LayerNormAttributes { axis: number; epsilon: number; } @@ -39,7 +38,7 @@ const createLayerNormProgramInfo = Got scale size of ${scaleSize} and bias size of ${biasSize}`); } - const meanInvStdDevDim = []; + const meanInvStdDevDim: number[] = []; for (let i = 0; i < xShape.length; ++i) { if (i < axis) { meanInvStdDevDim.push(xShape[i]); @@ -47,50 +46,57 @@ const createLayerNormProgramInfo = meanInvStdDevDim.push(1); } } - const components = getMaxComponents(normSize); - const dataType = tensorTypeToWsglStorageType(inputs[0].dataType); - const variables = [ - inputVariable('x', inputs[0].dataType, inputs[0].dims, components), - inputVariable('scale', scale.dataType, scale.dims, components), + const inputDependencies: ProgramInputTensorInfoDependency[] = ['type', 'type']; + const programUniforms: ProgramUniform[] = [ + 
{type: 'uint32', data: normCount}, {type: 'float32', data: normSize}, + {type: 'uint32', data: Math.floor(normSize / components)}, {type: 'float32', data: attributes.epsilon} ]; if (bias) { - variables.push(inputVariable('bias', bias.dataType, bias.dims, components)); + inputDependencies.push('type'); } - variables.push(outputVariable('output', inputs[0].dataType, outputShape, components)); - const hasMeanDataOutput = outputCount > 1; const hasInvStdOutput = outputCount > 2; - if (hasMeanDataOutput) { - variables.push(outputVariable('meanDataOutput', DataType.float, meanInvStdDevDim)); - } - if (hasInvStdOutput) { - variables.push(outputVariable('invStdOutput', DataType.float, meanInvStdDevDim)); - } - - const getShaderSource = (shaderHelper: ShaderHelper) => ` - const normSize: f32 = ${normSize}; - const normSizeVectorized: u32 = ${normSize / components}; - const epsilon: f32 = ${attributes.epsilon}; + const getShaderSource = (shaderHelper: ShaderHelper) => { + const dataType = tensorTypeToWsglStorageType(inputs[0].dataType); + const variables = [ + inputVariable('x', inputs[0].dataType, inputs[0].dims, components), + inputVariable('scale', scale.dataType, scale.dims, components), + ]; + if (bias) { + variables.push(inputVariable('bias', bias.dataType, bias.dims, components)); + } + variables.push(outputVariable('output', inputs[0].dataType, outputShape, components)); + if (hasMeanDataOutput) { + variables.push(outputVariable('mean_data_output', DataType.float, meanInvStdDevDim)); + } + if (hasInvStdOutput) { + variables.push(outputVariable('inv_std_output', DataType.float, meanInvStdDevDim)); + } - ${shaderHelper.declareVariables(...variables)} + const uniforms: UniformsArrayType = [ + {name: 'norm_count', type: 'u32'}, {name: 'norm_size', type: 'f32'}, + {name: 'norm_size_vectorized', type: 'u32'}, {name: 'epsilon', type: 'f32'} + ]; + return ` + ${shaderHelper.registerUniforms(uniforms).declareVariables(...variables)} ${shaderHelper.mainStart()} - 
${shaderHelper.guardAgainstOutOfBoundsWorkgroupSizes(normCount)} - let offset = global_idx * normSizeVectorized; + ${shaderHelper.guardAgainstOutOfBoundsWorkgroupSizes('uniforms.norm_count')} + let offset = global_idx * uniforms.norm_size_vectorized; var meanVector = ${fillVector('f32', components)}; var meanSquareVector = ${fillVector('f32', components)}; - for (var h: u32 = 0u; h < normSizeVectorized; h++) { + for (var h: u32 = 0u; h < uniforms.norm_size_vectorized; h++) { let value = ${castToF32(dataType, components, 'x[h + offset]')}; meanVector += value; meanSquareVector += value * value; } - let mean = ${sumVector('meanVector', components)} / normSize; - let meanSquare = sqrt(${sumVector('meanSquareVector', components)} - / normSize - mean * mean + epsilon); + let mean = ${sumVector('meanVector', components)} / uniforms.norm_size; + let meanSquare = sqrt(${sumVector('meanSquareVector', components)} + / uniforms.norm_size - mean * mean + uniforms.epsilon); - for (var j: u32 = 0; j < normSizeVectorized; j++) { + for (var j: u32 = 0; j < uniforms.norm_size_vectorized; j++) { let f32input = ${castToF32(dataType, components, 'x[j + offset]')}; let f32scale = ${castToF32(dataType, components, 'scale[j]')}; output[j + offset] = ${variables[0].type.value}((f32input - mean) / meanSquare * f32scale @@ -98,9 +104,10 @@ const createLayerNormProgramInfo = ); } - ${hasMeanDataOutput ? 'meanDataOutput[global_idx] = mean' : ''}; - ${hasInvStdOutput ? 'invStdOutput[global_idx] = 1 / meanSquare' : ''}; + ${hasMeanDataOutput ? 'mean_data_output[global_idx] = mean' : ''}; + ${hasInvStdOutput ? 
'inv_std_output[global_idx] = 1 / meanSquare' : ''}; }`; + }; const outputs = [{dims: outputShape, dataType: inputs[0].dataType}]; if (hasMeanDataOutput) { outputs.push({dims: meanInvStdDevDim, dataType: DataType.float}); @@ -111,15 +118,13 @@ const createLayerNormProgramInfo = return { name: 'LayerNormalization', - shaderCache: {hint: `${attributes.cacheKey}|${outputCount}|${inputs.length}`}, - getRunData: () => ({outputs, dispatchGroup: {x: Math.ceil(normCount / 64 /* workgroup size */)}}), + shaderCache: {hint: `${components};${outputCount}`, inputDependencies}, + getRunData: () => + ({outputs, dispatchGroup: {x: Math.ceil(normCount / 64 /* workgroup size */)}, programUniforms}), getShaderSource, }; }; -export const parseLayerNormAttributes = (attributes: LayerNormAttributes): LayerNormAttributes => - createAttributeWithCacheKey({axis: attributes.axis, epsilon: attributes.epsilon}); - export const layerNorm = (context: ComputeContext, attributes: LayerNormAttributes): void => { validateInputs(context.inputs); context.compute(createLayerNormProgramInfo(context.inputs, attributes, context.outputCount)); diff --git a/js/web/lib/wasm/jsep/webgpu/ops/multi-head-attentiion.ts b/js/web/lib/wasm/jsep/webgpu/ops/multi-head-attentiion.ts index b7726a36bcaad..6d22e3780efd9 100644 --- a/js/web/lib/wasm/jsep/webgpu/ops/multi-head-attentiion.ts +++ b/js/web/lib/wasm/jsep/webgpu/ops/multi-head-attentiion.ts @@ -4,10 +4,10 @@ import {TensorView} from '../../tensor-view'; import {ShapeUtil} from '../../util'; import {createAttributeWithCacheKey} from '../attribute-with-cache-key'; -import {ComputeContext, GpuDataType} from '../types'; +import {ComputeContext, GpuDataType, ProgramUniform} from '../types'; import {applyAttention, AttentionAttrs, AttentionMaskType, AttentionParameters, AttentionQkvFormat} from './attention'; -import {ShaderHelper, tensorTypeToWsglStorageType} from './common'; +import {inputVariable, outputVariable, ShaderHelper, UniformsArrayType} from './common'; 
import {createTransposeProgramInfo, TransposeAttributes} from './transpose'; const validateInputs = (inputs: readonly TensorView[], attributes: AttentionAttrs): AttentionParameters => { @@ -228,7 +228,6 @@ const validateInputs = (inputs: readonly TensorView[], attributes: AttentionAttr }; }; - export const parseMultiHeadAttentionAttributes = (attributes: AttentionAttrs): AttentionAttrs => createAttributeWithCacheKey({...attributes}); @@ -239,30 +238,35 @@ const addBiasTranspose = hiddenSize: number, biasOffset: number) => { const outputShape = [batchSize, sequenceLength, hiddenSize]; const outputSize = ShapeUtil.size(outputShape); - - const dataType = tensorTypeToWsglStorageType(qkv.dataType); - const getShaderSource = (shaderHelper: ShaderHelper) => ` - const biasOffset = ${biasOffset}u; - const hiddenSize = ${hiddenSize}u; - - @group(0) @binding(0) var qkv: array<${dataType}>; - @group(0) @binding(1) var bias: array<${dataType}>; - @group(0) @binding(2) var qkv_with_bias: array<${dataType}>; - + const programUniforms: ProgramUniform[] = + [{type: 'uint32', data: outputSize}, {type: 'uint32', data: biasOffset}, {type: 'uint32', data: hiddenSize}]; + + const getShaderSource = (shaderHelper: ShaderHelper) => { + const output = outputVariable('qkv_with_bias', qkv.dataType, outputShape); + const qkvInput = inputVariable('qkv', qkv.dataType, outputShape); + const biasInput = inputVariable('bias', bias.dataType, outputShape); + + const uniforms: UniformsArrayType = [ + {name: 'output_size', type: 'u32'}, {name: 'bias_offset', type: 'u32'}, {name: 'hidden_size', type: 'u32'} + ]; + return ` + ${shaderHelper.registerUniforms(uniforms).declareVariables(qkvInput, biasInput, output)} ${shaderHelper.mainStart()} - ${shaderHelper.guardAgainstOutOfBoundsWorkgroupSizes(outputSize)} - let biasOffsetIdx = (global_idx % hiddenSize) + biasOffset; + ${shaderHelper.guardAgainstOutOfBoundsWorkgroupSizes('uniforms.output_size')} + let bias_offset_idx = (global_idx % 
uniforms.hidden_size) + uniforms.bias_offset; - qkv_with_bias[global_idx] = qkv[global_idx] + bias[biasOffsetIdx]; + qkv_with_bias[global_idx] = qkv[global_idx] + bias[bias_offset_idx]; }`; + }; return context.compute( { name: 'MultiHeadAttentionAddBias', - shaderCache: {hint: JSON.stringify({batchSize, sequenceLength, hiddenSize, biasOffset})}, + shaderCache: {inputDependencies: ['type', 'type']}, getRunData: () => ({ outputs: [{dims: outputShape, dataType: qkv.dataType, gpuDataType: GpuDataType.default}], dispatchGroup: {x: Math.ceil(outputSize / 64 /* workgroup size */)}, + programUniforms }), getShaderSource, }, diff --git a/js/web/lib/wasm/jsep/webgpu/ops/pad.ts b/js/web/lib/wasm/jsep/webgpu/ops/pad.ts index 18859e253aa02..eca3fa7d944bb 100644 --- a/js/web/lib/wasm/jsep/webgpu/ops/pad.ts +++ b/js/web/lib/wasm/jsep/webgpu/ops/pad.ts @@ -1,15 +1,14 @@ // Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. -import {DataType} from '../../../wasm-common'; +import {DataType, tensorDataTypeEnumToString} from '../../../wasm-common'; import {TensorView} from '../../tensor-view'; import {ShapeUtil} from '../../util'; -import {AttributeWithCacheKey, createAttributeWithCacheKey} from '../attribute-with-cache-key'; -import {ComputeContext, ProgramInfo} from '../types'; +import {ComputeContext, ProgramInfo, ProgramInputTensorInfoDependency, ProgramUniform} from '../types'; -import {IndicesHelper, inputVariable, outputVariable, ShaderHelper} from './common'; +import {createTensorShapeVariables, getElementAt, IndicesHelper, inputVariable, outputVariable, ShaderHelper, UniformDataElementType, UniformsArrayType} from './common'; -export interface PadAttributes extends AttributeWithCacheKey { +interface PadAttributes { // 0-constant, 1-reflect, 2-edge, 3-wrap readonly mode: number; readonly value: number; @@ -35,27 +34,23 @@ const validateInputs = (inputs: readonly TensorView[]): void => { } }; -const getPadConstant = - (output: 
IndicesHelper, inputDims: readonly number[], inputStrides: readonly number[], pads: number[], - dataType: string, constantValue: number): string => { - const inputRank = inputDims.length; - - let block = ''; - for (let i = inputRank - 1; i >= 0; --i) { - block += ` - k = i32(${output.indicesGet('indices', i)}) - ${pads[i]}; +const getPadConstant = (output: IndicesHelper, inputRank: number, padsLength: number): string => { + let block = ''; + for (let i = inputRank - 1; i >= 0; --i) { + block += ` + k = i32(${output.indicesGet('indices', i)}) - ${getElementAt('uniforms.pads', i, padsLength)}; if (k < 0) { break; } - if (k >= ${inputDims[i]}) { + if (k >= i32(${getElementAt('uniforms.x_shape', i, inputRank)})) { break; } - offset += k * ${inputStrides[i]}; + offset += k * i32(${getElementAt('uniforms.x_strides', i, inputRank)}); `; - } + } - return ` - value = ${dataType}(${constantValue}); + return ` + value = ${output.type.value}(uniforms.constant_value); for (var i = 0; i < 1; i++) { var offset = 0; var k = 0; @@ -63,143 +58,143 @@ const getPadConstant = value = x[offset]; } `; - }; - -const getPadReflect = - (output: IndicesHelper, inputDims: readonly number[], inputStrides: readonly number[], pads: number[]): string => { - const inputRank = inputDims.length; +}; - let block = ''; - for (let i = inputRank - 1; i >= 0; --i) { - block += ` - k = i32(${output.indicesGet('indices', i)}) - ${pads[i]}; +const getPadReflect = (output: IndicesHelper, inputRank: number, padsLength: number): string => { + let block = ''; + for (let i = inputRank - 1; i >= 0; --i) { + block += ` + k = i32(${output.indicesGet('indices', i)}) - ${getElementAt('uniforms.pads', i, padsLength)}; if (k < 0) { k = -k; } { - let _2n_1 = ${2 * (inputDims[i] - 1)}; + let _2n_1 = 2 * (i32(${getElementAt('uniforms.x_shape', i, inputRank)}) - 1); k = k % _2n_1; - if(k >= ${inputDims[i]}) { + if(k >= i32(${getElementAt('uniforms.x_shape', i, inputRank)})) { k = _2n_1 - k; } } - offset += k * 
${inputStrides[i]}; + offset += k * i32(${getElementAt('uniforms.x_strides', i, inputRank)}); `; - } + } - return ` + return ` var offset = 0; var k = 0; ${block} value = x[offset]; `; - }; - -const getPadEdge = - (output: IndicesHelper, inputDims: readonly number[], inputStrides: readonly number[], pads: number[]): string => { - const inputRank = inputDims.length; +}; - let block = ''; - for (let i = inputRank - 1; i >= 0; --i) { - block += ` - k = i32(${output.indicesGet('indices', i)}) - ${pads[i]}; +const getPadEdge = (output: IndicesHelper, inputRank: number, padsLength: number): string => { + let block = ''; + for (let i = inputRank - 1; i >= 0; --i) { + block += ` + k = i32(${output.indicesGet('indices', i)}) - ${getElementAt('uniforms.pads', i, padsLength)}; if (k < 0) { k = 0; } - if (k >= ${inputDims[i]}) { - k = ${inputDims[i] - 1}; + if (k >= i32(${getElementAt('uniforms.x_shape', i, inputRank)})) { + k = i32(${getElementAt('uniforms.x_shape', i, inputRank)}) - 1; } - offset += k * ${inputStrides[i]}; + offset += k * i32(${getElementAt('uniforms.x_strides', i, inputRank)}); `; - } + } - return ` + return ` var offset = 0; var k = 0; ${block} value = x[offset]; `; - }; - -const getPadWrap = - (output: IndicesHelper, inputDims: readonly number[], inputStrides: readonly number[], pads: number[]): string => { - const inputRank = inputDims.length; +}; - let block = ''; - for (let i = inputRank - 1; i >= 0; --i) { - block += ` - k = i32(${output.indicesGet('indices', i)}) - ${pads[i]}; +const getPadWrap = (output: IndicesHelper, inputRank: number, padsLength: number): string => { + let block = ''; + for (let i = inputRank - 1; i >= 0; --i) { + block += ` + k = i32(${output.indicesGet('indices', i)}) - ${getElementAt('uniforms.pads', i, padsLength)}; if (k < 0) { - k += ${inputDims[i]}; + k += i32(${getElementAt('uniforms.x_shape', i, inputRank)}); } - if (k >= ${inputDims[i]}) { - k -= ${inputDims[i]}; + if (k >= i32(${getElementAt('uniforms.x_shape', i,
inputRank)})) { + k -= i32(${getElementAt('uniforms.x_shape', i, inputRank)}); } - offset += k * ${inputStrides[i]}; + offset += k * i32(${getElementAt('uniforms.x_strides', i, inputRank)}); `; - } + } - return ` + return ` var offset = 0; var k = 0; ${block} value = x[offset]; `; - }; - -const getPadSnippet = - (output: IndicesHelper, inputDims: readonly number[], inputStrides: readonly number[], attributes: PadAttributes, - dataType: string): string => { - switch (attributes.mode) { - case 0: - return getPadConstant(output, inputDims, inputStrides, attributes.pads, dataType, attributes.value); - case 1: - return getPadReflect(output, inputDims, inputStrides, attributes.pads); - case 2: - return getPadEdge(output, inputDims, inputStrides, attributes.pads); - case 3: - return getPadWrap(output, inputDims, inputStrides, attributes.pads); - default: - throw new Error('Invalid mode'); - } - }; - -const generatePadCode = - (shaderHelper: ShaderHelper, inputs: readonly TensorView[], attributes: PadAttributes, dataType: string): - string => { - const inputDims = inputs[0].dims; - const outputDims = ShapeUtil.padShape(inputDims.slice(), attributes.pads); - const outputSize = ShapeUtil.size(outputDims); - const inputStrides = ShapeUtil.computeStrides(inputDims); - - const output = outputVariable('output', inputs[0].dataType, outputDims); - const input = inputVariable('x', inputs[0].dataType, inputDims); - - const padSnippet = getPadSnippet(output, inputDims, inputStrides, attributes, dataType); - const padCode = ` - ${shaderHelper.declareVariables(input, output)} - ${shaderHelper.mainStart()} - ${shaderHelper.guardAgainstOutOfBoundsWorkgroupSizes(outputSize)} - - let indices = ${output.offsetToIndices('global_idx')}; - - var value = ${dataType}(0); - ${padSnippet} - output[global_idx] = value; - }`; - return padCode; - }; +}; + +const getPadSnippet = (output: IndicesHelper, inputRank: number, attributes: PadAttributes): string => { + switch (attributes.mode) { + case 0: + 
return getPadConstant(output, inputRank, attributes.pads.length); + case 1: + return getPadReflect(output, inputRank, attributes.pads.length); + case 2: + return getPadEdge(output, inputRank, attributes.pads.length); + case 3: + return getPadWrap(output, inputRank, attributes.pads.length); + default: + throw new Error('Invalid mode'); + } +}; const createPadProgramInfo = (inputs: readonly TensorView[], attributes: PadAttributes): ProgramInfo => { const outputShape = ShapeUtil.padShape(inputs[0].dims.slice(), attributes.pads); + const inputDims = inputs[0].dims; + const outputSize = ShapeUtil.size(outputShape); + const programUniforms: ProgramUniform[] = + [{type: 'uint32', data: outputSize}, {type: 'uint32', data: attributes.pads}]; + if (attributes.mode === 0) { + const tensorDataType = tensorDataTypeEnumToString(inputs[0].dataType) as ProgramUniform['type']; + programUniforms.push({type: tensorDataType, data: attributes.value}); + } + + programUniforms.push(...createTensorShapeVariables(inputs[0].dims), ...createTensorShapeVariables(outputShape)); + const inputDependencies: ProgramInputTensorInfoDependency[] = ['rank']; + + const getShaderSource = (shaderHelper: ShaderHelper) => { + const output = outputVariable('output', inputs[0].dataType, outputShape.length); + const input = inputVariable('x', inputs[0].dataType, inputDims.length); + const dataType = input.type.value; + const padSnippet = getPadSnippet(output, inputDims.length, attributes); + const uniforms: UniformsArrayType = + [{name: 'output_size', type: 'u32'}, {name: 'pads', type: 'i32', length: attributes.pads.length}]; + if (attributes.mode === 0) { + uniforms.push({name: 'constant_value', type: dataType as UniformDataElementType}); + } + + return ` + ${shaderHelper.registerUniforms(uniforms).declareVariables(input, output)} + ${shaderHelper.mainStart()} + ${shaderHelper.guardAgainstOutOfBoundsWorkgroupSizes('uniforms.output_size')} + + let indices = ${output.offsetToIndices('global_idx')}; + + var 
value = ${dataType}(0); + ${padSnippet} + output[global_idx] = value; + }`; + }; + return { name: 'Pad', - shaderCache: {hint: attributes.cacheKey}, + shaderCache: {hint: `${attributes.mode}`, inputDependencies}, getRunData: () => ({ outputs: [{dims: outputShape, dataType: inputs[0].dataType}], - dispatchGroup: {x: Math.ceil(ShapeUtil.size(outputShape) / 64 /* workgroup size */)} + dispatchGroup: {x: Math.ceil(ShapeUtil.size(outputShape) / 64 /* workgroup size */)}, + programUniforms }), - getShaderSource: shaderHelper => generatePadCode(shaderHelper, inputs, attributes, 'f32'), + getShaderSource, }; }; @@ -223,7 +218,7 @@ const createPadAttributesFromInputs = (inputs: readonly TensorView[], attributes const pads: number[] = []; updatePads.forEach(v => pads.push(v)); - return createAttributeWithCacheKey({mode: attributes.mode, value, pads}); + return {mode: attributes.mode, value, pads}; } else { return attributes; } @@ -234,10 +229,3 @@ export const pad = (context: ComputeContext, attributes: PadAttributes): void => const updatedAttributes = createPadAttributesFromInputs(context.inputs, attributes); context.compute(createPadProgramInfo(context.inputs, updatedAttributes), {inputs: [0]}); }; - -export const parsePadAttributes = (attributes: Record): PadAttributes => { - const mode = attributes.mode as number; - const value = attributes.value as number; - const pads = attributes.pads as number[]; - return createAttributeWithCacheKey({mode, value, pads}); -}; diff --git a/js/web/lib/wasm/jsep/webgpu/ops/range.ts b/js/web/lib/wasm/jsep/webgpu/ops/range.ts index 9cf66111bf707..ed04b0f94bc57 100644 --- a/js/web/lib/wasm/jsep/webgpu/ops/range.ts +++ b/js/web/lib/wasm/jsep/webgpu/ops/range.ts @@ -3,10 +3,10 @@ import {env} from 'onnxruntime-common'; -import {DataType} from '../../../wasm-common'; -import {ComputeContext, ProgramInfo} from '../types'; +import {DataType, tensorDataTypeEnumToString} from '../../../wasm-common'; +import {ComputeContext, ProgramInfo, 
ProgramUniform} from '../types'; -import {outputVariable, ShaderHelper} from './common'; +import {createTensorShapeVariables, outputVariable, ShaderHelper, UniformDataElementType, UniformsArrayType} from './common'; const validateInputsContent = (start: number, limit: number, delta: number): void => { const sameStartLimit = start === limit; @@ -22,23 +22,36 @@ const createRangeProgramInfo = (start: number, limit: number, delta: number, dat const numElements = Math.abs(Math.ceil((limit - start) / delta)); const outputShape: number[] = [numElements]; const outputSize = numElements; + const tensorDataType = tensorDataTypeEnumToString(dataType) as ProgramUniform['type']; + const programUniforms: ProgramUniform[] = [ + {type: 'uint32', data: outputSize}, {type: tensorDataType, data: start}, {type: tensorDataType, data: delta}, + ...createTensorShapeVariables(outputShape) + ]; - const output = outputVariable('output', dataType, outputShape); - const wgslType = output.type.storage; - - const getShaderSource = (shaderHelper: ShaderHelper) => ` - ${shaderHelper.declareVariables(output)} + const getShaderSource = (shaderHelper: ShaderHelper) => { + const output = outputVariable('output', dataType, outputShape.length); + const wgslType = output.type.value; + const uniforms: UniformsArrayType = [ + {name: 'outputSize', type: 'u32'}, {name: 'start', type: wgslType as UniformDataElementType}, + {name: 'delta', type: wgslType as UniformDataElementType} + ]; + return ` + ${shaderHelper.registerUniforms(uniforms).declareVariables(output)} ${shaderHelper.mainStart()} - ${shaderHelper.guardAgainstOutOfBoundsWorkgroupSizes(outputSize)} - output[global_idx] = ${wgslType}(${start}) + ${wgslType}(global_idx) * ${wgslType}(${delta}); + ${shaderHelper.guardAgainstOutOfBoundsWorkgroupSizes('uniforms.outputSize')} + output[global_idx] = uniforms.start + ${wgslType}(global_idx) * uniforms.delta; }`; + }; + return { name: 'Range', - shaderCache: {hint: [start, limit, delta].map(x => 
x.toString()).join('_')}, + shaderCache: {hint: `${dataType}`}, getShaderSource, - getRunData: () => ( - {outputs: [{dims: outputShape, dataType}], - dispatchGroup: {x: Math.ceil(outputSize / 64 /* workgroup size */)}}) + getRunData: () => ({ + outputs: [{dims: outputShape, dataType}], + dispatchGroup: {x: Math.ceil(outputSize / 64 /* workgroup size */)}, + programUniforms + }) }; }; diff --git a/js/web/script/build.ts b/js/web/script/build.ts index 5151f27582c1f..a52ac4454a5c1 100644 --- a/js/web/script/build.ts +++ b/js/web/script/build.ts @@ -44,6 +44,7 @@ const SOURCE_ROOT_FOLDER = path.join(__dirname, '../..'); // /js/ const DEFAULT_DEFINE = { 'BUILD_DEFS.DISABLE_WEBGL': 'false', 'BUILD_DEFS.DISABLE_WEBGPU': 'false', + 'BUILD_DEFS.DISABLE_WEBNN': 'false', 'BUILD_DEFS.DISABLE_WASM': 'false', 'BUILD_DEFS.DISABLE_WASM_PROXY': 'false', 'BUILD_DEFS.DISABLE_WASM_THREAD': 'false', @@ -359,6 +360,7 @@ async function main() { ...DEFAULT_DEFINE, 'BUILD_DEFS.DISABLE_WEBGPU': 'true', 'BUILD_DEFS.DISABLE_WEBGL': 'true', + 'BUILD_DEFS.DISABLE_WEBNN': 'true', 'BUILD_DEFS.DISABLE_WASM_PROXY': 'true', 'BUILD_DEFS.DISABLE_WASM_THREAD': 'true', }, @@ -367,10 +369,7 @@ async function main() { if (BUNDLE_MODE === 'dev') { // ort.all.js - await addBuildTask(buildOrt({ - outputBundleName: 'ort.all', - format: 'iife', - })); + await addBuildTask(buildOrt({outputBundleName: 'ort.all', format: 'iife', define: {...DEFAULT_DEFINE}})); } if (BUNDLE_MODE === 'perf') { @@ -394,7 +393,7 @@ async function main() { // ort.webgpu[.min].js await addAllWebBuildTasks({ outputBundleName: 'ort.webgpu', - define: {...DEFAULT_DEFINE, 'BUILD_DEFS.DISABLE_WEBGL': 'true'}, + define: {...DEFAULT_DEFINE, 'BUILD_DEFS.DISABLE_WEBGL': 'true', 'BUILD_DEFS.DISABLE_WEBNN': 'true'}, }); // ort.wasm[.min].js await addAllWebBuildTasks({ @@ -404,7 +403,12 @@ async function main() { // ort.webgl[.min].js await addAllWebBuildTasks({ outputBundleName: 'ort.webgl', - define: {...DEFAULT_DEFINE, 
'BUILD_DEFS.DISABLE_WEBGPU': 'true', 'BUILD_DEFS.DISABLE_WASM': 'true'}, + define: { + ...DEFAULT_DEFINE, + 'BUILD_DEFS.DISABLE_WEBGPU': 'true', + 'BUILD_DEFS.DISABLE_WASM': 'true', + 'BUILD_DEFS.DISABLE_WEBNN': 'true', + }, }); // ort.wasm-core[.min].js await addAllWebBuildTasks({ @@ -413,6 +417,7 @@ async function main() { ...DEFAULT_DEFINE, 'BUILD_DEFS.DISABLE_WEBGPU': 'true', 'BUILD_DEFS.DISABLE_WEBGL': 'true', + 'BUILD_DEFS.DISABLE_WEBNN': 'true', 'BUILD_DEFS.DISABLE_WASM_PROXY': 'true', 'BUILD_DEFS.DISABLE_WASM_THREAD': 'true', }, @@ -425,6 +430,7 @@ async function main() { 'BUILD_DEFS.DISABLE_TRAINING': 'false', 'BUILD_DEFS.DISABLE_WEBGPU': 'true', 'BUILD_DEFS.DISABLE_WEBGL': 'true', + 'BUILD_DEFS.DISABLE_WEBNN': 'true', }, }); } diff --git a/js/web/script/test-runner-cli-args.ts b/js/web/script/test-runner-cli-args.ts index ee955ec8d4f17..fc74adfed1fee 100644 --- a/js/web/script/test-runner-cli-args.ts +++ b/js/web/script/test-runner-cli-args.ts @@ -79,6 +79,7 @@ Options: --webgl-texture-cache-mode Set the WebGL texture cache mode (initializerOnly/full) --webgl-texture-pack-mode Set the WebGL texture pack mode (true/false) --webgpu-profiling-mode Set the WebGPU profiling mode (off/default) + --webnn-device-type Set the WebNN device type (cpu/gpu) *** Browser Options *** @@ -174,6 +175,7 @@ export interface TestRunnerCliArgs { cudaFlags?: Record; wasmOptions?: InferenceSession.WebAssemblyExecutionProviderOption; webglOptions?: InferenceSession.WebGLExecutionProviderOption; + webnnOptions?: InferenceSession.WebNNExecutionProviderOption; globalEnvFlags?: Test.Options['globalEnvFlags']; noSandbox?: boolean; chromiumFlags: string[]; @@ -335,6 +337,14 @@ function parseWebgpuFlags(args: minimist.ParsedArgs): Partial { return {profilingMode, validateInputContent}; } +function parseWebNNOptions(args: minimist.ParsedArgs): InferenceSession.WebNNExecutionProviderOption { + const deviceType = args['webnn-device-type']; + if (deviceType !== undefined && deviceType !== 
'cpu' && deviceType !== 'gpu') { + throw new Error('Flag "webnn-device-type" is invalid'); + } + return {name: 'webnn', deviceType}; +} + function parseGlobalEnvFlags(args: minimist.ParsedArgs): NonNullable { const wasm = parseWasmFlags(args); const webgl = parseWebglFlags(args); @@ -449,6 +459,7 @@ export function parseTestRunnerCliArgs(cmdlineArgs: string[]): TestRunnerCliArgs const wasmOptions = parseWasmOptions(args); const webglOptions = parseWebglOptions(args); + const webnnOptions = parseWebNNOptions(args); // Option: --no-sandbox const noSandbox = !!args['no-sandbox']; @@ -487,6 +498,7 @@ export function parseTestRunnerCliArgs(cmdlineArgs: string[]): TestRunnerCliArgs fileCache, cpuOptions, webglOptions, + webnnOptions, wasmOptions, globalEnvFlags, noSandbox, diff --git a/js/web/script/test-runner-cli.ts b/js/web/script/test-runner-cli.ts index 74a03290332a8..d56792c6e3595 100644 --- a/js/web/script/test-runner-cli.ts +++ b/js/web/script/test-runner-cli.ts @@ -165,6 +165,7 @@ async function main() { debug: args.debug, cpuOptions: args.cpuOptions, webglOptions: args.webglOptions, + webnnOptions: args.webnnOptions, wasmOptions: args.wasmOptions, globalEnvFlags: args.globalEnvFlags } @@ -499,7 +500,7 @@ async function main() { args.bundleMode === 'perf' ? 'perf' : args.debug ? 
'debug' : 'test', - webgpu, webnn); + webgpu); const karmaArgs = ['karma', 'start', `--browsers ${browser}`]; const chromiumFlags = ['--enable-features=SharedArrayBuffer', ...args.chromiumFlags]; if (args.debug) { @@ -614,11 +615,10 @@ async function main() { fs.writeJSONSync(path.join(TEST_ROOT, './testdata-config.json'), config); } - function getBrowserNameFromEnv( - env: TestRunnerCliArgs['env'], mode: 'debug'|'perf'|'test', webgpu: boolean, webnn: boolean) { + function getBrowserNameFromEnv(env: TestRunnerCliArgs['env'], mode: 'debug'|'perf'|'test', webgpu: boolean) { switch (env) { case 'chrome': - return selectChromeBrowser(mode, webgpu, webnn); + return selectChromeBrowser(mode, webgpu); case 'edge': return 'EdgeTest'; case 'firefox': @@ -634,10 +634,8 @@ async function main() { } } - function selectChromeBrowser(mode: 'debug'|'perf'|'test', webgpu: boolean, webnn: boolean) { - if (webnn) { - return 'ChromeCanaryTest'; - } else if (webgpu) { + function selectChromeBrowser(mode: 'debug'|'perf'|'test', webgpu: boolean) { + if (webgpu) { return 'ChromeTest'; } else { switch (mode) { diff --git a/js/web/test/data/ops/conv.jsonc b/js/web/test/data/ops/conv.jsonc index 2e8eaaba191d0..cc10df5864233 100644 --- a/js/web/test/data/ops/conv.jsonc +++ b/js/web/test/data/ops/conv.jsonc @@ -298,7 +298,157 @@ } ] }, - + { + "name": "conv - vectorize group - A", + "operator": "Conv", + "inputShapeDefinitions": "rankOnly", + "opset": { "domain": "", "version": 17 }, + "attributes": [ + { "name": "kernel_shape", "data": [1, 1], "type": "ints" }, + { "name": "group", "data": 2, "type": "int" } + ], + "cases": [ + { + "name": "T[0]", + "inputs": [ + { + "data": [0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0], + "dims": [1, 2, 3, 3], + "type": "float32" + }, + { + "data": [1.0, 2.0], + "dims": [2, 1, 1, 1], + "type": "float32" + } + ], + "outputs": [ + { + "data": [0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 18.0, 20.0, 22.0, 
24.0, 26.0, 28.0, 30.0, 32.0, 34.0], + "dims": [1, 2, 3, 3], + "type": "float32" + } + ] + } + ] + }, + { + "name": "conv - vectorize group - B", + "operator": "Conv", + "inputShapeDefinitions": "rankOnly", + "opset": { "domain": "", "version": 17 }, + "attributes": [ + { "name": "kernel_shape", "data": [2, 2], "type": "ints" }, + { "name": "group", "data": 3, "type": "int" } + ], + "cases": [ + { + "name": "T[0]", + "inputs": [ + { + "data": [ + 0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0, 18.0, + 19.0, 20.0, 21.0, 22.0, 23.0, 0, 0, 0 + ], + "dims": [1, 3, 3, 3], + "type": "float32" + }, + { + "data": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0], + "dims": [3, 1, 2, 2], + "type": "float32" + }, + { + "data": [0.1, 0.2, 0.3], + "dims": [3], + "type": "float32" + } + ], + "outputs": [ + { + "data": [27.1, 37.1, 57.1, 67.1, 293.2, 319.2, 371.2, 397.2, 847.3, 889.3, 409.3, 428.3], + "dims": [1, 3, 2, 2], + "type": "float32" + } + ] + } + ] + }, + { + "name": "conv - vectorize group - C", + "operator": "Conv", + "inputShapeDefinitions": "rankOnly", + "opset": { "domain": "", "version": 17 }, + "attributes": [ + { "name": "kernel_shape", "data": [2, 2], "type": "ints" }, + { "name": "group", "data": 3, "type": "int" } + ], + "cases": [ + { + "name": "T[0]", + "inputs": [ + { + "data": [ + 0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0, 18.0, + 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, 31.0, 32.0, 33.0, 34.0, 35.0 + ], + "dims": [1, 3, 3, 4], + "type": "float32" + }, + { + "data": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0], + "dims": [3, 1, 2, 2], + "type": "float32" + } + ], + "outputs": [ + { + "data": [34, 44, 54, 74, 84, 94, 386, 412, 438, 490, 516, 542, 1122, 1164, 1206, 1290, 1332, 1374], + "dims": [1, 3, 2, 3], + "type": "float32" + } + ] + } + ] + }, + { + "name": "conv - vectorize group - D", 
+ "operator": "Conv", + "inputShapeDefinitions": "rankOnly", + "opset": { "domain": "", "version": 17 }, + "attributes": [ + { "name": "kernel_shape", "data": [2, 2], "type": "ints" }, + { "name": "group", "data": 3, "type": "int" }, + { "name": "strides", "data": [2, 2], "type": "ints" } + ], + "cases": [ + { + "name": "T[0] strides = [2, 2]", + "inputs": [ + { + "data": [ + 0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0, 18.0, + 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, 31.0, 32.0, 33.0, 34.0, 35.0 + ], + "dims": [1, 3, 3, 4], + "type": "float32" + }, + { + "data": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0], + "dims": [3, 1, 2, 2], + "type": "float32" + } + ], + "outputs": [ + { + "data": [34, 54, 386, 438, 1122, 1206], + "dims": [1, 3, 1, 2], + "type": "float32" + } + ] + } + ] + }, { "name": "conv - pointwise", "operator": "Conv", diff --git a/js/web/test/data/ops/instance-norm.jsonc b/js/web/test/data/ops/instance-norm.jsonc index 6a4e6912405ee..e89ac2da3795f 100644 --- a/js/web/test/data/ops/instance-norm.jsonc +++ b/js/web/test/data/ops/instance-norm.jsonc @@ -38,6 +38,79 @@ } ] }, + { + "name": "Simple test with NHWC, components 1", + "operator": "InstanceNormalization", + "inputShapeDefinitions": "rankOnly", + "opset": { "domain": "", "version": 17 }, + "cases": [ + { + "name": "Simple test", + "inputs": [ + { + "data": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 9, 8, 7, 6, 5], + "dims": [1, 5, 3, 1], + "type": "float32" + }, + { + "data": [1, 2, 3, 4, 5], + "dims": [5], + "type": "float32" + }, + { + "data": [4, 5, 6, 7, 8], + "dims": [5], + "type": "float32" + } + ], + "outputs": [ + { + "data": [ + 2.775264263153076, 4, 5.224735260009766, 2.5505285263061523, 5, 7.449470520019531, 2.325794219970703, 6, + 9.674205780029297, 11.898944854736328, 7, 2.1010589599609375, 14.123676300048828, 8, 1.876321792602539 + ], + "dims": [1, 5, 3, 1], + "type": "float32" + } + ] + } 
+ ] + }, + { + "name": "Simple test with NHWC, components 2", + "operator": "InstanceNormalization", + "inputShapeDefinitions": "rankOnly", + "opset": { "domain": "", "version": 17 }, + "cases": [ + { + "name": "Simple test", + "inputs": [ + { + "data": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 9, 8], + "dims": [2, 6, 1, 1], + "type": "float32" + }, + { + "data": [1, 2, 3, 4, 5, 6], + "dims": [6], + "type": "float32" + }, + { + "data": [4, 5, 6, 7, 8, 9], + "dims": [6], + "type": "float32" + } + ], + "outputs": [ + { + "data": [4, 5, 6, 7, 8, 9, 4, 5, 6, 7, 8, 9], + "dims": [2, 6, 1, 1], + "type": "float32" + } + ] + } + ] + }, { "name": "Simple test with NCHW", "operator": "InstanceNormalization", @@ -75,5 +148,81 @@ ] } ] + }, + { + "name": "Simple test with NCHW, components 1", + "operator": "InstanceNormalization", + "opset": { "domain": "", "version": 17 }, + "cases": [ + { + "name": "Simple test", + "inputs": [ + { + "data": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 9, 8, 7, 6, 5], + "dims": [1, 5, 3, 1], + "type": "float32" + }, + { + "data": [1, 2, 3, 4, 5], + "dims": [5], + "type": "float32" + }, + { + "data": [4, 5, 6, 7, 8], + "dims": [5], + "type": "float32" + } + ], + "outputs": [ + { + "data": [ + 2.775264263153076, 4, 5.224735260009766, 2.5505285263061523, 5, 7.449470520019531, 2.325794219970703, 6, + 9.674205780029297, 11.898944854736328, 7, 2.1010589599609375, 14.123676300048828, 8, 1.876321792602539 + ], + "dims": [1, 5, 3, 1], + "type": "float32" + } + ] + } + ] + }, + { + "name": "Simple test with NCHW, components 2", + "operator": "InstanceNormalization", + "opset": { "domain": "", "version": 17 }, + "cases": [ + { + "name": "Simple test", + "inputs": [ + { + "data": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 9, 8, 7, 6, 5, 4, 3, 2], + "dims": [1, 3, 6, 1], + "type": "float32" + }, + { + "data": [1, 2, 3], + "dims": [3], + "type": "float32" + }, + { + "data": [4, 5, 6], + "dims": [3], + "type": "float32" + } + ], + "outputs": [ + { + "data": [ + 2.5361523628234863, 
3.1216912269592285, 3.70723032951355, 4.292769432067871, 4.878308296203613, + 5.4638471603393555, 1.8666191101074219, 3.9555397033691406, 6.044460296630859, 8.133380889892578, + 6.044460296630859, 3.9555397033691406, 10.3915433883667, 8.634925842285156, 6.878308296203613, + 5.121691703796387, 3.365074634552002, 1.6084575653076172 + ], + "dims": [1, 3, 6, 1], + "type": "float32" + } + ] + } + ] } ] diff --git a/js/web/test/suite-test-list.jsonc b/js/web/test/suite-test-list.jsonc index 594ce9feed31e..79f42e36bf390 100644 --- a/js/web/test/suite-test-list.jsonc +++ b/js/web/test/suite-test-list.jsonc @@ -1501,85 +1501,424 @@ "webnn": { "onnx": ["resnet50", "squeezenet", "tiny_yolov2", "emotion_ferplus"], "node": [ - // Check in node tests that have native Wasm implementations. - // (i.e.) not tests that rely on the fallback cpu implementations. - // Use the 'cpu' level of node tests to test those implementations. + "test_abs", + "test_acos_example", + "test_acos", + "test_acosh_example", + "test_acosh", + // // "test_adagrad_multiple", + // // "test_adagrad", + // // "test_adam_multiple", + // // "test_adam", "test_add_bcast", + // "test_add_uint8", "test_add", - "test_sub_bcast", - "test_sub_example", - "test_sub", - "test_mul_bcast", - "test_mul_example", - "test_mul", - "test_div_bcast", - "test_div_example", - "test_div", - "test_xor_bcast3v1d", - "test_xor_bcast3v2d", - "test_xor_bcast4v2d", - "test_xor_bcast4v3d", - "test_xor_bcast4v4d", - "test_xor2d", - "test_xor3d", - "test_xor4d", - "test_or_bcast3v1d", - "test_or_bcast3v2d", - "test_or_bcast4v2d", - "test_or_bcast4v3d", - "test_or_bcast4v4d", - "test_and_bcast3v1d", - "test_and_bcast3v2d", - "test_and_bcast4v2d", - "test_and_bcast4v3d", - "test_and_bcast4v4d", - "test_and2d", - "test_and3d", - "test_and4d", - "test_prelu_broadcast", - "test_prelu_example", + // "test_and_bcast3v1d", + // "test_and_bcast3v2d", + // "test_and_bcast4v2d", + // "test_and_bcast4v3d", + // "test_and_bcast4v4d", + // 
"test_and2d", + // "test_and3d", + // "test_and4d", + // "test_argmax_default_axis_example_select_last_index", + // "test_argmax_default_axis_example", + // "test_argmax_default_axis_random_select_last_index", + // "test_argmax_default_axis_random", + // "test_argmax_keepdims_example_select_last_index", + // "test_argmax_keepdims_example", + // "test_argmax_keepdims_random_select_last_index", + // "test_argmax_keepdims_random", + // "test_argmax_negative_axis_keepdims_example_select_last_index", + // "test_argmax_negative_axis_keepdims_example", + // "test_argmax_negative_axis_keepdims_random_select_last_index", + // "test_argmax_negative_axis_keepdims_random", + // "test_argmax_no_keepdims_example_select_last_index", + // "test_argmax_no_keepdims_example", + // "test_argmax_no_keepdims_random_select_last_index", + // "test_argmax_no_keepdims_random", + // "test_argmin_default_axis_example_select_last_index", + // "test_argmin_default_axis_example", + // "test_argmin_default_axis_random_select_last_index", + // "test_argmin_default_axis_random", + // "test_argmin_keepdims_example_select_last_index", + // "test_argmin_keepdims_example", + // "test_argmin_keepdims_random_select_last_index", + // "test_argmin_keepdims_random", + // "test_argmin_negative_axis_keepdims_example_select_last_index", + // "test_argmin_negative_axis_keepdims_example", + // "test_argmin_negative_axis_keepdims_random_select_last_index", + // "test_argmin_negative_axis_keepdims_random", + // "test_argmin_no_keepdims_example_select_last_index", + // "test_argmin_no_keepdims_example", + // "test_argmin_no_keepdims_random_select_last_index", + // "test_argmin_no_keepdims_random", + // "test_asin_example", + // "test_asin", + // "test_asinh_example", + // "test_asinh", + // "test_atan_example", + // "test_atan", + // "test_atanh_example", + // "test_atanh", + // "test_averagepool_1d_default", + // "test_averagepool_2d_ceil", + "test_averagepool_2d_default", + 
"test_averagepool_2d_pads_count_include_pad", + "test_averagepool_2d_pads", + "test_averagepool_2d_precomputed_pads_count_include_pad", + "test_averagepool_2d_precomputed_pads", + "test_averagepool_2d_precomputed_same_upper", + "test_averagepool_2d_precomputed_strides", + "test_averagepool_2d_same_lower", + "test_averagepool_2d_same_upper", + "test_averagepool_2d_strides", + // "test_averagepool_3d_default", "test_basic_conv_with_padding", "test_basic_conv_without_padding", - "test_batchnorm_epsilon", - "test_batchnorm_example", - "opset{10,11,12}/test_cast_STRING_to_FLOAT", - "test_clip_splitbounds", - "test_clip_outbounds", - "test_clip_inbounds", - "test_clip_example", - "test_clip_default_min", - "test_clip_default_max", + // "test_basic_convinteger", + // "test_batchnorm_epsilon_training_mode", + // "test_batchnorm_epsilon", + // "test_batchnorm_example_training_mode", + // "test_batchnorm_example", + // // "test_bernoulli_double_expanded", + // // "test_bernoulli_double", + // // "test_bernoulli_expanded", + // // "test_bernoulli_seed_expanded", + // // "test_bernoulli_seed", + // // "test_bernoulli", + // // "test_bitshift_left_uint16", + // // "test_bitshift_left_uint32", + // // "test_bitshift_left_uint64", + // // "test_bitshift_left_uint8", + // // "test_bitshift_right_uint16", + // // "test_bitshift_right_uint32", + // // "test_bitshift_right_uint64", + // // "test_bitshift_right_uint8", + // // "test_blackmanwindow_expanded", + // // "test_blackmanwindow_symmetric_expanded", + // // "test_blackmanwindow_symmetric", + // // "test_blackmanwindow", + // // "test_cast_BFLOAT16_to_FLOAT", + // // "test_cast_DOUBLE_to_FLOAT", + // // "test_cast_DOUBLE_to_FLOAT16", + // // "test_cast_FLOAT_to_BFLOAT16", + // // "test_cast_FLOAT_to_DOUBLE", + // // "test_cast_FLOAT_to_FLOAT16", + // // "test_cast_FLOAT_to_STRING", + // // "test_cast_FLOAT16_to_DOUBLE", + // // "test_cast_FLOAT16_to_FLOAT", + // // "test_cast_STRING_to_FLOAT", + // // 
"test_castlike_BFLOAT16_to_FLOAT_expanded", + // // "test_castlike_BFLOAT16_to_FLOAT", + // // "test_castlike_DOUBLE_to_FLOAT_expanded", + // // "test_castlike_DOUBLE_to_FLOAT", + // // "test_castlike_DOUBLE_to_FLOAT16_expanded", + // // "test_castlike_DOUBLE_to_FLOAT16", + // // "test_castlike_FLOAT_to_BFLOAT16_expanded", + // // "test_castlike_FLOAT_to_BFLOAT16", + // // "test_castlike_FLOAT_to_DOUBLE_expanded", + // // "test_castlike_FLOAT_to_DOUBLE", + // // "test_castlike_FLOAT_to_FLOAT16_expanded", + // // "test_castlike_FLOAT_to_FLOAT16", + // // "test_castlike_FLOAT_to_STRING_expanded", + // // "test_castlike_FLOAT_to_STRING", + // // "test_castlike_FLOAT16_to_DOUBLE_expanded", + // // "test_castlike_FLOAT16_to_DOUBLE", + // // "test_castlike_FLOAT16_to_FLOAT_expanded", + // // "test_castlike_FLOAT16_to_FLOAT", + // // "test_castlike_STRING_to_FLOAT_expanded", + // // "test_castlike_STRING_to_FLOAT", + "test_ceil_example", + "test_ceil", + // "test_celu_expanded", + // "test_celu", "test_clip_default_inbounds", + "test_clip_default_int8_inbounds", + "test_clip_default_int8_max", + "test_clip_default_int8_min", + "test_clip_default_max", + "test_clip_default_min", + "test_clip_example", + "test_clip_inbounds", + "test_clip_outbounds", + "test_clip_splitbounds", "test_clip", + // // "test_compress_0", + // // "test_compress_1", + // // "test_compress_default_axis", + // // "test_compress_negative_axis", + "test_concat_1d_axis_0", + "test_concat_1d_axis_negative_1", + "test_concat_2d_axis_0", + "test_concat_2d_axis_1", + "test_concat_2d_axis_negative_1", + "test_concat_2d_axis_negative_2", + "test_concat_3d_axis_0", + "test_concat_3d_axis_1", + "test_concat_3d_axis_2", + "test_concat_3d_axis_negative_1", + "test_concat_3d_axis_negative_2", + "test_concat_3d_axis_negative_3", + "test_conv_with_autopad_same", "test_conv_with_strides_and_asymmetric_padding", "test_conv_with_strides_no_padding", "test_conv_with_strides_padding", - "test_gemm_nobroadcast", + // // 
"test_convinteger_with_padding", + // // "test_convinteger_without_padding", + // "test_convtranspose_1d", + // // "test_convtranspose_3d", + // "test_convtranspose_autopad_same", + "test_convtranspose_dilations", + "test_convtranspose_kernel_shape", + "opset{9,17}/test_convtranspose_output_shape", + "test_convtranspose_pad", + "test_convtranspose_pads", + "test_convtranspose_with_kernel", + "test_convtranspose", + "test_cos_example", + "test_cos", + // "test_cosh_example", + // "test_cosh", + // "test_cumsum_1d_exclusive", + // "test_cumsum_1d_reverse_exclusive", + // "test_cumsum_1d_reverse", + // "test_cumsum_1d", + // "test_cumsum_2d_axis_0", + // "test_cumsum_2d_axis_1", + // "test_cumsum_2d_negative_axis", + // "test_depthtospace_crd_mode_example", + // "test_depthtospace_crd_mode", + // "test_depthtospace_dcr_mode", + // "test_depthtospace_example", + // "test_depthtospace", + // // "test_dequantizelinear_axis", + // // "test_dequantizelinear", + // // "test_det_2d", + // // "test_det_nd", + // // "test_dft_axis", + // // "test_dft_inverse", + // // "test_dft", + "test_div_bcast", + "test_div_example", + // "test_div_uint8", + "test_div", + // // "test_dropout_default_mask_ratio", + // // "test_dropout_default_mask", + // // "test_dropout_default_old", + // // "test_dropout_default_ratio", + // // "test_dropout_default", + // // "test_dropout_random_old", + // // "test_dropout_random", + // // "test_dynamic_slice_default_axes", + // // "test_dynamic_slice_end_out_of_bounds", + // // "test_dynamic_slice_neg", + // // "test_dynamic_slice_start_out_of_bounds", + // // "test_dynamic_slice", + // // "test_dynamicquantizelinear_expanded", + // // "test_dynamicquantizelinear_max_adjusted_expanded", + // // "test_dynamicquantizelinear_max_adjusted", + // // "test_dynamicquantizelinear_min_adjusted_expanded", + // // "test_dynamicquantizelinear_min_adjusted", + // // "test_dynamicquantizelinear", + // "test_edge_pad", + // "test_einsum_batch_diagonal", + // 
"test_einsum_batch_matmul", + // "test_einsum_inner_prod", + // "test_einsum_sum", + // "test_einsum_transpose", + "test_elu_default", + "test_elu_example", + "test_elu", + // "test_equal_bcast", + // "test_equal", + // "test_erf", + "test_exp_example", + "test_exp", + // "test_expand_dim_changed", + // "test_expand_dim_unchanged", + // "test_eyelike_populate_off_main_diagonal", + // "test_eyelike_with_dtype", + // "test_eyelike_without_dtype", + "test_flatten_axis0", + "test_flatten_axis1", + "test_flatten_axis2", + "test_flatten_axis3", + "test_flatten_default_axis", + "test_flatten_negative_axis1", + "test_flatten_negative_axis2", + "test_flatten_negative_axis3", + "test_flatten_negative_axis4", + "test_floor_example", + "test_floor", + // "test_gather_0", + // "test_gather_1", + // "test_gather_2d_indices", + // "test_gather_negative_indices", + // "test_gather_elements_0", + // "test_gather_elements_1", + // "test_gather_elements_negative_indices", + // "test_gather_negative_indices", + // "test_gathernd_example_float32", + // "test_gathernd_example_int32_batch_dim1", + // "test_gathernd_example_int32", + "test_gemm_all_attributes", + "test_gemm_alpha", + "test_gemm_beta", "test_gemm_broadcast", - "test_matmul_2d", - "test_matmul_3d", - "test_matmul_4d", - "test_softmax_axis_0", - "test_softmax_axis_1", - "test_softmax_axis_2", - "test_softmax_default_axis", - "test_softmax_example", - "test_softmax_large_number", - "test_sum_example", - "test_sum_one_input", - "test_sum_two_inputs", - "test_averagepool_1d_default", - "test_averagepool_2d_default", - "test_averagepool_2d_pads", - "test_averagepool_2d_precomputed_pads", - "test_averagepool_2d_precomputed_same_upper", - "test_averagepool_2d_precomputed_strides", - "test_averagepool_2d_same_upper", - "test_averagepool_2d_same_lower", - "test_averagepool_2d_strides", - "test_averagepool_3d_default", - "test_maxpool_1d_default", + "test_gemm_default_matrix_bias", + "test_gemm_default_no_bias", + // 
"test_gemm_default_scalar_bias", + "test_gemm_default_single_elem_vector_bias", + "test_gemm_default_vector_bias", + "test_gemm_default_zero_bias", + "test_gemm_nobroadcast", + "test_gemm_transposeA", + "test_gemm_transposeB", + "test_globalaveragepool_precomputed", + "test_globalaveragepool", + // "test_globalmaxpool_precomputed", + // "test_globalmaxpool", + // "test_greater_bcast", + // "test_greater_equal_bcast_expanded", + // "test_greater_equal_bcast", + // "test_greater_equal_expanded", + // "test_greater_equal", + // "test_greater", + // // "test_gridsample_aligncorners_true", + // // "test_gridsample_bicubic", + // // "test_gridsample_bilinear", + // // "test_gridsample_border_padding", + // // "test_gridsample_nearest", + // // "test_gridsample_reflection_padding", + // // "test_gridsample_zeros_padding", + // // "test_gridsample", + // // "test_gru_batchwise", + // // "test_gru_defaults", + // // "test_gru_seq_length", + // // "test_gru_with_initial_bias", + // // "test_hammingwindow_expanded", + // // "test_hammingwindow_symmetric_expanded", + // // "test_hammingwindow_symmetric", + // // "test_hammingwindow", + // // "test_hannwindow_expanded", + // // "test_hannwindow_symmetric_expanded", + // // "test_hannwindow_symmetric", + // // "test_hannwindow", + // // "test_hardmax_axis_0", + // // "test_hardmax_axis_1", + // // "test_hardmax_axis_2", + // // "test_hardmax_default_axis", + // // "test_hardmax_example", + // // "test_hardmax_negative_axis", + // // "test_hardmax_one_hot", + // // "test_hardsigmoid_default", + // // "test_hardsigmoid_example", + // // "test_hardsigmoid", + // "test_hardswish_expanded", + "test_hardswish", + // "test_if", + // TODO: Uncomment 'test_if_seq' and 'test_if_opt' once the test infra + // supports Sequence and Optional types + // "test_if_seq", + // "test_if_opt", + "test_instancenorm_epsilon", + // "test_instancenorm_example", + // "test_isinf_negative", + // "test_isinf_positive", + // "test_isinf", + // "test_isnan", 
+ // "test_layer_normalization_2d_axis_negative_1_expanded", + // "test_layer_normalization_2d_axis_negative_1", + // "test_layer_normalization_2d_axis_negative_2_expanded", + // "test_layer_normalization_2d_axis_negative_2", + // "test_layer_normalization_2d_axis0_expanded", + // "test_layer_normalization_2d_axis0", + // "test_layer_normalization_2d_axis1_expanded", + // "test_layer_normalization_2d_axis1", + // // "test_layer_normalization_3d_axis_negative_1_epsilon_expanded", + // "test_layer_normalization_3d_axis_negative_1_epsilon", + // // "test_layer_normalization_3d_axis_negative_2_epsilon_expanded", + // "test_layer_normalization_3d_axis_negative_2_epsilon", + // // "test_layer_normalization_3d_axis_negative_3_epsilon_expanded", + // "test_layer_normalization_3d_axis_negative_3_epsilon", + // // "test_layer_normalization_3d_axis0_epsilon_expanded", + // "test_layer_normalization_3d_axis0_epsilon", + // "test_layer_normalization_3d_axis1_epsilon_expanded", + // "test_layer_normalization_3d_axis1_epsilon", + // // "test_layer_normalization_3d_axis2_epsilon_expanded", + // "test_layer_normalization_3d_axis2_epsilon", + // "test_layer_normalization_4d_axis_negative_1_expanded", + // "test_layer_normalization_4d_axis_negative_1", + // // "test_layer_normalization_4d_axis_negative_2_expanded", + // "test_layer_normalization_4d_axis_negative_2", + // "test_layer_normalization_4d_axis_negative_3_expanded", + // "test_layer_normalization_4d_axis_negative_3", + // "test_layer_normalization_4d_axis_negative_4_expanded", + // "test_layer_normalization_4d_axis_negative_4", + // "test_layer_normalization_4d_axis0_expanded", + // "test_layer_normalization_4d_axis0", + // "test_layer_normalization_4d_axis1_expanded", + // "test_layer_normalization_4d_axis1", + // "test_layer_normalization_4d_axis2_expanded", + // "test_layer_normalization_4d_axis2", + // "test_layer_normalization_4d_axis3_expanded", + // "test_layer_normalization_4d_axis3", + // 
"test_layer_normalization_default_axis_expanded", + // "test_layer_normalization_default_axis", + "test_leakyrelu_default", + "test_leakyrelu_example", + "test_leakyrelu", + // "test_less_bcast", + // "test_less_equal_bcast_expanded", + // "test_less_equal_bcast", + // "test_less_equal_expanded", + // "test_less_equal", + // "test_less", + "test_log_example", + "test_log", + // // "test_logsoftmax_axis_0_expanded", + // // "test_logsoftmax_axis_0", + // // "test_logsoftmax_axis_1_expanded", + // // "test_logsoftmax_axis_1", + // // "test_logsoftmax_axis_2_expanded", + // // "test_logsoftmax_axis_2", + // // "test_logsoftmax_default_axis_expanded", + // // "test_logsoftmax_default_axis", + // // "test_logsoftmax_example_1_expanded", + // // "test_logsoftmax_example_1", + // // "test_logsoftmax_large_number_expanded", + // // "test_logsoftmax_large_number", + // // "test_logsoftmax_negative_axis_expanded", + // // "test_logsoftmax_negative_axis", + // "test_lrn_default", + // "test_lrn", + // // "test_lstm_batchwise", + // // "test_lstm_defaults", + // // "test_lstm_with_initial_bias", + // // "test_lstm_with_peepholes", + // "test_matmul_2d", + // "test_matmul_3d", + // "test_matmul_4d", + // // "test_matmulinteger", + // "test_max_example", + // "test_max_float16", + // "test_max_float32", + // "test_max_float64", + // "test_max_int16", + // "test_max_int32", + // "test_max_int64", + // "test_max_int8", + // "test_max_one_input", + // "test_max_two_inputs", + // "test_max_uint16", + // "test_max_uint32", + // "test_max_uint64", + // "test_max_uint8", + // "test_maxpool_1d_default", + // "test_maxpool_2d_ceil", "test_maxpool_2d_default", + // "test_maxpool_2d_dilations", "test_maxpool_2d_pads", "test_maxpool_2d_precomputed_pads", "test_maxpool_2d_precomputed_same_upper", @@ -1587,13 +1926,622 @@ "test_maxpool_2d_same_lower", "test_maxpool_2d_same_upper", "test_maxpool_2d_strides", - "test_maxpool_3d_default", - "test_globalaveragepool_precomputed", - 
"test_globalaveragepool", - "test_globalmaxpool_precomputed", - "test_globalmaxpool", - "test_instancenorm_epsilon", - "test_instancenorm_example" + // "test_maxpool_2d_uint8", + // "test_maxpool_3d_default", + // "test_maxpool_with_argmax_2d_precomputed_pads", + // "test_maxpool_with_argmax_2d_precomputed_strides", + // // "test_maxunpool_export_with_output_shape", + // // "test_maxunpool_export_without_output_shape", + // // "test_mean_example", + // // "test_mean_one_input", + // // "test_mean_two_inputs", + // // "test_melweightmatrix", + // "test_min_example", + // "test_min_float16", + // "test_min_float32", + // "test_min_float64", + // "test_min_int16", + // "test_min_int32", + // "test_min_int64", + // "test_min_int8", + // "test_min_one_input", + // "test_min_two_inputs", + // "test_min_uint16", + // "test_min_uint32", + // "test_min_uint64", + // "test_min_uint8", + // "test_mod_bcast", + // "test_mod_broadcast", + // "test_mod_float_mixed_sign_example", + // "test_mod_fmod_mixed_sign_example", + // "test_mod_int64_fmod", + // "test_mod_int64_mixed_sign_example", + // "test_mod_mixed_sign_float16", + // "test_mod_mixed_sign_float32", + // "test_mod_mixed_sign_float64", + // "test_mod_mixed_sign_int16", + // "test_mod_mixed_sign_int32", + // "test_mod_mixed_sign_int64", + // "test_mod_mixed_sign_int8", + // "test_mod_uint16", + // "test_mod_uint32", + // "test_mod_uint64", + // "test_mod_uint8", + // // "test_momentum_multiple", + // // "test_momentum", + "test_mul_bcast", + "test_mul_example", + // "test_mul_uint8", + "test_mul", + // "test_mvn_expanded", + // "test_mvn", + "test_neg_example", + "test_neg", + // // "test_negative_log_likelihood_loss_iinput_shape_is_NCd1_weight_ignore_index_expanded", + // // "test_negative_log_likelihood_loss_iinput_shape_is_NCd1_weight_ignore_index", + // // "test_negative_log_likelihood_loss_input_shape_is_NC_expanded", + // // "test_negative_log_likelihood_loss_input_shape_is_NC", + // // 
"test_negative_log_likelihood_loss_input_shape_is_NCd1_expanded", + // // "test_negative_log_likelihood_loss_input_shape_is_NCd1_ignore_index_expanded", + // // "test_negative_log_likelihood_loss_input_shape_is_NCd1_ignore_index", + // // "test_negative_log_likelihood_loss_input_shape_is_NCd1_mean_weight_negative_ignore_index_expanded", + // // "test_negative_log_likelihood_loss_input_shape_is_NCd1_mean_weight_negative_ignore_index", + // // "test_negative_log_likelihood_loss_input_shape_is_NCd1_weight_expanded", + // // "test_negative_log_likelihood_loss_input_shape_is_NCd1_weight", + // // "test_negative_log_likelihood_loss_input_shape_is_NCd1", + // // "test_negative_log_likelihood_loss_input_shape_is_NCd1d2_expanded", + // // "test_negative_log_likelihood_loss_input_shape_is_NCd1d2_no_weight_reduction_mean_ignore_index_expanded", + // // "test_negative_log_likelihood_loss_input_shape_is_NCd1d2_no_weight_reduction_mean_ignore_index", + // // "test_negative_log_likelihood_loss_input_shape_is_NCd1d2_reduction_mean_expanded", + // // "test_negative_log_likelihood_loss_input_shape_is_NCd1d2_reduction_mean", + // // "test_negative_log_likelihood_loss_input_shape_is_NCd1d2_reduction_sum_expanded", + // // "test_negative_log_likelihood_loss_input_shape_is_NCd1d2_reduction_sum", + // // "test_negative_log_likelihood_loss_input_shape_is_NCd1d2_with_weight_expanded", + // // "test_negative_log_likelihood_loss_input_shape_is_NCd1d2_with_weight_reduction_mean_expanded", + // // "test_negative_log_likelihood_loss_input_shape_is_NCd1d2_with_weight_reduction_mean", + // // "test_negative_log_likelihood_loss_input_shape_is_NCd1d2_with_weight_reduction_sum_expanded", + // // "test_negative_log_likelihood_loss_input_shape_is_NCd1d2_with_weight_reduction_sum_ignore_index_expanded", + // // "test_negative_log_likelihood_loss_input_shape_is_NCd1d2_with_weight_reduction_sum_ignore_index", + // // "test_negative_log_likelihood_loss_input_shape_is_NCd1d2_with_weight_reduction_sum", + 
// // "test_negative_log_likelihood_loss_input_shape_is_NCd1d2_with_weight", + // // "test_negative_log_likelihood_loss_input_shape_is_NCd1d2", + // // "test_negative_log_likelihood_loss_input_shape_is_NCd1d2d3_none_no_weight_negative_ignore_index_expanded", + // // "test_negative_log_likelihood_loss_input_shape_is_NCd1d2d3_none_no_weight_negative_ignore_index", + // // "test_negative_log_likelihood_loss_input_shape_is_NCd1d2d3_sum_weight_high_ignore_index_expanded", + // // "test_negative_log_likelihood_loss_input_shape_is_NCd1d2d3_sum_weight_high_ignore_index", + // // "test_negative_log_likelihood_loss_input_shape_is_NCd1d2d3d4d5_mean_weight_expanded", + // // "test_negative_log_likelihood_loss_input_shape_is_NCd1d2d3d4d5_mean_weight", + // // "test_negative_log_likelihood_loss_input_shape_is_NCd1d2d3d4d5_none_no_weight_expanded", + // // "test_negative_log_likelihood_loss_input_shape_is_NCd1d2d3d4d5_none_no_weight", + // // "test_nesterov_momentum", + // // "test_nllloss_NC_expanded", + // // "test_nllloss_NC", + // // "test_nllloss_NCd1_expanded", + // // "test_nllloss_NCd1_ii_expanded", + // // "test_nllloss_NCd1_ii", + // // "test_nllloss_NCd1_mean_weight_negative_ii_expanded", + // // "test_nllloss_NCd1_mean_weight_negative_ii", + // // "test_nllloss_NCd1_weight_expanded", + // // "test_nllloss_NCd1_weight_ii_expanded", + // // "test_nllloss_NCd1_weight_ii", + // // "test_nllloss_NCd1_weight", + // // "test_nllloss_NCd1", + // // "test_nllloss_NCd1d2_expanded", + // // "test_nllloss_NCd1d2_no_weight_reduction_mean_ii_expanded", + // // "test_nllloss_NCd1d2_no_weight_reduction_mean_ii", + // // "test_nllloss_NCd1d2_reduction_mean_expanded", + // // "test_nllloss_NCd1d2_reduction_mean", + // // "test_nllloss_NCd1d2_reduction_sum_expanded", + // // "test_nllloss_NCd1d2_reduction_sum", + // // "test_nllloss_NCd1d2_with_weight_expanded", + // // "test_nllloss_NCd1d2_with_weight_reduction_mean_expanded", + // // "test_nllloss_NCd1d2_with_weight_reduction_mean", + 
// // "test_nllloss_NCd1d2_with_weight_reduction_sum_expanded", + // // "test_nllloss_NCd1d2_with_weight_reduction_sum_ii_expanded", + // // "test_nllloss_NCd1d2_with_weight_reduction_sum_ii", + // // "test_nllloss_NCd1d2_with_weight_reduction_sum", + // // "test_nllloss_NCd1d2_with_weight", + // // "test_nllloss_NCd1d2", + // // "test_nllloss_NCd1d2d3_none_no_weight_negative_ii_expanded", + // // "test_nllloss_NCd1d2d3_none_no_weight_negative_ii", + // // "test_nllloss_NCd1d2d3_sum_weight_high_ii_expanded", + // // "test_nllloss_NCd1d2d3_sum_weight_high_ii", + // // "test_nllloss_NCd1d2d3d4d5_mean_weight_expanded", + // // "test_nllloss_NCd1d2d3d4d5_mean_weight", + // // "test_nllloss_NCd1d2d3d4d5_none_no_weight_expanded", + // // "test_nllloss_NCd1d2d3d4d5_none_no_weight", + // "test_nonmaxsuppression_center_point_box_format", + // "test_nonmaxsuppression_flipped_coordinates", + // "test_nonmaxsuppression_identical_boxes", + // "test_nonmaxsuppression_limit_output_size", + // "test_nonmaxsuppression_single_box", + // "test_nonmaxsuppression_suppress_by_IOU_and_scores", + // "test_nonmaxsuppression_suppress_by_IOU", + // "test_nonmaxsuppression_two_batches", + // "test_nonmaxsuppression_two_classes", + // "test_nonzero_example", + // "test_not_2d", + // "test_not_3d", + // "test_not_4d", + // // "test_onehot_negative_indices", + // // "test_onehot_with_axis", + // // "test_onehot_with_negative_axis", + // // "test_onehot_without_axis", + // // "test_optional_get_element_sequence", + // // "test_optional_get_element", + // // "test_optional_has_element_empty", + // // "test_optional_has_element", + // "test_or_bcast3v1d", + // "test_or_bcast3v2d", + // "test_or_bcast4v2d", + // "test_or_bcast4v3d", + // "test_or_bcast4v4d", + // "test_or2d", + // "test_or3d", + // "test_or4d", + "test_pow_bcast_array", + "test_pow_bcast_scalar", + "test_pow_example", + // "test_pow_types_float", + // "test_pow_types_float32_int32", + // "test_pow_types_float32_int64", + // 
"test_pow_types_float32_uint32", + // "test_pow_types_float32_uint64", + // "test_pow_types_int", + // "test_pow_types_int32_float32", + // "test_pow_types_int32_int32", + // "test_pow_types_int64_float32", + // "test_pow_types_int64_int64", + "test_pow", + // "test_prelu_broadcast", + // "test_prelu_example", + // // "test_qlinearconv", + // // "test_qlinearmatmul_2D", + // // "test_qlinearmatmul_3D", + // // "test_quantizelinear_axis", + // // "test_quantizelinear", + // "test_range_float_type_positive_delta_expanded", + // "test_range_float_type_positive_delta", + // "test_range_int32_type_negative_delta_expanded", + // "test_range_int32_type_negative_delta", + // "test_reciprocal_example", + // "test_reciprocal", + // "test_reduce_l1_default_axes_keepdims_example", + // "test_reduce_l1_default_axes_keepdims_random", + // "test_reduce_l1_do_not_keepdims_example", + // "test_reduce_l1_do_not_keepdims_random", + // "test_reduce_l1_keep_dims_example", + // "test_reduce_l1_keep_dims_random", + // "test_reduce_l1_negative_axes_keep_dims_example", + // "test_reduce_l1_negative_axes_keep_dims_random", + // "test_reduce_l2_default_axes_keepdims_example", + // "test_reduce_l2_default_axes_keepdims_random", + // "test_reduce_l2_do_not_keepdims_example", + // "test_reduce_l2_do_not_keepdims_random", + // "test_reduce_l2_keep_dims_example", + // "test_reduce_l2_keep_dims_random", + // "test_reduce_l2_negative_axes_keep_dims_example", + // "test_reduce_l2_negative_axes_keep_dims_random", + // "test_reduce_log_sum_asc_axes", + // "test_reduce_log_sum_default", + // "test_reduce_log_sum_desc_axes", + // tests "test_reduce_log_sum_exp_*" on opset17/opset18 are excluded because they use float64. 
+ // "opset{7,8,9}/test_reduce_log_sum_exp_default_axes_keepdims_example", + // "opset{7,8,9}/test_reduce_log_sum_exp_default_axes_keepdims_random", + // "opset{7,8,9}/test_reduce_log_sum_exp_do_not_keepdims_example", + // "opset{7,8,9}/test_reduce_log_sum_exp_do_not_keepdims_random", + // "opset{7,8,9}/test_reduce_log_sum_exp_keepdims_example", + // "opset{7,8,9}/test_reduce_log_sum_exp_keepdims_random", + // "opset11/test_reduce_log_sum_exp_negative_axes_keepdims_example", + // "opset11/test_reduce_log_sum_exp_negative_axes_keepdims_random", + // "test_reduce_log_sum_negative_axes", + // "test_reduce_log_sum", + // "test_reduce_max_default_axes_keepdim_example", + // "test_reduce_max_default_axes_keepdims_random", + // "test_reduce_max_do_not_keepdims_example", + // "test_reduce_max_do_not_keepdims_random", + // "test_reduce_max_keepdims_example", + // "test_reduce_max_keepdims_random", + // "test_reduce_max_negative_axes_keepdims_example", + // "test_reduce_max_negative_axes_keepdims_random", + // "test_reduce_mean_default_axes_keepdims_example", + // "test_reduce_mean_default_axes_keepdims_random", + // "test_reduce_mean_do_not_keepdims_example", + // "test_reduce_mean_do_not_keepdims_random", + // "test_reduce_mean_keepdims_example", + // "test_reduce_mean_keepdims_random", + // "test_reduce_mean_negative_axes_keepdims_example", + // "test_reduce_mean_negative_axes_keepdims_random", + // "test_reduce_min_default_axes_keepdims_example", + // "test_reduce_min_default_axes_keepdims_random", + // "test_reduce_min_do_not_keepdims_example", + // "test_reduce_min_do_not_keepdims_random", + // "test_reduce_min_keepdims_example", + // "test_reduce_min_keepdims_random", + // "test_reduce_min_negative_axes_keepdims_example", + // "test_reduce_min_negative_axes_keepdims_random", + // "test_reduce_prod_default_axes_keepdims_example", + // "test_reduce_prod_default_axes_keepdims_random", + // "test_reduce_prod_do_not_keepdims_example", + // 
"test_reduce_prod_do_not_keepdims_random", + // "test_reduce_prod_keepdims_example", + // "test_reduce_prod_keepdims_random", + // "test_reduce_prod_negative_axes_keepdims_example", + // "test_reduce_prod_negative_axes_keepdims_random", + // "test_reduce_sum_default_axes_keepdims_example", + // "test_reduce_sum_default_axes_keepdims_random", + // "test_reduce_sum_do_not_keepdims_example", + // "test_reduce_sum_do_not_keepdims_random", + // "test_reduce_sum_empty_axes_input_noop_example", + // "test_reduce_sum_empty_axes_input_noop_random", + // "test_reduce_sum_keepdims_example", + // "test_reduce_sum_keepdims_random", + // "test_reduce_sum_negative_axes_keepdims_example", + // "test_reduce_sum_negative_axes_keepdims_random", + // "test_reduce_sum_square_default_axes_keepdims_example", + // "test_reduce_sum_square_default_axes_keepdims_random", + // "test_reduce_sum_square_do_not_keepdims_example", + // "test_reduce_sum_square_do_not_keepdims_random", + // "test_reduce_sum_square_keepdims_example", + // "test_reduce_sum_square_keepdims_random", + // "test_reduce_sum_square_negative_axes_keepdims_example", + // "test_reduce_sum_square_negative_axes_keepdims_random", + // "test_reflect_pad", + "test_relu", + // "test_reshape_allowzero_reordered", + // "test_reshape_extended_dims", + // "test_reshape_negative_dim", + // "test_reshape_negative_extended_dims", + // "test_reshape_one_dim", + // "test_reshape_reduced_dims", + // "test_reshape_reordered_all_dims", + // "test_reshape_reordered_dims", + // "test_reshape_reordered_last_dims", + // "test_reshape_zero_and_negative_dim", + // "test_reshape_zero_dim", + // "test_resize_downsample_linear", + // "test_resize_downsample_nearest", + // "test_resize_downsample_scales_cubic_A_n0p5_exclude_outside", + // "test_resize_downsample_scales_cubic_align_corners", + // "test_resize_downsample_scales_cubic", + // "test_resize_downsample_scales_linear_align_corners", + // "test_resize_downsample_scales_linear", + // 
"test_resize_downsample_scales_nearest", + // "test_resize_downsample_sizes_cubic", + // "test_resize_downsample_sizes_linear_pytorch_half_pixel", + // "test_resize_downsample_sizes_nearest_tf_half_pixel_for_nn", + // "test_resize_downsample_sizes_nearest", + // "test_resize_nearest", + // "test_resize_tf_crop_and_resize", + // "test_resize_upsample_linear", + // "test_resize_upsample_nearest", + // "test_resize_upsample_scales_cubic_A_n0p5_exclude_outside", + // "test_resize_upsample_scales_cubic_align_corners", + // "test_resize_upsample_scales_cubic_asymmetric", + // "test_resize_upsample_scales_cubic", + // "test_resize_upsample_scales_linear_align_corners", + // "test_resize_upsample_scales_linear", + // "test_resize_upsample_scales_nearest", + // "test_resize_upsample_sizes_cubic", + // "opset{12,13,17,18}/test_resize_upsample_sizes_nearest_ceil_half_pixel", + // "opset{12,13,17,18}/test_resize_upsample_sizes_nearest_floor_align_corners", + // "opset{12,13,17,18}/test_resize_upsample_sizes_nearest_round_prefer_ceil_asymmetric", + // "test_resize_upsample_sizes_nearest", + // // "test_reversesequence_batch", + // // "test_reversesequence_time", + // // "test_rnn_seq_length", + // // "test_roialign_aligned_false", + // // "test_roialign_aligned_true", + // // "test_roialign", + // // "test_round", + // // "test_scan_sum", + // // "test_scan9_sum", + // // "test_scatter_elements_with_axis", + // // "test_scatter_elements_with_duplicate_indices", + // // "test_scatter_elements_with_negative_indices", + // // "test_scatter_elements_without_axis", + // // "test_scatter_with_axis", + // // "test_scatter_without_axis", + // // "test_scatternd_add", + // // "test_scatternd_multiply", + // // "test_scatternd", + // // "test_sce_mean_3d_expanded", + // // "test_sce_mean_3d_log_prob_expanded", + // // "test_sce_mean_3d_log_prob", + // // "test_sce_mean_3d", + // // "test_sce_mean_expanded", + // // "test_sce_mean_log_prob_expanded", + // // "test_sce_mean_log_prob", + // 
// "test_sce_mean_no_weight_ii_3d_expanded", + // // "test_sce_mean_no_weight_ii_3d_log_prob_expanded", + // // "test_sce_mean_no_weight_ii_3d_log_prob", + // // "test_sce_mean_no_weight_ii_3d", + // // "test_sce_mean_no_weight_ii_4d_expanded", + // // "test_sce_mean_no_weight_ii_4d_log_prob_expanded", + // // "test_sce_mean_no_weight_ii_4d_log_prob", + // // "test_sce_mean_no_weight_ii_4d", + // // "test_sce_mean_no_weight_ii_expanded", + // // "test_sce_mean_no_weight_ii_log_prob_expanded", + // // "test_sce_mean_no_weight_ii_log_prob", + // // "test_sce_mean_no_weight_ii", + // // "test_sce_mean_weight_expanded", + // // "test_sce_mean_weight_ii_3d_expanded", + // // "test_sce_mean_weight_ii_3d_log_prob_expanded", + // // "test_sce_mean_weight_ii_3d_log_prob", + // // "test_sce_mean_weight_ii_3d", + // // "test_sce_mean_weight_ii_4d_expanded", + // // "test_sce_mean_weight_ii_4d_log_prob_expanded", + // // "test_sce_mean_weight_ii_4d_log_prob", + // // "test_sce_mean_weight_ii_4d", + // // "test_sce_mean_weight_ii_expanded", + // // "test_sce_mean_weight_ii_log_prob_expanded", + // // "test_sce_mean_weight_ii_log_prob", + // // "test_sce_mean_weight_ii", + // // "test_sce_mean_weight_log_prob_expanded", + // // "test_sce_mean_weight_log_prob", + // // "test_sce_mean_weight", + // // "test_sce_mean", + // // "test_sce_NCd1_mean_weight_negative_ii_expanded", + // // "test_sce_NCd1_mean_weight_negative_ii_log_prob_expanded", + // // "test_sce_NCd1_mean_weight_negative_ii_log_prob", + // // "test_sce_NCd1_mean_weight_negative_ii", + // // "test_sce_NCd1d2d3_none_no_weight_negative_ii_expanded", + // // "test_sce_NCd1d2d3_none_no_weight_negative_ii_log_prob_expanded", + // // "test_sce_NCd1d2d3_none_no_weight_negative_ii_log_prob", + // // "test_sce_NCd1d2d3_none_no_weight_negative_ii", + // // "test_sce_NCd1d2d3_sum_weight_high_ii_expanded", + // // "test_sce_NCd1d2d3_sum_weight_high_ii_log_prob_expanded", + // // "test_sce_NCd1d2d3_sum_weight_high_ii_log_prob", + 
// // "test_sce_NCd1d2d3_sum_weight_high_ii", + // // "test_sce_NCd1d2d3d4d5_mean_weight_expanded", + // // "test_sce_NCd1d2d3d4d5_mean_weight_log_prob_expanded", + // // "test_sce_NCd1d2d3d4d5_mean_weight_log_prob", + // // "test_sce_NCd1d2d3d4d5_mean_weight", + // // "test_sce_NCd1d2d3d4d5_none_no_weight_expanded", + // // "test_sce_NCd1d2d3d4d5_none_no_weight_log_prob_expanded", + // // "test_sce_NCd1d2d3d4d5_none_no_weight_log_prob", + // // "test_sce_NCd1d2d3d4d5_none_no_weight", + // // "test_sce_none_expanded", + // // "test_sce_none_log_prob_expanded", + // // "test_sce_none_log_prob", + // // "test_sce_none_weights_expanded", + // // "test_sce_none_weights_log_prob_expanded", + // // "test_sce_none_weights_log_prob", + // // "test_sce_none_weights", + // // "test_sce_none", + // // "test_sce_sum_expanded", + // // "test_sce_sum_log_prob_expanded", + // // "test_sce_sum_log_prob", + // // "test_sce_sum", + // "test_selu_default", + // "test_selu_example", + // "test_selu", + // // "test_sequence_insert_at_back", + // // "test_sequence_insert_at_front", + // // "test_sequence_map_add_1_sequence_1_tensor_expanded", + // // "test_sequence_map_add_1_sequence_1_tensor", + // // "test_sequence_map_add_2_sequences_expanded", + // // "test_sequence_map_add_2_sequences", + // // "test_sequence_map_extract_shapes_expanded", + // // "test_sequence_map_extract_shapes", + // // "test_sequence_map_identity_1_sequence_1_tensor_expanded", + // // "test_sequence_map_identity_1_sequence_1_tensor", + // // "test_sequence_map_identity_1_sequence_expanded", + // // "test_sequence_map_identity_1_sequence", + // // "test_sequence_map_identity_2_sequences_expanded", + // // "test_sequence_map_identity_2_sequences", + // "test_shrink_hard", + // "test_shrink_soft", + "test_sigmoid_example", + "test_sigmoid", + // "test_sign", + // "test_simple_rnn_batchwise", + // "test_simple_rnn_defaults", + // "test_simple_rnn_with_initial_bias", + "test_sin_example", + "test_sin", + // 
"test_sinh_example", + // "test_sinh", + // // "test_size_example", + // // "test_size", + // "test_slice_default_axes", + // "test_slice_default_steps", + // "test_slice_end_out_of_bounds", + // "test_slice_neg_steps", + // "test_slice_neg", + // "test_slice_negative_axes", + // "test_slice_start_out_of_bounds", + // "test_slice", + // "test_softmax_axis_0_expanded", + "test_softmax_axis_0", + // "test_softmax_axis_1_expanded", + "test_softmax_axis_1", + // "test_softmax_axis_2_expanded", + "test_softmax_axis_2", + // "test_softmax_cross_entropy_input_shape_is_NCd1_mean_weight_negative_ignore_index_expanded", + // "test_softmax_cross_entropy_input_shape_is_NCd1_mean_weight_negative_ignore_index_log_prob_expanded", + // "test_softmax_cross_entropy_input_shape_is_NCd1_mean_weight_negative_ignore_index_log_prob", + // "test_softmax_cross_entropy_input_shape_is_NCd1_mean_weight_negative_ignore_index", + // "test_softmax_cross_entropy_input_shape_is_NCd1d2d3_none_no_weight_negative_ignore_index_expanded", + // "test_softmax_cross_entropy_input_shape_is_NCd1d2d3_none_no_weight_negative_ignore_index_log_prob_expanded", + // "test_softmax_cross_entropy_input_shape_is_NCd1d2d3_none_no_weight_negative_ignore_index_log_prob", + // "test_softmax_cross_entropy_input_shape_is_NCd1d2d3_none_no_weight_negative_ignore_index", + // "test_softmax_cross_entropy_input_shape_is_NCd1d2d3_sum_weight_high_ignore_index_expanded", + // "test_softmax_cross_entropy_input_shape_is_NCd1d2d3_sum_weight_high_ignore_index_log_prob_expanded", + // "test_softmax_cross_entropy_input_shape_is_NCd1d2d3_sum_weight_high_ignore_index_log_prob", + // "test_softmax_cross_entropy_input_shape_is_NCd1d2d3_sum_weight_high_ignore_index", + // "test_softmax_cross_entropy_input_shape_is_NCd1d2d3d4d5_mean_weight_expanded", + // "test_softmax_cross_entropy_input_shape_is_NCd1d2d3d4d5_mean_weight_log_prob_expanded", + // "test_softmax_cross_entropy_input_shape_is_NCd1d2d3d4d5_mean_weight_log_prob", + // 
"test_softmax_cross_entropy_input_shape_is_NCd1d2d3d4d5_mean_weight", + // "test_softmax_cross_entropy_input_shape_is_NCd1d2d3d4d5_none_no_weight_expanded", + // "test_softmax_cross_entropy_input_shape_is_NCd1d2d3d4d5_none_no_weight_log_prob_expanded", + // "test_softmax_cross_entropy_input_shape_is_NCd1d2d3d4d5_none_no_weight_log_prob", + // "test_softmax_cross_entropy_input_shape_is_NCd1d2d3d4d5_none_no_weight", + // "test_softmax_cross_entropy_mean_3d_expanded", + // "test_softmax_cross_entropy_mean_3d_log_prob_expanded", + // "test_softmax_cross_entropy_mean_3d_log_prob", + // "test_softmax_cross_entropy_mean_3d", + // "test_softmax_cross_entropy_mean_expanded", + // "test_softmax_cross_entropy_mean_log_prob_expanded", + // "test_softmax_cross_entropy_mean_log_prob", + // "test_softmax_cross_entropy_mean_no_weight_ignore_index_3d_expanded", + // "test_softmax_cross_entropy_mean_no_weight_ignore_index_3d_log_prob_expanded", + // "test_softmax_cross_entropy_mean_no_weight_ignore_index_3d_log_prob", + // "test_softmax_cross_entropy_mean_no_weight_ignore_index_3d", + // "test_softmax_cross_entropy_mean_no_weight_ignore_index_4d_expanded", + // "test_softmax_cross_entropy_mean_no_weight_ignore_index_4d_log_prob_expanded", + // "test_softmax_cross_entropy_mean_no_weight_ignore_index_4d_log_prob", + // "test_softmax_cross_entropy_mean_no_weight_ignore_index_4d", + // "test_softmax_cross_entropy_mean_no_weight_ignore_index_expanded", + // "test_softmax_cross_entropy_mean_no_weight_ignore_index_log_prob_expanded", + // "test_softmax_cross_entropy_mean_no_weight_ignore_index_log_prob", + // "test_softmax_cross_entropy_mean_no_weight_ignore_index", + // "test_softmax_cross_entropy_mean_weight_expanded", + // "test_softmax_cross_entropy_mean_weight_ignore_index_3d_expanded", + // "test_softmax_cross_entropy_mean_weight_ignore_index_3d_log_prob_expanded", + // "test_softmax_cross_entropy_mean_weight_ignore_index_3d_log_prob", + // 
"test_softmax_cross_entropy_mean_weight_ignore_index_3d", + // "test_softmax_cross_entropy_mean_weight_ignore_index_4d_expanded", + // "test_softmax_cross_entropy_mean_weight_ignore_index_4d_log_prob_expanded", + // "test_softmax_cross_entropy_mean_weight_ignore_index_4d_log_prob", + // "test_softmax_cross_entropy_mean_weight_ignore_index_4d", + // "test_softmax_cross_entropy_mean_weight_ignore_index_expanded", + // "test_softmax_cross_entropy_mean_weight_ignore_index_log_prob_expanded", + // "test_softmax_cross_entropy_mean_weight_ignore_index_log_prob", + // "test_softmax_cross_entropy_mean_weight_ignore_index", + // "test_softmax_cross_entropy_mean_weight_log_prob_expanded", + // "test_softmax_cross_entropy_mean_weight_log_prob", + // "test_softmax_cross_entropy_mean_weight", + // "test_softmax_cross_entropy_mean", + // "test_softmax_cross_entropy_none_expanded", + // "test_softmax_cross_entropy_none_log_prob_expanded", + // "test_softmax_cross_entropy_none_log_prob", + // "test_softmax_cross_entropy_none_weights_expanded", + // "test_softmax_cross_entropy_none_weights_log_prob_expanded", + // "test_softmax_cross_entropy_none_weights_log_prob", + // "test_softmax_cross_entropy_none_weights", + // "test_softmax_cross_entropy_none", + // "test_softmax_cross_entropy_sum_expanded", + // "test_softmax_cross_entropy_sum_log_prob_expanded", + // "test_softmax_cross_entropy_sum_log_prob", + // "test_softmax_cross_entropy_sum", + // "opset13/test_softmax_default_axis_expanded", + "opset13/test_softmax_default_axis", + // "test_softmax_example_expanded", + "test_softmax_example", + // "test_softmax_large_number_expanded", + "test_softmax_large_number", + // "test_softmax_negative_axis_expanded", + "test_softmax_negative_axis", + // // "test_softplus_example", + // // "test_softplus", + // // "test_softsign_example", + // // "test_softsign", + // "test_spacetodepth_example", + // "test_spacetodepth", + // "test_split_equal_parts_1d", + // "test_split_equal_parts_2d", + // 
"test_split_equal_parts_default_axis", + // "test_split_variable_parts_1d", + // "test_split_variable_parts_2d", + // "test_split_variable_parts_default_axis", + // "test_split_zero_size_splits", + "test_sqrt_example", + "test_sqrt", + // "test_squeeze_negative_axes", + // "test_squeeze", + // // "test_stft_with_window", + // // "test_stft", + // // "test_strnormalizer_export_monday_casesensintive_lower", + // // "test_strnormalizer_export_monday_casesensintive_nochangecase", + // // "test_strnormalizer_export_monday_casesensintive_upper", + // // "test_strnormalizer_export_monday_empty_output", + // // "test_strnormalizer_export_monday_insensintive_upper_twodim", + // // "test_strnormalizer_nostopwords_nochangecase", + "test_sub_bcast", + "test_sub_example", + // "test_sub_uint8", + "test_sub", + // "test_sum_example", + // "test_sum_one_input", + // "test_sum_two_inputs", + "test_tan_example", + "test_tan", + "test_tanh_example", + "test_tanh", + // // "test_tfidfvectorizer_tf_batch_onlybigrams_skip0", + // // "test_tfidfvectorizer_tf_batch_onlybigrams_skip5", + // // "test_tfidfvectorizer_tf_batch_uniandbigrams_skip5", + // // "test_tfidfvectorizer_tf_only_bigrams_skip0", + // // "test_tfidfvectorizer_tf_onlybigrams_levelempty", + // // "test_tfidfvectorizer_tf_onlybigrams_skip5", + // // "test_tfidfvectorizer_tf_uniandbigrams_skip5", + // "test_thresholdedrelu_default", + // "test_thresholdedrelu_example", + // "test_thresholdedrelu", + // "test_tile_precomputed", + // "test_tile", + // // "test_top_k_negative_axis", + // // "test_top_k_smallest", + // // "test_top_k", + // // "test_training_dropout_default_mask", + // // "test_training_dropout_default", + // // "test_training_dropout_mask", + // // "test_training_dropout_zero_ratio_mask", + // // "test_training_dropout_zero_ratio", + // // "test_training_dropout", + "test_transpose_all_permutations_0", + "test_transpose_all_permutations_1", + "test_transpose_all_permutations_2", + 
"test_transpose_all_permutations_3", + "test_transpose_all_permutations_4", + "test_transpose_all_permutations_5", + "test_transpose_default" + // "test_tril_neg", + // "test_tril_one_row_neg", + // "test_tril_out_neg", + // "test_tril_out_pos", + // "test_tril_pos", + // "test_tril_square_neg", + // "test_tril_square", + // "test_tril_zero", + // "test_tril", + // "test_triu_neg", + // "test_triu_one_row", + // "test_triu_out_neg_out", + // "test_triu_out_pos", + // "test_triu_pos", + // "test_triu_square_neg", + // "test_triu_square", + // "test_triu_zero", + // "test_triu", + // // "test_unique_not_sorted_without_axis", + // // "test_unique_sorted_with_axis_3d", + // // "test_unique_sorted_with_axis", + // // "test_unique_sorted_with_negative_axis", + // // "test_unique_sorted_without_axis", + // "test_unsqueeze_axis_0", + // "test_unsqueeze_axis_1", + // "test_unsqueeze_axis_2", + // "test_unsqueeze_axis_3", + // "test_unsqueeze_negative_axes", + // "test_unsqueeze_three_axes", + // "test_unsqueeze_two_axes", + // "test_unsqueeze_unsorted_axes", + // "test_unsqueeze", + // "test_wrap_pad" + // "test_upsample_nearest", + // "test_where_example", + // "test_where_long_example", + // "test_xor_bcast3v1d", + // "test_xor_bcast3v2d", + // "test_xor_bcast4v2d", + // "test_xor_bcast4v3d", + // "test_xor_bcast4v4d", + // "test_xor2d", + // "test_xor3d", + // "test_xor4d" ], "ops": [] } diff --git a/js/web/test/test-main.ts b/js/web/test/test-main.ts index 9bd0ec1425f95..2d83ce1e095ce 100644 --- a/js/web/test/test-main.ts +++ b/js/web/test/test-main.ts @@ -110,8 +110,7 @@ for (const group of ORT_WEB_TEST_CONFIG.model) { let context: ModelTestContext; before('prepare session', async () => { - context = await ModelTestContext.create( - test, ORT_WEB_TEST_CONFIG.profile, ORT_WEB_TEST_CONFIG.options.sessionOptions); + context = await ModelTestContext.create(test, ORT_WEB_TEST_CONFIG.profile, ORT_WEB_TEST_CONFIG.options); }); after('release session', async () => { diff --git 
a/js/web/test/test-runner.ts b/js/web/test/test-runner.ts index 5e9b0910a2c68..6d5951be7b1e6 100644 --- a/js/web/test/test-runner.ts +++ b/js/web/test/test-runner.ts @@ -137,8 +137,9 @@ async function loadTensors( } async function initializeSession( - modelFilePath: string, backendHint: string, ioBindingMode: Test.IOBindingMode, profile: boolean, - sessionOptions: ort.InferenceSession.SessionOptions, fileCache?: FileCacheBuffer): Promise { + modelFilePath: string, backendHint: ort.InferenceSession.ExecutionProviderConfig, ioBindingMode: Test.IOBindingMode, + profile: boolean, sessionOptions: ort.InferenceSession.SessionOptions, + fileCache?: FileCacheBuffer): Promise { const preloadModelData: Uint8Array|undefined = fileCache && fileCache[modelFilePath] ? fileCache[modelFilePath] : undefined; Logger.verbose( @@ -232,9 +233,8 @@ export class ModelTestContext { /** * create a ModelTestContext object that used in every test cases in the given ModelTest. */ - static async create( - modelTest: Test.ModelTest, profile: boolean, - sessionOptions?: ort.InferenceSession.SessionOptions): Promise { + static async create(modelTest: Test.ModelTest, profile: boolean, testOptions?: Test.Options): + Promise { if (this.initializing) { throw new Error('cannot create a ModelTestContext object when the previous creation is not done'); } @@ -243,8 +243,12 @@ export class ModelTestContext { this.initializing = true; const initStart = now(); + const executionProviderConfig = + modelTest.backend === 'webnn' ? 
(testOptions?.webnnOptions || 'webnn') : modelTest.backend!; const session = await initializeSession( - modelTest.modelUrl, modelTest.backend!, modelTest.ioBinding, profile, sessionOptions || {}, this.cache); + modelTest.modelUrl, executionProviderConfig, modelTest.ioBinding, profile, testOptions?.sessionOptions || {}, + this.cache); + const initEnd = now(); for (const testCase of modelTest.cases) { diff --git a/js/web/test/test-types.ts b/js/web/test/test-types.ts index 5bdc8d84cc7a5..cd008e82e570b 100644 --- a/js/web/test/test-types.ts +++ b/js/web/test/test-types.ts @@ -143,6 +143,7 @@ export declare namespace Test { cudaFlags?: Record; wasmOptions?: InferenceSession.WebAssemblyExecutionProviderOption; webglOptions?: InferenceSession.WebGLExecutionProviderOption; + webnnOptions?: InferenceSession.WebNNExecutionProviderOption; globalEnvFlags?: EnvOptions; } diff --git a/onnxruntime/contrib_ops/cpu/bert/attention_common.h b/onnxruntime/contrib_ops/cpu/bert/attention_common.h index c9ed23895b60c..da489a6901512 100644 --- a/onnxruntime/contrib_ops/cpu/bert/attention_common.h +++ b/onnxruntime/contrib_ops/cpu/bert/attention_common.h @@ -133,6 +133,10 @@ constexpr const char* kMinSeqLenForFlashAttentionPackedQKV = "ORT_MIN_SEQ_LEN_FL // Default value for the above setting. 
constexpr int kDefaultMinSeqLenForFlashAttentionPackedQKV = 513; +// Environment variable to enable loading more KV data in flight in +// DecoderMaskedMultiHeadAttention/DecoderMaskedSelfAttention kernels +constexpr const char* kDecoderMaskedAttentionLoadKVDataInFlight = "ORT_DECODER_MASKED_ATTENTION_LOAD_KV_DATA_IN_FLIGHT"; + } // namespace attention } // namespace contrib diff --git a/onnxruntime/contrib_ops/cuda/bert/decoder_masked_multihead_attention.cc b/onnxruntime/contrib_ops/cuda/bert/decoder_masked_multihead_attention.cc index 54aad9cbaf387..a9b60da0c96ca 100644 --- a/onnxruntime/contrib_ops/cuda/bert/decoder_masked_multihead_attention.cc +++ b/onnxruntime/contrib_ops/cuda/bert/decoder_masked_multihead_attention.cc @@ -70,6 +70,10 @@ Status DecoderMaskedMultiHeadAttention::ComputeInternal(OpKernelContext* auto& device_prop = GetDeviceProp(); DecoderMaskedMultiHeadAttentionParams parameters; + + parameters.kv_data_in_flight = ParseEnvironmentVariableWithDefault( + attention::kDecoderMaskedAttentionLoadKVDataInFlight, false); + bool is_dmmha_packing = (key == nullptr && value == nullptr); ORT_RETURN_IF_ERROR(multihead_attention_helper::CheckInputs(query, key, diff --git a/onnxruntime/contrib_ops/cuda/bert/decoder_masked_self_attention.cc b/onnxruntime/contrib_ops/cuda/bert/decoder_masked_self_attention.cc index 69ed07101e647..72ede2e22b557 100644 --- a/onnxruntime/contrib_ops/cuda/bert/decoder_masked_self_attention.cc +++ b/onnxruntime/contrib_ops/cuda/bert/decoder_masked_self_attention.cc @@ -52,6 +52,10 @@ Status DecoderMaskedSelfAttention::ComputeInternal(OpKernelContext* cont auto& device_prop = GetDeviceProp(); DecoderMaskedMultiHeadAttentionParams parameters; + + parameters.kv_data_in_flight = ParseEnvironmentVariableWithDefault( + attention::kDecoderMaskedAttentionLoadKVDataInFlight, false); + ORT_RETURN_IF_ERROR(CheckInputs(input->Shape(), weights->Shape(), bias->Shape(), diff --git 
a/onnxruntime/contrib_ops/cuda/bert/fastertransformer_decoder_attention/decoder_masked_multihead_attention_impl.cu b/onnxruntime/contrib_ops/cuda/bert/fastertransformer_decoder_attention/decoder_masked_multihead_attention_impl.cu index 33e7a33494778..9efb6f08e8e99 100644 --- a/onnxruntime/contrib_ops/cuda/bert/fastertransformer_decoder_attention/decoder_masked_multihead_attention_impl.cu +++ b/onnxruntime/contrib_ops/cuda/bert/fastertransformer_decoder_attention/decoder_masked_multihead_attention_impl.cu @@ -344,52 +344,148 @@ __global__ void masked_multihead_attention_kernel(DecoderMaskedMultiHeadAttentio bool has_beams = params.cache_indir != nullptr && !params.is_cross_attention; const int* beam_indices = has_beams ? &params.cache_indir[bi_max_seq_length] : nullptr; - for (int ti = ko; ti < ti_end; ti += K_PER_ITER) { - bool is_masked = (params.mask != nullptr) && (params.mask[bi_total_seq_length + ti] == 0); + if (!params.kv_data_in_flight) { + for (int ti = ko; ti < ti_end; ti += K_PER_ITER) { + bool is_masked = (params.mask != nullptr) && (params.mask[bi_total_seq_length + ti] == 0); - // The keys loaded from the key cache. - K_vec_k k_vec[K_VECS_PER_THREAD]; - if (ti < tlength) { - if (has_beams) { - const int beam_offset = beam_indices[ti] * params.num_heads * params.max_sequence_length * head_size; + // The keys loaded from the key cache. 
+ K_vec_k k_vec[K_VECS_PER_THREAD]; + if (ti < tlength) { + if (has_beams) { + const int beam_offset = beam_indices[ti] * params.num_heads * params.max_sequence_length * head_size; #pragma unroll - for (int ii = 0; ii < K_VECS_PER_THREAD; ++ii) { - int jj = ii * params.max_sequence_length + ti; + for (int ii = 0; ii < K_VECS_PER_THREAD; ++ii) { + int jj = ii * params.max_sequence_length + ti; - k_vec[ii] = vec_conversion( - (*reinterpret_cast(&k_cache_batch[beam_offset + jj * QK_ELTS_IN_16B]))); - } - } else { + k_vec[ii] = vec_conversion( + (*reinterpret_cast(&k_cache_batch[beam_offset + jj * QK_ELTS_IN_16B]))); + } + } else { #pragma unroll - for (int ii = 0; ii < K_VECS_PER_THREAD; ++ii) { - int jj = ii * params.max_sequence_length + ti; + for (int ii = 0; ii < K_VECS_PER_THREAD; ++ii) { + int jj = ii * params.max_sequence_length + ti; - k_vec[ii] = vec_conversion( - (*reinterpret_cast(&k_cache_batch[jj * QK_ELTS_IN_16B]))); + k_vec[ii] = vec_conversion( + (*reinterpret_cast(&k_cache_batch[jj * QK_ELTS_IN_16B]))); + } } } - } - // Perform the dot product and normalize qk. - // WARNING: ALL THE THREADS OF A WARP MUST ENTER!!! - float qk = Qk_dot::dot(q_vec, k_vec) * inv_sqrt_dh; + // Perform the dot product and normalize qk. + // WARNING: ALL THE THREADS OF A WARP MUST ENTER!!! + float qk = Qk_dot::dot(q_vec, k_vec) * inv_sqrt_dh; - // This is a deviation from FasterTransformer kernel implementation - // but this aligns with ORT's other Attention kernels which strives to - // mimic PyTorch when dealing with mask filter values - if (is_masked) { - qk += params.mask_filter_value; + // This is a deviation from FasterTransformer kernel implementation + // but this aligns with ORT's other Attention kernels which strives to + // mimic PyTorch when dealing with mask filter values + if (is_masked) { + qk += params.mask_filter_value; + } + + // Store the product to shared memory. There's one qk value per timestep. Update the max. 
+ if (ti < tlength && tidx % THREADS_PER_KEY == 0) { + if (params.relative_attention_bias != nullptr) { + qk = add_vec(qk, + reinterpret_cast(params.relative_attention_bias)[hi * params.sequence_length * params.total_sequence_length + ti]); + } + qk_max = fmaxf(qk_max, qk); + qk_smem[ti] = qk; + } } + } else { + // TODO(hasesh): Tune this value for different workloads. Currently, it is tuned for Whisper model + // Also tune it for different architectures. This works best for Whisper on 80GB A100. + constexpr int K_CACHE_DATA_LOAD_UNROLL = 4; - // Store the product to shared memory. There's one qk value per timestep. Update the max. - if (ti < tlength && tidx % THREADS_PER_KEY == 0) { - if (params.relative_attention_bias != nullptr) { - qk = add_vec(qk, - reinterpret_cast(params.relative_attention_bias)[hi * params.sequence_length * params.total_sequence_length + ti]); + for (int ti = ko; ti < ti_end; ti += (K_CACHE_DATA_LOAD_UNROLL * K_PER_ITER)) { + int is_masked[K_CACHE_DATA_LOAD_UNROLL]; + int beam_offset[K_CACHE_DATA_LOAD_UNROLL]; + int time_step[K_CACHE_DATA_LOAD_UNROLL]; + bool time_bounds_cond[K_CACHE_DATA_LOAD_UNROLL]; + +#pragma unroll + for (int k_unroll = 0; k_unroll < K_CACHE_DATA_LOAD_UNROLL; ++k_unroll) { + is_masked[k_unroll] = 1; + beam_offset[k_unroll] = 0; + time_step[k_unroll] = ti + k_unroll * K_PER_ITER; + time_bounds_cond[k_unroll] = (time_step[k_unroll] < tlength); + } + +#pragma unroll + for (int k_unroll = 0; k_unroll < K_CACHE_DATA_LOAD_UNROLL; ++k_unroll) { + if (time_bounds_cond[k_unroll] && params.mask != nullptr) { + is_masked[k_unroll] = params.mask[bi_total_seq_length + time_step[k_unroll]]; + } + } + + if (has_beams) { + int head_maxlength_headsize_prod = params.num_heads * params.max_sequence_length * head_size; + +#pragma unroll + for (int k_unroll = 0; k_unroll < K_CACHE_DATA_LOAD_UNROLL; ++k_unroll) { + if (time_bounds_cond[k_unroll]) { + beam_offset[k_unroll] = beam_indices[time_step[k_unroll]] * head_maxlength_headsize_prod; + 
} + } + } + + // The keys loaded from the key cache. + K_vec_k k_vec[K_CACHE_DATA_LOAD_UNROLL][K_VECS_PER_THREAD]; + +#pragma unroll + for (int k_unroll = 0; k_unroll < K_CACHE_DATA_LOAD_UNROLL; ++k_unroll) { + if (time_bounds_cond[k_unroll]) { + if (has_beams) { +#pragma unroll + for (int ii = 0; ii < K_VECS_PER_THREAD; ++ii) { + int jj = ii * params.max_sequence_length + time_step[k_unroll]; + + k_vec[k_unroll][ii] = vec_conversion( + (*reinterpret_cast(&k_cache_batch[beam_offset[k_unroll] + jj * QK_ELTS_IN_16B]))); + } + } else { +#pragma unroll + for (int ii = 0; ii < K_VECS_PER_THREAD; ++ii) { + int jj = ii * params.max_sequence_length + time_step[k_unroll]; + + k_vec[k_unroll][ii] = vec_conversion( + (*reinterpret_cast(&k_cache_batch[jj * QK_ELTS_IN_16B]))); + } + } + } + } + + // Perform the dot product and normalize qk. + // WARNING: ALL THE THREADS OF A WARP MUST ENTER!!! + float qk[K_CACHE_DATA_LOAD_UNROLL]; +#pragma unroll + for (int k_unroll = 0; k_unroll < K_CACHE_DATA_LOAD_UNROLL; ++k_unroll) { + qk[k_unroll] = Qk_dot::dot(q_vec, k_vec[k_unroll]) * inv_sqrt_dh; + } + +// This is a deviation from FasterTransformer kernel implementation +// but this aligns with ORT's other Attention kernels which strives to +// mimic PyTorch when dealing with mask filter values +#pragma unroll + for (int k_unroll = 0; k_unroll < K_CACHE_DATA_LOAD_UNROLL; ++k_unroll) { + if (time_bounds_cond[k_unroll] && is_masked[k_unroll] == 0) { + qk[k_unroll] += params.mask_filter_value; + } + } + +// Store the product to shared memory. There's one qk value per timestep. Update the max. 
+#pragma unroll + for (int k_unroll = 0; k_unroll < K_CACHE_DATA_LOAD_UNROLL; ++k_unroll) { + if (time_bounds_cond[k_unroll] && (tidx % THREADS_PER_KEY == 0)) { + if (params.relative_attention_bias != nullptr) { + qk[k_unroll] = add_vec(qk[k_unroll], + reinterpret_cast(params.relative_attention_bias)[hi * params.sequence_length * params.total_sequence_length + time_step[k_unroll]]); + } + qk_max = fmaxf(qk_max, qk[k_unroll]); + qk_smem[time_step[k_unroll]] = qk[k_unroll]; + } } - qk_max = fmaxf(qk_max, qk); - qk_smem[ti] = qk; } } @@ -504,18 +600,80 @@ __global__ void masked_multihead_attention_kernel(DecoderMaskedMultiHeadAttentio V_vec_acum out; zero(out); - // Loop over the timesteps to compute the partial outputs. - for (int ti = vo; ti < tlength; ti += V_PER_ITER) { - // Fetch offset based on cache_indir when beam sampling - const int beam_src = has_beams ? params.cache_indir[bi_max_seq_length + ti] : 0; - const int beam_offset = has_beams ? beam_src * params.num_heads * params.max_sequence_length * head_size : 0; + if (!params.kv_data_in_flight) { + // Loop over the timesteps to compute the partial outputs. + for (int ti = vo; ti < tlength; ti += V_PER_ITER) { + // Fetch offset based on cache_indir when beam sampling + const int beam_src = has_beams ? params.cache_indir[bi_max_seq_length + ti] : 0; + const int beam_offset = has_beams ? beam_src * params.num_heads * params.max_sequence_length * head_size : 0; + + // Load the values from the cache. + V_vec_k v = vec_conversion(*reinterpret_cast(&v_cache_batch[beam_offset + ti * head_size])); + + // Load the logits from shared memory. + T logit = logits_smem[ti]; + out = fma(logit, v, out); + } + } else { + // Loop over the timesteps to compute the partial outputs. + + // TODO(hasesh): Tune this value for different workloads. Currently, it is tuned for Whisper model + // Also tune it for different architectures. This works best for Whisper on 80GB A100. 
+ constexpr int V_CACHE_DATA_LOAD_UNROLL = 8; + + for (int ti = vo; ti < tlength; ti += V_CACHE_DATA_LOAD_UNROLL * V_PER_ITER) { + int beam_src[V_CACHE_DATA_LOAD_UNROLL]; + int beam_offset[V_CACHE_DATA_LOAD_UNROLL]; + int time_step[V_CACHE_DATA_LOAD_UNROLL]; + bool time_bounds_cond[V_CACHE_DATA_LOAD_UNROLL]; + +#pragma unroll + for (int v_unroll = 0; v_unroll < V_CACHE_DATA_LOAD_UNROLL; ++v_unroll) { + beam_src[v_unroll] = 0; + beam_offset[v_unroll] = 0; + time_step[v_unroll] = ti + v_unroll * V_PER_ITER; + time_bounds_cond[v_unroll] = (time_step[v_unroll] < tlength); + } + + int head_maxlength_headsize_prod = params.num_heads * params.max_sequence_length * head_size; + + if (has_beams) { +// Do the global memory read and corresponding compute in separate unrolled loops +#pragma unroll + for (int v_unroll = 0; v_unroll < V_CACHE_DATA_LOAD_UNROLL; ++v_unroll) { + if (time_bounds_cond[v_unroll]) { + beam_src[v_unroll] = params.cache_indir[bi_max_seq_length + time_step[v_unroll]]; + } + } + +#pragma unroll + for (int v_unroll = 0; v_unroll < V_CACHE_DATA_LOAD_UNROLL; ++v_unroll) { + if (time_bounds_cond[v_unroll]) { + beam_offset[v_unroll] = beam_src[v_unroll] * head_maxlength_headsize_prod; + } + } + } - // Load the values from the cache. - V_vec_k v = vec_conversion(*reinterpret_cast(&v_cache_batch[beam_offset + ti * head_size])); + // Load the values from the V-cache and logits from shared memory. + V_vec_k v[V_CACHE_DATA_LOAD_UNROLL]; + T logits[V_CACHE_DATA_LOAD_UNROLL]; - // Load the logits from shared memory. 
- T logit = logits_smem[ti]; - out = fma(logit, v, out); +// Do the global memory read and compute in separate unrolled loops +#pragma unroll + for (int v_unroll = 0; v_unroll < V_CACHE_DATA_LOAD_UNROLL; ++v_unroll) { + if (time_bounds_cond[v_unroll]) { + v[v_unroll] = vec_conversion(*reinterpret_cast(&v_cache_batch[beam_offset[v_unroll] + time_step[v_unroll] * head_size])); + logits[v_unroll] = logits_smem[time_step[v_unroll]]; + } + } + +#pragma unroll + for (int v_unroll = 0; v_unroll < V_CACHE_DATA_LOAD_UNROLL; ++v_unroll) { + if (time_bounds_cond[v_unroll]) { + out = fma(logits[v_unroll], v[v_unroll], out); + } + } + } } // One group of threads computes the product(s) for the current timestep. diff --git a/onnxruntime/contrib_ops/cuda/bert/fastertransformer_decoder_attention/decoder_masked_multihead_attention_impl.h b/onnxruntime/contrib_ops/cuda/bert/fastertransformer_decoder_attention/decoder_masked_multihead_attention_impl.h index 4b408dafa2d81..1a17757d1ec2d 100644 --- a/onnxruntime/contrib_ops/cuda/bert/fastertransformer_decoder_attention/decoder_masked_multihead_attention_impl.h +++ b/onnxruntime/contrib_ops/cuda/bert/fastertransformer_decoder_attention/decoder_masked_multihead_attention_impl.h @@ -22,6 +22,12 @@ struct DecoderMaskedMultiHeadAttentionParams : AttentionParameters { bool is_cross_attention = false; bool is_packed_qkv = false; + // Useful to better use global memory bandwidth on certain CUDA architectures. + // Turned off by default for now until we fully understand performance implications + // for all types of workloads. + // Can be turned on by appropriate environment variable (see attention_common.h). 
+ bool kv_data_in_flight = false; + void* q = nullptr; void* q_bias = nullptr; @@ -62,4 +68,4 @@ void mmha_launch_kernel(const DecoderMaskedMultiHeadAttentionParams& params, cud } // namespace cuda } // namespace contrib -} // namespace onnxruntime +} // namespace onnxruntime \ No newline at end of file diff --git a/onnxruntime/contrib_ops/rocm/fused_conv.cc b/onnxruntime/contrib_ops/rocm/fused_conv.cc index d597e0d57fbcb..63804f79a32fb 100644 --- a/onnxruntime/contrib_ops/rocm/fused_conv.cc +++ b/onnxruntime/contrib_ops/rocm/fused_conv.cc @@ -76,7 +76,12 @@ struct FNVHash { void HashConvolutionDescriptor(miopenConvolutionDescriptor_t cdesc) { int spatial_dim = 1; #if ROCM_VERSION >= 50500 - miopenGetConvolutionSpatialDim(cdesc, &spatial_dim); + MIOPEN_CALL(miopenGetConvolutionSpatialDim(cdesc, &spatial_dim)); + std::vector<int> pads(spatial_dim);  // sized ctor: brace-init would build a 1-element vector and the Nd query below would overflow it + std::vector<int> strides(spatial_dim); + std::vector<int> dilations(spatial_dim); + miopenConvolutionMode_t mode; + MIOPEN_CALL(miopenGetConvolutionNdDescriptor(cdesc, spatial_dim, &spatial_dim, pads.data(), strides.data(), dilations.data(), &mode)); #else // Previous versions of MIOpen doesn't provide API to probe the dimension of a // miopenConvolutionDescriptor_t, so we have to guess. 
@@ -100,11 +105,12 @@ struct FNVHash { pads.resize(spatial_dim); strides.resize(spatial_dim); dilations.resize(spatial_dim); +#endif (*this) << spatial_dim; (*this) << pads; (*this) << strides; (*this) << dilations; -#endif + (*this) << mode; } private: @@ -313,6 +319,8 @@ class FusedConv : public onnxruntime::rocm::Conv { auto ret = miopenCompileFusionPlan(handle, fusion->plan); if (miopenStatusSuccess == ret) { fusion->compiled_on.insert(handle); + } else { + return ret; } return miopenStatusSuccess; } diff --git a/onnxruntime/core/framework/session_options.h b/onnxruntime/core/framework/session_options.h index 40c59cfcf699d..796a018ac0f68 100644 --- a/onnxruntime/core/framework/session_options.h +++ b/onnxruntime/core/framework/session_options.h @@ -65,6 +65,11 @@ struct FreeDimensionOverride { * Configuration information for a session. */ struct SessionOptions { +#if defined(__wasm__) && defined(__EMSCRIPTEN_PTHREADS__) + static constexpr bool DEFAULT_USE_PER_SESSION_THREADS = false; +#else + static constexpr bool DEFAULT_USE_PER_SESSION_THREADS = true; +#endif ExecutionMode execution_mode = ExecutionMode::ORT_SEQUENTIAL; // set the execution order of the graph @@ -129,7 +134,8 @@ struct SessionOptions { // By default the session uses its own set of threadpools, unless this is set to false. // Use this in conjunction with the CreateEnvWithGlobalThreadPools API. - bool use_per_session_threads = true; + bool use_per_session_threads = DEFAULT_USE_PER_SESSION_THREADS; + bool thread_pool_allow_spinning = true; // Deterministic compute is likely not as performant. This option is default to false. 
diff --git a/onnxruntime/core/mlas/lib/sqnbitgemm.h b/onnxruntime/core/mlas/lib/sqnbitgemm.h index f8f7dcd43699f..90fdd710e2773 100644 --- a/onnxruntime/core/mlas/lib/sqnbitgemm.h +++ b/onnxruntime/core/mlas/lib/sqnbitgemm.h @@ -232,7 +232,7 @@ MlasSQNBitGemmOperation( size_t RowsRemaining = RangeCountM; while (RowsRemaining > 0) { -#if defined(MLAS_TARGET_AMD64_IX86) || defined(MLAS_TARGET_POWER) +#if defined(MLAS_TARGET_AMD64_IX86) || defined(MLAS_TARGET_POWER) || defined(MLAS_TARGET_LARCH64) auto RowsHandled = GetMlasPlatform().GemmFloatKernel( a_row, dequant_b, c_blk, K, RowsRemaining, CountN, lda, ldc, 1.f, true ); diff --git a/onnxruntime/core/optimizer/qdq_transformer/ensure_unique_dq_for_node_unit.cc b/onnxruntime/core/optimizer/qdq_transformer/ensure_unique_dq_for_node_unit.cc index cc0f7854791d4..9d53e28921784 100644 --- a/onnxruntime/core/optimizer/qdq_transformer/ensure_unique_dq_for_node_unit.cc +++ b/onnxruntime/core/optimizer/qdq_transformer/ensure_unique_dq_for_node_unit.cc @@ -53,7 +53,7 @@ Status DuplicateDQForOutputEdge(const graph_utils::GraphEdge& original_dq_output MakeString("Added by ", kTransformerName), dq_inputs, {&new_dq_output_nodearg}, - nullptr, // attributes + &original_dq_node.GetAttributes(), original_dq_node.Domain()); // set up edges diff --git a/onnxruntime/core/providers/acl/math/gemm.h b/onnxruntime/core/providers/acl/math/gemm.h index d2f297e83aedb..f5288d7f231b0 100644 --- a/onnxruntime/core/providers/acl/math/gemm.h +++ b/onnxruntime/core/providers/acl/math/gemm.h @@ -49,11 +49,18 @@ class Gemm : public onnxruntime::Gemm { } Status Compute(OpKernelContext* context) const override { +#ifdef ACL_2308 + if (this->packed_b_) { + // Prepacked RHS not supported, defaulting to cpu execution provider + return onnxruntime::Gemm::Compute(context); + } +#endif const auto A = context->Input(0); const auto B = context->Input(1); const auto C = context->Input(2); - GemmHelper helper(A->Shape(), trans_A_ != CblasNoTrans, B->Shape(), 
trans_B_ != CblasNoTrans, C->Shape()); + GemmHelper helper(A->Shape(), trans_A_ != CblasNoTrans, B->Shape(), trans_B_ != CblasNoTrans, + C != nullptr ? C->Shape() : TensorShape({})); if (!helper.State().IsOK()) return helper.State(); @@ -70,7 +77,7 @@ class Gemm : public onnxruntime::Gemm { return onnxruntime::Gemm::Compute(context); } - arm_compute::TensorShape cShape = ACLTensorShape(C->Shape()); + arm_compute::TensorShape cShape = ACLTensorShape(C != nullptr ? C->Shape() : TensorShape({})); if (useC && (cShape.num_dimensions() > 2 || (cShape.num_dimensions() == 2 && cShape[0] > 1 && cShape[1] > 1))) { // Multi-dimensional Bias @@ -89,8 +96,13 @@ class Gemm : public onnxruntime::Gemm { (cShape[1] == 1 && cShape[0] != (long unsigned int)N)) { return onnxruntime::Gemm::Compute(context); } +#ifdef ACL_2308 + cShape = arm_compute::TensorShape(N); + LOGS_DEFAULT(VERBOSE) << "Bias reshaped to: {" << N << "}"; +#else cShape = arm_compute::TensorShape(1, N); LOGS_DEFAULT(VERBOSE) << "Bias reshaped to: {1," << N << "}"; +#endif } int64_t K = helper.K(); diff --git a/onnxruntime/core/providers/acl/nn/batch_norm.cc b/onnxruntime/core/providers/acl/nn/batch_norm.cc index da7fff730c96f..eb6a10074f1db 100755 --- a/onnxruntime/core/providers/acl/nn/batch_norm.cc +++ b/onnxruntime/core/providers/acl/nn/batch_norm.cc @@ -44,6 +44,16 @@ Status BatchNorm::Compute(OpKernelContext* context) const { const Tensor* M = context->Input(3); // mean const Tensor* V = context->Input(4); // var + if (S->Shape().NumDimensions() > 1) { + LOGS_DEFAULT(WARNING) << "ACL does not support scale with dimension greater then 1; defaulting to cpu implementation"; + return onnxruntime::BatchNorm::Compute(context); + } + + if (this->is_train_) { + LOGS_DEFAULT(WARNING) << "ACL does not have batchnorm training support; defaulting to cpu implementation"; + return onnxruntime::BatchNorm::Compute(context); + } + ORT_RETURN_IF_ERROR(BatchNormHelper::ValidateInputs(X, S, B, M, V)); LOGS_DEFAULT(VERBOSE) << 
"BatchNorm ACL:"; @@ -70,7 +80,23 @@ Status BatchNorm::Compute(OpKernelContext* context) const { auto layer = std::make_shared(); +#ifdef ACL_2308 + arm_compute::TensorShape in_x_shape; + const TensorShape& x_shape = X->Shape(); + const auto& dims_vec = x_shape.GetDims(); + in_x_shape.set(3, onnxruntime::narrow(dims_vec[0])); // N + in_x_shape.set(1, 1); // H + size_t W = 1; + for (size_t i = 2; i < dims_vec.size(); ++i) { + W *= narrow(dims_vec[i]); + } + in_x_shape.set(0, W); // W + in_x_shape.set(2, onnxruntime::narrow(dims_vec[1])); // C + + tbatch_norm.in->allocator()->init(arm_compute::TensorInfo(in_x_shape, arm_compute::Format::F32)); +#else tbatch_norm.in->allocator()->init(arm_compute::TensorInfo(ACLTensorShape(X->Shape()), arm_compute::Format::F32)); +#endif tbatch_norm.out->allocator()->init(arm_compute::TensorInfo(tbatch_norm.in->info()->tensor_shape(), arm_compute::Format::F32)); tbatch_norm.scale->allocator()->init(arm_compute::TensorInfo(ACLTensorShape(S->Shape()), arm_compute::Format::F32)); @@ -132,11 +158,7 @@ ONNX_OPERATOR_VERSIONED_KERNEL_EX( 7, 9, kAclExecutionProvider, KernelDefBuilder() - .TypeConstraint("X", DataTypeImpl::GetTensorType()) - .TypeConstraint("scale", DataTypeImpl::GetTensorType()) - .TypeConstraint("B", DataTypeImpl::GetTensorType()) - .TypeConstraint("mean", DataTypeImpl::GetTensorType()) - .TypeConstraint("var", DataTypeImpl::GetTensorType()), + .TypeConstraint("T", DataTypeImpl::GetTensorType()), BatchNorm); } // namespace acl diff --git a/onnxruntime/core/providers/acl/nn/batch_norm.h b/onnxruntime/core/providers/acl/nn/batch_norm.h index c9ec08b67a779..264301976e6dc 100755 --- a/onnxruntime/core/providers/acl/nn/batch_norm.h +++ b/onnxruntime/core/providers/acl/nn/batch_norm.h @@ -31,9 +31,9 @@ typedef struct { typedef std::map::iterator BatchNormLayersIterator; template -class BatchNorm final : public OpKernel { +class BatchNorm : public onnxruntime::BatchNorm { public: - explicit BatchNorm(const OpKernelInfo& info) : 
OpKernel(info) { + explicit BatchNorm(const OpKernelInfo& info) : onnxruntime::BatchNorm(info) { auto st = info.GetAttr("epsilon", &epsilon_); ORT_ENFORCE(st.IsOK(), st.ErrorMessage()); diff --git a/onnxruntime/core/providers/acl/nn/conv.cc b/onnxruntime/core/providers/acl/nn/conv.cc index 1613d927d0f74..85bd0cfe96279 100644 --- a/onnxruntime/core/providers/acl/nn/conv.cc +++ b/onnxruntime/core/providers/acl/nn/conv.cc @@ -105,7 +105,11 @@ Status Conv::Compute(OpKernelContext* context) const { TensorShapeVector Y_dims; Y_dims.insert(Y_dims.begin(), {N, M}); TensorShape input_shape = X->Shape().Slice(2); +#ifdef ACL_2308 + ORT_RETURN_IF_ERROR(conv_attrs_.InferPadsAndOutputShape(input_shape, kernel_shape, strides, dilations, pads, Y_dims)); +#else ORT_RETURN_IF_ERROR(conv_attrs_.InferOutputShape(input_shape, kernel_shape, strides, dilations, pads, Y_dims)); +#endif Tensor* Y = context->Output(0, TensorShape(Y_dims)); LOGS_DEFAULT(VERBOSE) << "Y " << Y->Shape().ToString().c_str(); @@ -222,6 +226,15 @@ Status Conv::Compute(OpKernelContext* context) const { 1 /* depth multiplier */, acl_activ_enabled ? arm_compute::ActivationLayerInfo(acl_activ_func, conv_attrs_.alpha) : arm_compute::ActivationLayerInfo(), arm_compute::Size2D(aclDilation0, dilations[0]))); +#elif defined(ACL_2308) + bool optimizable = bool(arm_compute::NEDepthwiseConvolutionLayer::validate(tconv.in->info(), + tconv.k->info(), + (B != nullptr) ? tconv.b->info() : nullptr, + tconv.out->info(), + aclPadStride, + 1 /* depth multiplier */, + acl_activ_enabled ? 
arm_compute::ActivationLayerInfo(acl_activ_func, conv_attrs_.alpha) : arm_compute::ActivationLayerInfo(), + arm_compute::Size2D(aclDilation0, dilations[0]))); #endif if (optimizable) { @@ -230,7 +243,7 @@ Status Conv::Compute(OpKernelContext* context) const { auto layer = std::make_shared(); #elif defined(ACL_1908) auto layer = std::make_shared(); -#elif defined(ACL_2002) +#elif defined(ACL_2002) || defined(ACL_2308) auto layer = std::make_shared(); #endif @@ -238,7 +251,7 @@ Status Conv::Compute(OpKernelContext* context) const { layer->configure(tconv.in.get(), tconv.k.get(), (B != nullptr) ? tconv.b.get() : nullptr, tconv.out.get(), aclPadStride, 1 /* depth multiplier */, acl_activ_enabled ? arm_compute::ActivationLayerInfo(acl_activ_func, conv_attrs_.alpha) : arm_compute::ActivationLayerInfo()); -#elif defined(ACL_1905) || defined(ACL_1908) || defined(ACL_2002) +#elif defined(ACL_1905) || defined(ACL_1908) || defined(ACL_2002) || defined(ACL_2308) layer->configure(tconv.in.get(), tconv.k.get(), (B != nullptr) ? tconv.b.get() : nullptr, tconv.out.get(), aclPadStride, 1 /* depth multiplier */, acl_activ_enabled ? 
arm_compute::ActivationLayerInfo(acl_activ_func, conv_attrs_.alpha) : arm_compute::ActivationLayerInfo(), diff --git a/onnxruntime/core/providers/acl/nn/conv.h b/onnxruntime/core/providers/acl/nn/conv.h index ecb11fb3c8f4e..660d47b4172df 100644 --- a/onnxruntime/core/providers/acl/nn/conv.h +++ b/onnxruntime/core/providers/acl/nn/conv.h @@ -8,6 +8,9 @@ #include "core/providers/acl/acl_execution_provider.h" // ACL +#ifdef ACL_2308 +#include "arm_compute/runtime/Tensor.h" +#endif #include "arm_compute/core/TensorInfo.h" #include "arm_compute/runtime/TensorAllocator.h" #include "arm_compute/runtime/Allocator.h" diff --git a/onnxruntime/core/providers/acl/nn/pool.cc b/onnxruntime/core/providers/acl/nn/pool.cc index dc79ae65bf21e..8fbcba3ed87a7 100644 --- a/onnxruntime/core/providers/acl/nn/pool.cc +++ b/onnxruntime/core/providers/acl/nn/pool.cc @@ -61,7 +61,14 @@ ACLNEPool PoolOperation(onnxruntime::OpKernelContext* context, tpool.out->allocator()->init(arm_compute::TensorInfo(ACLTensorShape(Y->Shape(), PREF_DIM), arm_compute::Format::F32)); if (pool_attrs.global_pooling) { - layer->configure(tpool.in.get(), tpool.out.get(), arm_compute::PoolingLayerInfo(pool_type)); + layer->configure(tpool.in.get(), + tpool.out.get(), + arm_compute::PoolingLayerInfo(pool_type +#ifdef ACL_2308 + , + arm_compute::DataLayout::NCHW +#endif + )); } else { TensorShapeVector aclStrides(2); aclStrides[0] = (strides.size() == 2) ? 
strides[1] : 1; @@ -104,7 +111,13 @@ ACLNEPool PoolOperation(onnxruntime::OpKernelContext* context, LOGS_DEFAULT(VERBOSE) << "strides: {" << aclStrides[0] << "," << aclStrides[1] << "}"; LOGS_DEFAULT(VERBOSE) << "excludePadding: " << excludePadding; - arm_compute::PoolingLayerInfo pool_info(pool_type, aclSize, aclPadStride, excludePadding); + arm_compute::PoolingLayerInfo pool_info(pool_type, + aclSize, +#ifdef ACL_2308 + arm_compute::DataLayout::NCHW, +#endif + aclPadStride, + excludePadding); layer->configure(tpool.in.get(), tpool.out.get(), pool_info); } diff --git a/onnxruntime/core/providers/acl/tensor/concat.cc b/onnxruntime/core/providers/acl/tensor/concat.cc index 081472729cfcf..75eedaac80aea 100644 --- a/onnxruntime/core/providers/acl/tensor/concat.cc +++ b/onnxruntime/core/providers/acl/tensor/concat.cc @@ -10,6 +10,8 @@ #include "core/providers/acl/acl_common.h" #include "core/providers/acl/acl_fwd.h" +#include + #define PREF_DIM 4 namespace onnxruntime { @@ -22,17 +24,27 @@ Status Concat::Compute(OpKernelContext* ctx) const { return onnxruntime::Concat::Compute(ctx); } + if (axis_ < 0) { + LOGS_DEFAULT(WARNING) << "ACL does not have support for negative axis; defaulting to cpu implementation"; + return onnxruntime::Concat::Compute(ctx); + } + // Number of input tensors to concatenate auto input_count = Node().InputArgCount().front(); // Hold pointers to the input tensors to be used in the PrepareForCompute() step std::vector input_tensors; - input_tensors.reserve(input_count); + int empty_tensors = 0; for (int i = 0; i < input_count; ++i) { + if (ctx->Input(i)->Shape().Size() == 0) { + empty_tensors++; + continue; + } input_tensors.push_back(ctx->Input(i)); } + input_count -= empty_tensors; - auto output_dims = input_tensors[0]->Shape().AsShapeVector(); + auto output_dims = ctx->Input(0)->Shape().AsShapeVector(); // 'Concat' mode if (!is_stack_) { @@ -64,7 +76,11 @@ Status Concat::Compute(OpKernelContext* ctx) const { LOGS_DEFAULT(VERBOSE) << "Concat 
ACL:"; arm_compute::Tensor output; +#ifdef ACL_2308 + std::vector inputs_vector; +#else std::vector inputs_vector; +#endif for (int i = 0; i < input_count; i++) { arm_compute::Tensor* input = new arm_compute::Tensor(); auto X = input_tensors[i]; @@ -75,7 +91,9 @@ Status Concat::Compute(OpKernelContext* ctx) const { } arm_compute::NEConcatenateLayer layer; - layer.configure(inputs_vector, &output, 3 - axis_); + if (input_count > 0) { + layer.configure(inputs_vector, &output, 3 - axis_); + } LOGS_DEFAULT(VERBOSE) << "axis: " << axis_; LOGS_DEFAULT(VERBOSE) << std::endl; @@ -83,7 +101,11 @@ Status Concat::Compute(OpKernelContext* ctx) const { for (int i = 0; i < input_count; i++) { auto X = input_tensors[i]; const T* x_data = X->Data(); +#ifdef ACL_2308 + arm_compute::Tensor* in = const_cast(static_cast(inputs_vector[i])); +#else arm_compute::Tensor* in = static_cast(inputs_vector[i]); +#endif if (X->Shape().Size() != 0 && in->info()->has_padding()) { in->allocator()->allocate(); @@ -101,7 +123,9 @@ Status Concat::Compute(OpKernelContext* ctx) const { ACLImportMemory(output.allocator(), (void*)y_data, Y->Shape().Size() * 4); } - layer.run(); + if (input_count > 0) { + layer.run(); + } if (Y->Shape().Size() != 0 && output.info()->has_padding()) { importDataFromTensor(&output, y_data); diff --git a/onnxruntime/core/providers/cpu/cpu_execution_provider.cc b/onnxruntime/core/providers/cpu/cpu_execution_provider.cc index 75f6f8d2eddd5..9cd0b3d0620af 100644 --- a/onnxruntime/core/providers/cpu/cpu_execution_provider.cc +++ b/onnxruntime/core/providers/cpu/cpu_execution_provider.cc @@ -989,6 +989,7 @@ class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 20, Float8E5M2FNUZ, IsNaN); #endif class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 20, IsInf); +class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 20, StringConcat); class 
ONNX_OPERATOR_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 20, RegexFullMatch); // !!PLEASE READ BELOW!! Following that, add new entries above this comment @@ -2448,6 +2449,7 @@ Status RegisterOnnxOperatorKernels(KernelRegistry& kernel_registry) { BuildKernelCreateInfo, #endif BuildKernelCreateInfo, + BuildKernelCreateInfo, BuildKernelCreateInfo, }; diff --git a/onnxruntime/core/providers/cpu/text/string_concat.cc b/onnxruntime/core/providers/cpu/text/string_concat.cc new file mode 100644 index 0000000000000..bc626f8e055aa --- /dev/null +++ b/onnxruntime/core/providers/cpu/text/string_concat.cc @@ -0,0 +1,60 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#include "string_concat.h" +#include "core/providers/cpu/math/element_wise_ops.h" +#include "core/common/common.h" + +namespace onnxruntime { +ONNX_CPU_OPERATOR_KERNEL(StringConcat, 20, + KernelDefBuilder().TypeConstraint("T", DataTypeImpl::GetTensorType()), + StringConcat); + +Status StringConcat::Compute(OpKernelContext* context) const { + ProcessBroadcastSpanFuncs broadcast_funcs{[](BroadcastHelper& broadcast_helper) { + auto x = broadcast_helper.ScalarInput0(); + auto y = broadcast_helper.SpanInput1(); + auto y_iter = y.begin(); + auto output_iter = broadcast_helper.OutputSpan().begin(); + const auto x_size = x.length(); + while (y_iter != y.end()) { + output_iter->reserve(x_size + y_iter->length()); + output_iter->append(x); + output_iter->append(*y_iter); + y_iter++; + output_iter++; + } + }, + [](BroadcastHelper& broadcast_helper) { + auto x = broadcast_helper.SpanInput0(); + auto x_iter = x.begin(); + auto y = broadcast_helper.ScalarInput1(); + auto output_iter = broadcast_helper.OutputSpan().begin(); + const auto y_size = y.length(); + while (x_iter != x.end()) { + output_iter->reserve(y_size + x_iter->length()); + output_iter->append(*x_iter); + output_iter->append(y); + x_iter++; + output_iter++; + } + }, + [](BroadcastHelper& 
broadcast_helper) { + auto x_iter = broadcast_helper.SpanInput0().begin(); + auto y_iter = broadcast_helper.SpanInput1().begin(); + auto output = broadcast_helper.OutputSpan(); + auto output_iter = output.begin(); + while (output_iter != output.end()) { + output_iter->reserve(x_iter->length() + y_iter->length()); + output_iter->append(*x_iter); + output_iter->append(*y_iter); + x_iter++; + y_iter++; + output_iter++; + } + }}; + UntypedBroadcastTwo(*context, broadcast_funcs); + return Status::OK(); +} + +} // namespace onnxruntime diff --git a/onnxruntime/core/providers/cpu/text/string_concat.h b/onnxruntime/core/providers/cpu/text/string_concat.h new file mode 100644 index 0000000000000..63c1ea8a41146 --- /dev/null +++ b/onnxruntime/core/providers/cpu/text/string_concat.h @@ -0,0 +1,17 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#pragma once + +#include "core/framework/op_kernel.h" + +namespace onnxruntime { + +class StringConcat final : public OpKernel { + public: + StringConcat(const OpKernelInfo& info) : OpKernel(info) {} + + Status Compute(OpKernelContext* context) const override; +}; + +} // namespace onnxruntime diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/GraphDescBuilder.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/GraphDescBuilder.cpp index adb4fd131119f..c6a15e76f4736 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/GraphDescBuilder.cpp +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/GraphDescBuilder.cpp @@ -360,7 +360,7 @@ namespace Dml::GraphDescBuilder // The tensor description's size should be no larger than the constant input unless it was rounded to // the required alignment. 
assert(((constantInput->GetTensorByteSize() + 3) & ~3) >= tensorDesc->totalTensorSizeInBytes); - size_t minimumConstantSize = std::min(constantInput->GetTensorByteSize(), tensorDesc->totalTensorSizeInBytes); + size_t minimumConstantSize = std::min(constantInput->GetTensorByteSize(), gsl::narrow_cast(tensorDesc->totalTensorSizeInBytes)); auto data = static_cast(constantInput->GetData()); std::vector tensorData(data, data + minimumConstantSize); diff --git a/onnxruntime/core/providers/js/js_kernel.h b/onnxruntime/core/providers/js/js_kernel.h index 5c2d1f0b881ba..b850bea4bc275 100644 --- a/onnxruntime/core/providers/js/js_kernel.h +++ b/onnxruntime/core/providers/js/js_kernel.h @@ -67,6 +67,7 @@ namespace js { float value; \ ORT_ENFORCE(info.GetAttr(#attr_name, &value));, \ , ({#attr_name : $1}), static_cast(value)) +#define JSEP_HEAP_PTR(ptr) reinterpret_cast(ptr) // TODO: // class JsMultiProgramKernel : public OpKernel { /* TBD */ }; diff --git a/onnxruntime/core/providers/js/operators/conv.h b/onnxruntime/core/providers/js/operators/conv.h index 5c0fbf93a4004..98a530c6b77f6 100644 --- a/onnxruntime/core/providers/js/operators/conv.h +++ b/onnxruntime/core/providers/js/operators/conv.h @@ -54,13 +54,13 @@ class ConvBase : public JsKernel { static_cast(conv_attrs_.group), static_cast(kernel_shape_0), static_cast(local_pads.size()), - reinterpret_cast(local_pads.size() > 0 ? local_pads.data() : nullptr) >> 2, + JSEP_HEAP_PTR(local_pads.size() > 0 ? local_pads.data() : nullptr) >> 2, static_cast(conv_attrs_.strides.size() > 0 ? conv_attrs_.strides[0] : 0), static_cast(channels_last), - reinterpret_cast(&w_is_const_), + JSEP_HEAP_PTR(&w_is_const_), conv_attrs_.activation.c_str(), activation_params.size(), - reinterpret_cast(activation_params_ptr) >> 2); + JSEP_HEAP_PTR(activation_params_ptr) >> 2); } else { JSEP_INIT_KERNEL_ATTRIBUTE(Conv, ({ "format" : $11 ? 
"NHWC" : "NCHW", @@ -81,14 +81,14 @@ class ConvBase : public JsKernel { static_cast(kernel_shape_0), static_cast(kernel_shape_1), static_cast(local_pads.size()), - reinterpret_cast(local_pads.size() > 0 ? local_pads.data() : nullptr) >> 2, + JSEP_HEAP_PTR(local_pads.size() > 0 ? local_pads.data() : nullptr) >> 2, static_cast(conv_attrs_.strides.size() > 0 ? conv_attrs_.strides[0] : 0), static_cast(conv_attrs_.strides.size() > 1 ? conv_attrs_.strides[1] : 0), static_cast(channels_last), - reinterpret_cast(&w_is_const_), + JSEP_HEAP_PTR(&w_is_const_), conv_attrs_.activation.c_str(), activation_params.size(), - reinterpret_cast(activation_params_ptr) >> 2); + JSEP_HEAP_PTR(activation_params_ptr) >> 2); } } diff --git a/onnxruntime/core/providers/js/operators/conv_transpose.h b/onnxruntime/core/providers/js/operators/conv_transpose.h index 5d30dc851e00f..353a946e95c21 100644 --- a/onnxruntime/core/providers/js/operators/conv_transpose.h +++ b/onnxruntime/core/providers/js/operators/conv_transpose.h @@ -64,11 +64,11 @@ class ConvTranspose : public JsKernel { static_cast(pads_1), static_cast(strides), static_cast(channels_last), - reinterpret_cast(&w_is_const_), + JSEP_HEAP_PTR(&w_is_const_), gsl::narrow_cast(local_output_padding.size()), - reinterpret_cast(local_output_padding_ptr) >> 2, + JSEP_HEAP_PTR(local_output_padding_ptr) >> 2, gsl::narrow_cast(local_output_shape.size()), - reinterpret_cast(local_output_shape_ptr) >> 2, + JSEP_HEAP_PTR(local_output_shape_ptr) >> 2, conv_transpose_attrs_.activation.c_str()); } else { constexpr size_t pads_vec_size = 4; @@ -114,17 +114,17 @@ class ConvTranspose : public JsKernel { "activation" : UTF8ToString($13) }), static_cast(conv_transpose_attrs_.auto_pad), - reinterpret_cast(local_dilations.data()) >> 2, + JSEP_HEAP_PTR(local_dilations.data()) >> 2, static_cast(conv_transpose_attrs_.group), - reinterpret_cast(local_kernel_shape.data()) >> 2, - reinterpret_cast(local_pads.data()) >> 2, - reinterpret_cast(local_strides.data()) 
>> 2, + JSEP_HEAP_PTR(local_kernel_shape.data()) >> 2, + JSEP_HEAP_PTR(local_pads.data()) >> 2, + JSEP_HEAP_PTR(local_strides.data()) >> 2, static_cast(channels_last), - reinterpret_cast(&w_is_const_), + JSEP_HEAP_PTR(&w_is_const_), gsl::narrow_cast(local_output_padding.size()), - reinterpret_cast(local_output_padding_ptr) >> 2, + JSEP_HEAP_PTR(local_output_padding_ptr) >> 2, gsl::narrow_cast(local_output_shape.size()), - reinterpret_cast(local_output_shape_ptr) >> 2, + JSEP_HEAP_PTR(local_output_shape_ptr) >> 2, conv_transpose_attrs_.activation.c_str()); } } diff --git a/onnxruntime/core/providers/js/operators/pad.h b/onnxruntime/core/providers/js/operators/pad.h index 19168f40b4722..bf808be949cf8 100644 --- a/onnxruntime/core/providers/js/operators/pad.h +++ b/onnxruntime/core/providers/js/operators/pad.h @@ -26,7 +26,7 @@ class Pad : public JsKernel, public PadBase { static_cast(mode_), static_cast(value_), gsl::narrow_cast(pads.size()), - reinterpret_cast((pads.size() > 0) ? pads.data() : nullptr) >> 2); + JSEP_HEAP_PTR((pads.size() > 0) ? 
pads.data() : nullptr) >> 2); } }; diff --git a/onnxruntime/core/providers/js/operators/reduce.h b/onnxruntime/core/providers/js/operators/reduce.h index a5a4aa834c2ca..95c4f2bec230d 100644 --- a/onnxruntime/core/providers/js/operators/reduce.h +++ b/onnxruntime/core/providers/js/operators/reduce.h @@ -8,29 +8,29 @@ namespace onnxruntime { namespace js { -#define JSEP_DEFINE_REDUCE_KERNEL(ReduceKernel) \ - template \ - class ReduceKernel : public JsKernel, public ReduceKernelBase { \ - public: \ - using ReduceKernelBase::axes_; \ - using ReduceKernelBase::noop_with_empty_axes_; \ - using ReduceKernelBase::keepdims_; \ - ReduceKernel(const OpKernelInfo& info) : JsKernel(info), ReduceKernelBase(info) { \ - std::vector axes(axes_.size()); \ - if (axes_.size() > 0) { \ - std::transform(axes_.begin(), axes_.end(), axes.begin(), \ - [](int64_t axis) { return gsl::narrow_cast(axis); }); \ - } \ - JSEP_INIT_KERNEL_ATTRIBUTE(ReduceKernel, ({ \ - "keepDims" : !!$1, \ - "noopWithEmptyAxes" : !!$2, \ - "axes" : $3 ? (Array.from(HEAP32.subarray($4, $4 + $3))) : [], \ - }), \ - static_cast(keepdims_), \ - static_cast(noop_with_empty_axes_), \ - gsl::narrow_cast(axes.size()), \ - reinterpret_cast((axes.size() > 0) ? axes.data() : nullptr) >> 2); \ - } \ +#define JSEP_DEFINE_REDUCE_KERNEL(ReduceKernel) \ + template \ + class ReduceKernel : public JsKernel, public ReduceKernelBase { \ + public: \ + using ReduceKernelBase::axes_; \ + using ReduceKernelBase::noop_with_empty_axes_; \ + using ReduceKernelBase::keepdims_; \ + ReduceKernel(const OpKernelInfo& info) : JsKernel(info), ReduceKernelBase(info) { \ + std::vector axes(axes_.size()); \ + if (axes_.size() > 0) { \ + std::transform(axes_.begin(), axes_.end(), axes.begin(), \ + [](int64_t axis) { return gsl::narrow_cast(axis); }); \ + } \ + JSEP_INIT_KERNEL_ATTRIBUTE(ReduceKernel, ({ \ + "keepDims" : !!$1, \ + "noopWithEmptyAxes" : !!$2, \ + "axes" : $3 ? 
(Array.from(HEAP32.subarray($4, $4 + $3))) : [], \ + }), \ + static_cast(keepdims_), \ + static_cast(noop_with_empty_axes_), \ + gsl::narrow_cast(axes.size()), \ + JSEP_HEAP_PTR((axes.size() > 0) ? axes.data() : nullptr) >> 2); \ + } \ }; JSEP_DEFINE_REDUCE_KERNEL(ReduceMax); diff --git a/onnxruntime/core/providers/js/operators/resize.h b/onnxruntime/core/providers/js/operators/resize.h index 65854222ba988..4b1c288ae3015 100644 --- a/onnxruntime/core/providers/js/operators/resize.h +++ b/onnxruntime/core/providers/js/operators/resize.h @@ -34,7 +34,7 @@ class Resize : public JsKernel, public UpsampleBase { }), static_cast(antialias_), gsl::narrow_cast(axes.size()), - reinterpret_cast((axes.size() > 0) ? axes.data() : nullptr) >> 2, + JSEP_HEAP_PTR((axes.size() > 0) ? axes.data() : nullptr) >> 2, resize_coordinate_transformation_mode.c_str(), static_cast(cubic_coeff_a_), static_cast(exclude_outside_), diff --git a/onnxruntime/core/providers/js/operators/slice.h b/onnxruntime/core/providers/js/operators/slice.h index 6792997025d65..989adabf029a5 100644 --- a/onnxruntime/core/providers/js/operators/slice.h +++ b/onnxruntime/core/providers/js/operators/slice.h @@ -24,11 +24,11 @@ class Slice : public JsKernel, public SliceBase { "ends" : $3 ? Array.from(HEAP32.subarray($4, $4 + $3)) : [], "axes" : $5 ? Array.from(HEAP32.subarray($6, $6 + $5)) : []}), gsl::narrow_cast(starts.size()), - reinterpret_cast((starts.size() > 0) ? starts.data() : nullptr) >> 2, + JSEP_HEAP_PTR((starts.size() > 0) ? starts.data() : nullptr) >> 2, gsl::narrow_cast(ends.size()), - reinterpret_cast((ends.size() > 0) ? ends.data() : nullptr) >> 2, + JSEP_HEAP_PTR((ends.size() > 0) ? ends.data() : nullptr) >> 2, gsl::narrow_cast(axes.size()), - reinterpret_cast((axes.size() > 0) ? axes.data() : nullptr) >> 2); + JSEP_HEAP_PTR((axes.size() > 0) ? 
axes.data() : nullptr) >> 2); } }; diff --git a/onnxruntime/core/providers/js/operators/split.h b/onnxruntime/core/providers/js/operators/split.h index cfacc1aa6a363..1c1874e5aa98e 100644 --- a/onnxruntime/core/providers/js/operators/split.h +++ b/onnxruntime/core/providers/js/operators/split.h @@ -53,7 +53,7 @@ class Split : public JsKernel, public SplitBase { static_cast(axis_), static_cast(num_outputs_), gsl::narrow_cast(split_sizes.size()), - reinterpret_cast((split_sizes.size() > 0) ? split_sizes.data() : nullptr) >> 2); + JSEP_HEAP_PTR((split_sizes.size() > 0) ? split_sizes.data() : nullptr) >> 2); } }; diff --git a/onnxruntime/core/providers/js/operators/transpose.h b/onnxruntime/core/providers/js/operators/transpose.h index 311badbde0d11..dae442b9f5a13 100644 --- a/onnxruntime/core/providers/js/operators/transpose.h +++ b/onnxruntime/core/providers/js/operators/transpose.h @@ -27,7 +27,7 @@ class Transpose final : public JsKernel, public TransposeBase { gsl::narrow_cast(perm_specified_ ? perm_.size() : 0), // $2: index to HEAP32 of the first int32 element. calculated from right shift memory // address by 2 - reinterpret_cast(perm_specified_ && !perm.empty() ? perm.data() : nullptr) >> 2); + JSEP_HEAP_PTR(perm_specified_ && !perm.empty() ? 
perm.data() : nullptr) >> 2); } }; diff --git a/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider_custom_ops.h b/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider_custom_ops.h index 35bd38d818979..b19d9ab0f66d0 100644 --- a/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider_custom_ops.h +++ b/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider_custom_ops.h @@ -56,6 +56,14 @@ struct TensorRTCustomOp : Ort::CustomOpBaseIsSubgraph()) { + const auto& current_initializers = cur_graph->GetAllInitializedTensors(); + all_initializers.insert(current_initializers.begin(), current_initializers.end()); + cur_graph = cur_graph->ParentGraph(); + } + // Collect initializers in top-level graph. + const auto& current_initializers = cur_graph->GetAllInitializedTensors(); + all_initializers.insert(current_initializers.begin(), current_initializers.end()); + } + + return all_initializers; +} + bool GetShape(const NodeArg& node_arg, std::vector& shape, const logging::Logger& logger) { const auto* shape_proto = node_arg.Shape(); if (!shape_proto) { diff --git a/onnxruntime/core/providers/webnn/builders/helper.h b/onnxruntime/core/providers/webnn/builders/helper.h index 8b8b85339a87c..ea57ab1af19af 100644 --- a/onnxruntime/core/providers/webnn/builders/helper.h +++ b/onnxruntime/core/providers/webnn/builders/helper.h @@ -35,6 +35,9 @@ typedef struct { bool isCpuSupported; // The WebNN CPU backend XNNPack supports it (not about the CPU EP). } WebnnOpInfo; +// Collects all the initializer tensors in the subGraph and its ancestor graphs. 
+InitializedTensorSet CollectAllInitializedTensors(const GraphViewer& graph_viewer); + bool GetShape(const NodeArg& node_arg, std::vector& shape, const logging::Logger& logger); template @@ -139,7 +142,7 @@ static const InlinedHashMap op_map = { {"ArgMax", {"argMax", false}}, {"ArgMin", {"argMin", false}}, {"AveragePool", {"averagePool2d", true}}, - {"BatchNormalization", {"meanVarianceNormalization", false}}, + {"BatchNormalization", {"batchNormalization", false}}, {"Cast", {"cast", false}}, {"Ceil", {"ceil", true}}, {"Clip", {"clamp", true}}, @@ -162,12 +165,11 @@ static const InlinedHashMap op_map = { {"GlobalLpPool", {"l2Pool2d", false}}, {"Greater", {"greater", false}}, {"GreaterOrEqual", {"greaterOrEqual", false}}, - {"GroupNormalization", {"meanVarianceNormalization", false}}, {"HardSigmoid", {"hardSigmoid", false}}, {"HardSwish", {"hardSwish", true}}, {"Identity", {"identity", false}}, - {"InstanceNormalization", {"meanVarianceNormalization", false}}, - {"LayerNormalization", {"meanVarianceNormalization", false}}, + {"InstanceNormalization", {"instanceNormalization", false}}, + {"LayerNormalization", {"layerNormalization", false}}, {"LeakyRelu", {"leakyRelu", true}}, {"Less", {"lesser", false}}, {"LessOrEqual", {"lesserOrEqual", false}}, diff --git a/onnxruntime/core/providers/webnn/builders/impl/normalization_op_builder.cc b/onnxruntime/core/providers/webnn/builders/impl/normalization_op_builder.cc index 756a838cc0c3e..4d2470dfe7deb 100644 --- a/onnxruntime/core/providers/webnn/builders/impl/normalization_op_builder.cc +++ b/onnxruntime/core/providers/webnn/builders/impl/normalization_op_builder.cc @@ -27,8 +27,6 @@ class NormalizationOpBuilder : public BaseOpBuilder { const WebnnDeviceType /* device_type */, const logging::Logger& logger) const override; }; -// All normalization are based on layout NCHW. -// TODO: add support for NHWC. 
Status NormalizationOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, const Node& node, const logging::Logger& logger) const { @@ -61,49 +59,13 @@ Status NormalizationOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder ORT_RETURN_IF_NOT(bias_shape == scale_shape, "The bias' shape should be equal to scale's shape."); } - std::vector new_scale_shape; - if (scale_size < rank) { - if (op_type == "BatchNormalization") { - scale_shape.insert(scale_shape.begin(), 1); - scale_shape.insert(scale_shape.end(), rank - 2, 1); - } else if (op_type == "LayerNormalization") { - // Align right with leading ones. - scale_shape.insert(scale_shape.begin(), rank - scale_size, 1); - } else if (op_type == "InstanceNormalization") { - // Insert ones before and after the channel dimension. - scale_shape.insert(scale_shape.begin(), 1); - ORT_RETURN_IF(scale_size != 1 || rank < 2, - "The scale size should be 1 and rank should be at least 2 for InstanceNorm."); - scale_shape.insert(scale_shape.end(), rank - scale_size - 1, 1); - } else if (op_type == "GroupNormalization") { - // The input will be reshaped to 3D later. So just insert ones before the channel and after. 
- scale_shape.insert(scale_shape.begin(), 1); - scale_shape.insert(scale_shape.end(), 1); - } else { - return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "Unsupported normalization op: ", op_type); - } + emscripten::val scale = model_builder.GetOperand(input_defs[1]->Name()); + options.set("scale", scale); - std::transform(scale_shape.cbegin(), scale_shape.cend(), - std::back_inserter(new_scale_shape), - [](int64_t dim) -> uint32_t { return SafeInt(dim); }); - emscripten::val reshape_scale = model_builder.GetOperand(input_defs[1]->Name()); - emscripten::val reshape_output_scale = - model_builder.GetBuilder().call("reshape", reshape_scale, emscripten::val::array(new_scale_shape)); - options.set("scale", reshape_output_scale); - - if (input_defs.size() >= 3 && !input_defs[2]->Name().empty()) { - // Bias input exists, and bias's shape is the same as scale's shape. - emscripten::val reshape_bias = model_builder.GetOperand(input_defs[2]->Name()); - emscripten::val reshape_output_bias = - model_builder.GetBuilder().call("reshape", reshape_bias, emscripten::val::array(new_scale_shape)); - options.set("bias", reshape_output_bias); - } - } else { - options.set("scale", model_builder.GetOperand(input_defs[1]->Name())); - if (input_defs.size() >= 3 && !input_defs[2]->Name().empty()) { - // Bias input exists, and bias's shape is the same as scale's shape. - options.set("bias", model_builder.GetOperand(input_defs[2]->Name())); - } + if (input_defs.size() >= 3 && !input_defs[2]->Name().empty()) { + // Bias input exists, and bias's shape is the same as scale's shape. 
+ emscripten::val bias = model_builder.GetOperand(input_defs[2]->Name()); + options.set("bias", bias); } NodeAttrHelper helper(node); @@ -114,56 +76,62 @@ Status NormalizationOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder ORT_RETURN_IF_NOT(input_defs.size() == 5, "BatchNormalization requires five inputs."); emscripten::val mean = model_builder.GetOperand(input_defs[3]->Name()); emscripten::val variance = model_builder.GetOperand(input_defs[4]->Name()); - // Enlarge 1-D mean and variance to new scale shape. - emscripten::val reshape_mean = - model_builder.GetBuilder().call("reshape", mean, emscripten::val::array(new_scale_shape)); - emscripten::val reshape_variance = - model_builder.GetBuilder().call("reshape", variance, emscripten::val::array(new_scale_shape)); - - std::vector axes = {0}; - for (uint32_t i = 2; i < rank; i++) { - axes.push_back(i); + if (model_builder.GetPreferredLayout() == DataLayout::NHWC) { + options.set("axis", rank - 1); } - - options.set("axes", emscripten::val::array(axes)); - options.set("mean", reshape_mean); - options.set("variance", reshape_variance); - output = model_builder.GetBuilder().call("meanVarianceNormalization", input, options); + output = model_builder.GetBuilder().call("batchNormalization", input, mean, variance, options); } else if (op_type == "LayerNormalization") { int64_t axis = helper.Get("axis", -1); axis = HandleNegativeAxis(axis, rank); std::vector axes(rank - SafeInt(axis)); - std::iota(axes.begin(), axes.end(), axis); + if (model_builder.GetPreferredLayout() == DataLayout::NHWC && axis > 1) { + std::iota(axes.begin(), axes.end(), axis - 1); + } else { + std::iota(axes.begin(), axes.end(), axis); + } options.set("axes", emscripten::val::array(axes)); - output = model_builder.GetBuilder().call("meanVarianceNormalization", input, options); + output = model_builder.GetBuilder().call("layerNormalization", input, options); } else if (op_type == "InstanceNormalization") { - std::vector axes; - for (uint32_t 
i = 2; i < rank; i++) { - axes.emplace_back(i); + // WebNN spec only supports 4D input for instanceNormalization. + // Supports 3D input by prepending 1 size dimension. + // For models with dimensions greater than 4, they will be reshaped into 4D. + constexpr size_t webnn_shape_rank = 4; + if (input_shape.size() != webnn_shape_rank) { + std::vector new_shape; + new_shape.reserve(std::max(input_shape.size(), webnn_shape_rank)); + std::transform(input_shape.begin(), input_shape.end(), + std::back_inserter(new_shape), + [](int64_t dim) -> uint32_t { return SafeInt(dim); }); + + size_t insertion_offset = (model_builder.GetPreferredLayout() == DataLayout::NHWC) ? 2 : 3; + ptrdiff_t excess_rank = new_shape.size() - webnn_shape_rank; + auto insertion_point = new_shape.begin() + insertion_offset; + if (input_shape.size() < webnn_shape_rank) { + // Pad the shape with extra 1's to satisfy WebNN v1's rank requirements. + new_shape.insert(insertion_point, -excess_rank, 1); + } else { + // Fold the extra range to fit within WebNN v1's rank requirements. + uint32_t sum = std::accumulate( + insertion_point, insertion_point + excess_rank + 1, 1, std::multiplies()); + new_shape.erase(insertion_point, insertion_point + excess_rank); + *insertion_point = sum; + } + input = model_builder.GetBuilder().call("reshape", input, emscripten::val::array(new_shape)); + } + + if (model_builder.GetPreferredLayout() == DataLayout::NHWC) { + options.set("layout", emscripten::val("nhwc")); + } + output = model_builder.GetBuilder().call("instanceNormalization", input, options); + // Reshape back to the original output shape for 3D input. 
+ if (input_shape.size() != 4) { + std::vector output_shape; + std::transform(input_shape.begin(), input_shape.end(), + std::back_inserter(output_shape), + [](int64_t dim) -> uint32_t { return SafeInt(dim); }); + output = model_builder.GetBuilder().call( + "reshape", output, emscripten::val::array(output_shape)); } - options.set("axes", emscripten::val::array(axes)); - output = model_builder.GetBuilder().call("meanVarianceNormalization", input, options); - } else if (op_type == "GroupNormalization") { - ORT_RETURN_IF_NOT(helper.HasAttr("num_groups"), "GroupNormalization num_group must be provided."); - int32_t group_count = helper.Get("num_groups", -1); - std::vector orig_shape, new_shape; - std::transform(input_shape.cbegin(), input_shape.cend(), - std::back_inserter(orig_shape), - [](int64_t dim) -> uint32_t { return SafeInt(dim); }); - // Add N and Group. - ORT_RETURN_IF_NOT(rank >= 2, "Input for GroupNormalization cannot be a scalar or 1D"); - new_shape.emplace_back(SafeInt(input_shape[0])); - new_shape.emplace_back(SafeInt(group_count)); - - ORT_RETURN_IF_NOT(group_count > 0 && input_shape[1] % group_count == 0, - "GroupNormalization num_group must be divisible by group."); - new_shape.emplace_back(SafeInt(std::reduce(input_shape.begin() + 2, input_shape.end(), - input_shape[1] / group_count, std::multiplies()))); - // Input will be reshaped to (N, group count, channels per group x D1 x D2 ... Dn) and recovered after normalization. 
- options.set("axes", emscripten::val::array(std::vector{2})); - output = model_builder.GetBuilder().call("reshape", input, emscripten::val::array(new_shape)); - output = model_builder.GetBuilder().call("meanVarianceNormalization", output, options); - output = model_builder.GetBuilder().call("reshape", output, emscripten::val::array(orig_shape)); } else { return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "Unsupported normalization op: ", op_type); } @@ -214,7 +182,6 @@ void CreateNormalizationOpBuilder(const std::string& op_type, OpBuilderRegistrat constexpr static std::string_view op_types[] = { "BatchNormalization", - "GroupNormalization", "InstanceNormalization", "LayerNormalization", }; diff --git a/onnxruntime/core/providers/webnn/builders/impl/resize_op_builder.cc b/onnxruntime/core/providers/webnn/builders/impl/resize_op_builder.cc index ea9fc379ee23f..186d1e7c1035a 100644 --- a/onnxruntime/core/providers/webnn/builders/impl/resize_op_builder.cc +++ b/onnxruntime/core/providers/webnn/builders/impl/resize_op_builder.cc @@ -30,7 +30,7 @@ class ResizeOpBuilder : public BaseOpBuilder { // Operator support related. private: bool IsOpSupportedImpl(const InitializedTensorSet& initializers, const Node& node, - const WebnnDeviceType /* device_type */, const logging::Logger& logger) const override; + const WebnnDeviceType device_type, const logging::Logger& logger) const override; // Resize opset 10- is very different than Resize opset 11+, with many key attributes missing. // We only support Resize opset 11+ here. 
@@ -161,7 +161,7 @@ Status ResizeOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, bool ResizeOpBuilder::IsOpSupportedImpl(const InitializedTensorSet& initializers, const Node& node, - const WebnnDeviceType /* device_type */, + const WebnnDeviceType device_type, const logging::Logger& logger) const { const auto& input_defs = node.InputDefs(); @@ -181,9 +181,18 @@ bool ResizeOpBuilder::IsOpSupportedImpl(const InitializedTensorSet& initializers const auto mode = helper.Get("mode", "nearest"); bool is_linear_resize = mode == "linear"; bool is_nearest_resize = mode == "nearest"; - if (!is_linear_resize && !is_nearest_resize) { - LOGS(logger, VERBOSE) << "Resize unsupported input mode, " << mode; - return false; + // WebNN CPU backend only supports "linear" mode. + // WebNN GPU backend only supports "linear" and "nearest" modes. + if (device_type == WebnnDeviceType::CPU) { + if (!is_linear_resize) { + LOGS(logger, VERBOSE) << "Resize unsupported input mode, " << mode << " for CPU backend."; + return false; + } + } else { + if (!is_linear_resize && !is_nearest_resize) { + LOGS(logger, VERBOSE) << "Resize unsupported input mode, " << mode << " for GPU backend."; + return false; + } } const auto exclude_outside = helper.Get("exclude_outside", 0); diff --git a/onnxruntime/core/providers/webnn/builders/model_builder.cc b/onnxruntime/core/providers/webnn/builders/model_builder.cc index b6631263dfb93..b57e1b89b0af0 100644 --- a/onnxruntime/core/providers/webnn/builders/model_builder.cc +++ b/onnxruntime/core/providers/webnn/builders/model_builder.cc @@ -38,6 +38,25 @@ Status ModelBuilder::Initialize() { return Status::OK(); } +InitializedTensorSet ModelBuilder::GetInitializerTensors() { + if (graph_viewer_.IsSubgraph()) { + auto all_initializers = CollectAllInitializedTensors(graph_viewer_); + const auto sub_graph_id = graph_viewer_.GetFilterInfo(); + const auto subgraph_initializer_names = sub_graph_id->GetMetaDef()->constant_initializers; + InitializedTensorSet 
subgraph_initializers; + + for (const auto& name : subgraph_initializer_names) { + auto it = all_initializers.find(name); + if (it != all_initializers.end()) { + subgraph_initializers.insert(*it); + } + } + return subgraph_initializers; + } else { + return graph_viewer_.GetAllInitializedTensors(); + } +} + /* static */ const IOpBuilder* ModelBuilder::GetOpBuilder(const Node& node) { const auto& op_builders = GetOpBuilders(); const auto it = op_builders.find(node.OpType()); diff --git a/onnxruntime/core/providers/webnn/builders/model_builder.h b/onnxruntime/core/providers/webnn/builders/model_builder.h index c381eef3f42f7..16c8bf2d3c77f 100644 --- a/onnxruntime/core/providers/webnn/builders/model_builder.h +++ b/onnxruntime/core/providers/webnn/builders/model_builder.h @@ -30,7 +30,7 @@ class ModelBuilder { // Accessors for members. const GraphViewer& GetGraphViewer() const { return graph_viewer_; } - const InitializedTensorSet& GetInitializerTensors() const { return graph_viewer_.GetAllInitializedTensors(); } + InitializedTensorSet GetInitializerTensors(); const emscripten::val& GetBuilder() const { return wnn_builder_; } const emscripten::val& GetContext() const { return wnn_context_; } diff --git a/onnxruntime/core/providers/webnn/builders/op_builder_factory.cc b/onnxruntime/core/providers/webnn/builders/op_builder_factory.cc index 463317a4dafda..613771eda71fe 100644 --- a/onnxruntime/core/providers/webnn/builders/op_builder_factory.cc +++ b/onnxruntime/core/providers/webnn/builders/op_builder_factory.cc @@ -111,7 +111,6 @@ static OpBuilderRegistrations CreateOpBuilderRegistrations() { { // Normalization CreateNormalizationOpBuilder("BatchNormalization", op_registrations); - CreateNormalizationOpBuilder("GroupNormalization", op_registrations); CreateNormalizationOpBuilder("InstanceNormalization", op_registrations); CreateNormalizationOpBuilder("LayerNormalization", op_registrations); } diff --git a/onnxruntime/core/providers/webnn/webnn_execution_provider.cc 
b/onnxruntime/core/providers/webnn/webnn_execution_provider.cc index 4da54aaad3a33..cf18b3225eb47 100644 --- a/onnxruntime/core/providers/webnn/webnn_execution_provider.cc +++ b/onnxruntime/core/providers/webnn/webnn_execution_provider.cc @@ -59,10 +59,15 @@ WebNNExecutionProvider::GetCapability(const onnxruntime::GraphViewer& graph_view const IKernelLookup& /*kernel_registries*/) const { std::vector> result; - // We do not run WebNN EP on subgraph, instead we cover this in the control flow nodes. - // TODO investigate whether we want to support subgraph using WebNN EP. - if (graph_viewer.IsSubgraph()) { - return result; + // For subgraph which is the attribute of the control flow nodes, part of its initializers are stored in its + // ancestor graphs as common initializers shared for other subgraphs. We need to collect all of them used for + // identifying the required initializer names and storing into 'meta_def->constant_initializers'. + // Thus we are able to get the required initialized tensors for this subgraph via the GetInitializerTensors() + // method defined in the model_builder.h file. + InitializedTensorSet all_initializers; + const bool is_subgraph = graph_viewer.IsSubgraph(); + if (is_subgraph) { + all_initializers = webnn::CollectAllInitializedTensors(graph_viewer); } /* @@ -110,6 +115,7 @@ WebNNExecutionProvider::GetCapability(const onnxruntime::GraphViewer& graph_view std::unique_ptr sub_graph = std::make_unique(); + std::vector subgraph_initializers; InlinedHashSet node_outputs; InlinedHashSet subgraph_inputs; InlinedHashSet subgraph_outputs; @@ -126,7 +132,11 @@ WebNNExecutionProvider::GetCapability(const onnxruntime::GraphViewer& graph_view // skip the placeholder inputs. continue; } - // if the node input was not produced by this subgraph, add it to the subgraph inputs. + // If it is a subgraph of a control flow node, collect the constant initializer. 
+ if (is_subgraph && Contains(all_initializers, input->Name())) { + subgraph_initializers.push_back(input->Name()); + } + // If the node input was not produced by this subgraph, add it to the subgraph inputs. if (node_outputs.count(input) == 0) { if (subgraph_inputs.count(input) == 0) { subgraph_inputs.insert(input); @@ -165,6 +175,12 @@ WebNNExecutionProvider::GetCapability(const onnxruntime::GraphViewer& graph_view meta_def->since_version = 1; meta_def->status = ONNX_NAMESPACE::EXPERIMENTAL; + if (is_subgraph) { + for (const auto& initializer : subgraph_initializers) { + meta_def->constant_initializers.push_back(initializer); + } + } + for (const auto& input : ordered_subgraph_inputs) { meta_def->inputs.push_back(input->Name()); } diff --git a/onnxruntime/core/session/custom_ops.cc b/onnxruntime/core/session/custom_ops.cc index eea675eb0193a..984fdd6bce325 100644 --- a/onnxruntime/core/session/custom_ops.cc +++ b/onnxruntime/core/session/custom_ops.cc @@ -24,6 +24,7 @@ #include "core/session/custom_ops.h" #include "core/session/inference_session.h" #include "core/session/ort_apis.h" +#include "core/platform/threadpool.h" #if !defined(ORT_MINIMAL_BUILD) static constexpr uint32_t min_ort_version_with_optional_io_support = 8; @@ -380,6 +381,31 @@ ORT_API_STATUS_IMPL(OrtApis::KernelContext_GetResource, _In_ const OrtKernelCont API_IMPL_END }; +ORT_API_STATUS_IMPL(OrtApis::KernelContext_ParallelFor, _In_ const OrtKernelContext* context, _In_ void (*fn)(void*, size_t), _In_ size_t total, _In_ size_t num_batch, _In_ void* usr_data) { + API_IMPL_BEGIN + if (!context) { + return OrtApis::CreateStatus(ORT_RUNTIME_EXCEPTION, "Invalid context"); + } + if (fn && total) { + const auto* ctx = reinterpret_cast(context); + auto* tp = ctx->GetOperatorThreadPool(); + if (num_batch) { + onnxruntime::concurrency::ThreadPool::TryBatchParallelFor( + tp, + static_cast(total), + [fn, usr_data](std::ptrdiff_t ith) { fn(usr_data, static_cast(ith)); }, + static_cast(num_batch)); + } else { 
+ onnxruntime::concurrency::ThreadPool::TrySimpleParallelFor( + tp, + static_cast(total), + [fn, usr_data](std::ptrdiff_t ith) { fn(usr_data, static_cast(ith)); }); + } + } + return nullptr; + API_IMPL_END +}; + #ifdef _WIN32 #pragma warning(pop) #endif diff --git a/onnxruntime/core/session/onnxruntime_c_api.cc b/onnxruntime/core/session/onnxruntime_c_api.cc index 76a8a778025e1..08bfb618f55b4 100644 --- a/onnxruntime/core/session/onnxruntime_c_api.cc +++ b/onnxruntime/core/session/onnxruntime_c_api.cc @@ -2722,6 +2722,7 @@ static constexpr OrtApi ort_api_1_to_17 = { &OrtApis::SetSymbolicDimensions, &OrtApis::ReadOpAttr, &OrtApis::SetDeterministicCompute, + &OrtApis::KernelContext_ParallelFor, }; // OrtApiBase can never change as there is no way to know what version of OrtApiBase is returned by OrtGetApiBase. diff --git a/onnxruntime/core/session/ort_apis.h b/onnxruntime/core/session/ort_apis.h index c9e4074a1afe2..6df5e4145b416 100644 --- a/onnxruntime/core/session/ort_apis.h +++ b/onnxruntime/core/session/ort_apis.h @@ -502,4 +502,6 @@ ORT_API_STATUS_IMPL(SetSymbolicDimensions, _In_ OrtTensorTypeAndShapeInfo* info, ORT_API_STATUS_IMPL(ReadOpAttr, _In_ const OrtOpAttr* op_attr, _In_ OrtOpAttrType type, _Inout_ void* data, _In_ size_t len, _Out_ size_t* out); ORT_API_STATUS_IMPL(SetDeterministicCompute, _Inout_ OrtSessionOptions* options, bool value); +ORT_API_STATUS_IMPL(KernelContext_ParallelFor, _In_ const OrtKernelContext* context, _In_ void (*fn)(void*, size_t), _In_ size_t total, _In_ size_t num_batch, _In_ void* user_data); + } // namespace OrtApis diff --git a/onnxruntime/python/tools/quantization/matmul_4bits_quantizer.py b/onnxruntime/python/tools/quantization/matmul_4bits_quantizer.py index 9f90196e301e5..3e9f9a6544a71 100644 --- a/onnxruntime/python/tools/quantization/matmul_4bits_quantizer.py +++ b/onnxruntime/python/tools/quantization/matmul_4bits_quantizer.py @@ -4,18 +4,23 @@ # license information. 
# -------------------------------------------------------------------------- +from __future__ import annotations + import argparse +import copy +import importlib import logging import os -from typing import List, Tuple import numpy as np import numpy.typing as npt import onnx from onnx.onnx_pb import GraphProto, ModelProto, NodeProto, TensorProto +from packaging import version from onnxruntime.capi._pybind_state import quantize_matmul_4bits +from .calibrate import CalibrationDataReader from .onnx_model import ONNXModel from .quant_utils import attribute_to_kwarg @@ -23,19 +28,101 @@ logger = logging.getLogger(__name__) +class WeightOnlyQuantConfig: + def __init__(self, algorithm): + """This is the Base class for Weight Only Quant Configuration. + + Args: + algorithm: + weight only quantize algorithm name. + """ + self.algorithm = algorithm + + +class RTNWeightOnlyQuantConfig(WeightOnlyQuantConfig): + def __init__( + self, + ratios=None, + ): + """ + This is a class for round-to-nearest (RTN) algorithm Weight Only Quant Configuration. + RTN is the most straightforward way to quantize weight using scale maps. + + Args: + ratios: + percentile of clip. Defaults to {}. + """ + if ratios is None: + ratios = {} + super().__init__( + algorithm="RTN", + ) + self.ratios = ratios + + +class GPTQWeightOnlyQuantConfig(WeightOnlyQuantConfig): + def __init__( + self, + calibration_data_reader: CalibrationDataReader, + percdamp=0.01, + blocksize=128, + actorder=False, + mse=False, + perchannel=True, + ): + """ + This is a class for GPTQ algorithm Weight Only Quant Configuration. + GPTQ algorithm provides more accurate quantization but requires more computational resources. + + Args: + calibration_data_reader: + a calibration data reader. It enumerates calibration data and generates inputs for the original model. + percdamp: + percent of the average Hessian diagonal to use for dampening. 
+ blocksize (int, optional): + channel number in one block to execute a GPTQ quantization iteration. + actorder (bool, optional): + whether rearrange Hessian matrix considering the diag's value. + mse (bool, optional): + whether get scale and zero point with mse error. + perchannel (bool, optional): + whether quantize weight per-channel. + """ + super().__init__( + algorithm="GPTQ", + ) + self.calibration_data_reader = calibration_data_reader + self.percdamp = percdamp + self.blocksize = blocksize + self.actorder = actorder + self.mse = mse + self.perchannel = perchannel + + class MatMul4BitsQuantizer: """Perform 4b quantization of constant MatMul weights""" - def __init__(self, model: ModelProto, block_size: int, is_symmetric: bool, nodes_to_exclude=None): + def __init__( + self, + model: ModelProto | str, + block_size: int, + is_symmetric: bool, + accuracy_level: int | None = None, + nodes_to_exclude=None, + algo_config: WeightOnlyQuantConfig = None, + ): if nodes_to_exclude is None: nodes_to_exclude = [] - self.model = ONNXModel(model) + self.model = ONNXModel(onnx.load(model)) if isinstance(model, str) else ONNXModel(model) + self.model_path = model if isinstance(model, str) else None self.block_size = block_size self.is_symmetric = is_symmetric + self.accuracy_level = accuracy_level self.nodes_to_exclude = set(nodes_to_exclude) + self.algo_config = algo_config @staticmethod - def __get_initializer(name, graph_path: List[GraphProto]) -> Tuple[TensorProto, GraphProto]: + def __get_initializer(name, graph_path: list[GraphProto]) -> tuple[TensorProto, GraphProto]: for gid in range(len(graph_path) - 1, -1, -1): graph = graph_path[gid] for tensor in graph.initializer: @@ -66,7 +153,7 @@ def int4_block_quant(self, fp32weight: npt.ArrayLike) -> np.ndarray: return (packed, scales, zero_point) - def _q4_matmul_node_weight(self, node: NodeProto, graph_stack: List[GraphProto]) -> NodeProto: + def _q4_matmul_node_weight(self, node: NodeProto, graph_stack: list[GraphProto]) 
-> NodeProto: """If the node is MatMul with fp32 const weight, quantize the weight with int4, and return the new node""" if node.op_type != "MatMul": @@ -113,6 +200,8 @@ def _q4_matmul_node_weight(self, node: NodeProto, graph_stack: List[GraphProto]) kwargs["N"] = cols kwargs["bits"] = 4 kwargs["block_size"] = self.block_size + if self.accuracy_level is not None: + kwargs["accuracy_level"] = self.accuracy_level matmul_q4_node = onnx.helper.make_node( "MatMulNBits", @@ -127,7 +216,7 @@ def _q4_matmul_node_weight(self, node: NodeProto, graph_stack: List[GraphProto]) return matmul_q4_node - def _process_subgraph(self, graph_stack: List[GraphProto]): + def _process_subgraph(self, graph_stack: list[GraphProto]): new_nodes = [] graph = graph_stack[-1] @@ -165,20 +254,99 @@ def _process_subgraph(self, graph_stack: List[GraphProto]): graph_stack.pop() return graph - def process(self): - # use a stack to keep track of sub-graphs - graph_stack = [self.model.graph()] - opset_import = self.model.opset_import() + def _generate_q4_node_config(self): + """Generate weight only quant configuration for nodes.""" + q4_node_config = {} + template_config_q4 = { + "bits": 4, + "group_size": self.block_size, + "scheme": "sym" if self.is_symmetric else "asym", + } + for node in self.model.model.graph.node: + if node.op_type in ["MatMul"]: + if not all([self.model.get_initializer(i) is None for i in node.input]): + q4_node_config[node.name] = template_config_q4 + return q4_node_config + + def int4_quant_algo(self): + """4b quantize a model with RTN or GPTQ algorithm. Please refer to + https://github.com/intel/neural-compressor/blob/master/docs/source/quantization_weight_only.md + for more details on weight only quantization using Intel® Neural Compressor. 
+ """ + + def inc_dataloader(): + data_reader = copy.deepcopy(self.algo_config.calibration_data_reader) + for data in data_reader: + yield data, None - has_ms_domain = False - for opset in opset_import: - if opset.domain == "com.microsoft": - has_ms_domain = True - if not has_ms_domain: - opset_import.extend([onnx.helper.make_opsetid("com.microsoft", 1)]) + kwargs = {} + if self.accuracy_level is not None: + kwargs["accuracy_level"] = self.accuracy_level + weight_only_node_config = self._generate_q4_node_config() + + algorithm = self.algo_config.algorithm + logger.info(f"start to quantize model with {algorithm} algorithm...") + if algorithm == "RTN": + from neural_compressor.adaptor.ox_utils.weight_only import rtn_quantize + + kwargs["ratios"] = self.algo_config.ratios + + self.model = rtn_quantize( + model=self.model_path if self.model_path is not None else self.model.model, + weight_config=weight_only_node_config, + **kwargs, + ) + elif algorithm == "GPTQ": + from neural_compressor.adaptor.ox_utils.weight_only import gptq_quantize + + kwargs["percdamp"] = self.algo_config.percdamp + kwargs["blocksize"] = self.algo_config.blocksize + kwargs["actorder"] = self.algo_config.actorder + kwargs["mse"] = self.algo_config.mse + kwargs["perchannel"] = self.algo_config.perchannel + kwargs["n_samples"] = -1 + dataloader = inc_dataloader() + + self.model = gptq_quantize( + model=self.model_path if self.model_path is not None else self.model.model, + weight_config=weight_only_node_config, + dataloader=dataloader, + **kwargs, + ) + logger.info(f"complete quantization of model with {algorithm} algorithm.") - self._process_subgraph(graph_stack) - self.model.clean_initializers() + def process(self): + if self.algo_config is None: + # use a stack to keep track of sub-graphs + graph_stack = [self.model.graph()] + opset_import = self.model.opset_import() + + has_ms_domain = False + for opset in opset_import: + if opset.domain == "com.microsoft": + has_ms_domain = True + if not 
has_ms_domain: + opset_import.extend([onnx.helper.make_opsetid("com.microsoft", 1)]) + + self._process_subgraph(graph_stack) + self.model.clean_initializers() + else: + # use Intel® Neural Compressor for RTN or GPTQ weight-only quantize algorithm + try: + importlib.import_module("neural_compressor") + except Exception as e: + logging.error(f"{e}.") + raise RuntimeError( + "neural-compressor is not correctly installed. Please check your environment." + ) from e + + import neural_compressor + + assert version.parse(neural_compressor.__version__) >= version.parse( + "2.3.2" + ), "Require neural-compressor >= 2.3.2 to support weight only quantization!" + + self.int4_quant_algo() def parse_args(): @@ -201,6 +369,14 @@ def parse_args(): type=bool, help="Indicate whether to quantize the model symmetrically", ) + parser.add_argument( + "--accuracy_level", + required=False, + type=int, + help="Accuracy level of the 4-bit quantized MatMul computation. " + "Refer to the MatMulNBits contrib op's 'accuracy_level' attribute for details " + "(https://github.com/microsoft/onnxruntime/blob/main/docs/ContribOperators.md#commicrosoftmatmulnbits).", + ) parser.add_argument("-v", "--verbose", required=False, action="store_true") parser.set_defaults(verbose=False) parser.add_argument( @@ -228,6 +404,12 @@ def parse_args(): raise Exception(f"file {output_model_path} already exists") model = onnx.load(input_model_path) - quant = MatMul4BitsQuantizer(model, args.block_size, args.symmetric, nodes_to_exclude=args.nodes_to_exclude) + quant = MatMul4BitsQuantizer( + model=model, + block_size=args.block_size, + is_symmetric=args.symmetric, + accuracy_level=args.accuracy_level, + nodes_to_exclude=args.nodes_to_exclude, + ) quant.process() quant.model.save_model_to_file(output_model_path, True) diff --git a/onnxruntime/python/tools/quantization/quantize.py b/onnxruntime/python/tools/quantization/quantize.py index aed46563c2764..1bd2ef42151d0 100644 --- 
a/onnxruntime/python/tools/quantization/quantize.py +++ b/onnxruntime/python/tools/quantization/quantize.py @@ -466,7 +466,6 @@ def quantize_static( import copy - import onnx from neural_compressor.adaptor.ox_utils.smooth_quant import ORTSmoothQuant def inc_dataloader(): @@ -478,13 +477,11 @@ def inc_dataloader(): dataloader = inc_dataloader() sq = ORTSmoothQuant(model_input, dataloader, reduce_range) del dataloader - model = sq.transform( - extra_options.get("SmoothQuantAlpha", 0.5), extra_options.get("SmoothQuantFolding", True) - ).model - nodes_to_exclude.extend([i.name for i in model.graph.node if i.name not in orig_nodes]) + model = sq.transform(extra_options.get("SmoothQuantAlpha", 0.5), extra_options.get("SmoothQuantFolding", True)) sq_path = tempfile.TemporaryDirectory(prefix="ort.quant.") - model_input = Path(sq_path.name).joinpath("sq_model.onnx").as_posix() - onnx.save_model(model, model_input, save_as_external_data=True) + model_input = Path(sq_path).joinpath("sq_model.onnx").as_posix() + model.save(model_input) + nodes_to_exclude.extend([i.name for i in model.model.graph.node if i.name not in orig_nodes]) model = load_model_with_shape_infer(Path(model_input)) # use smooth quant model for calibration with tempfile.TemporaryDirectory(prefix="ort.quant.") as quant_tmp_dir: diff --git a/onnxruntime/python/tools/transformers/models/llama/convert_to_onnx.py b/onnxruntime/python/tools/transformers/models/llama/convert_to_onnx.py index e694b5050cc8c..bc09b52574a27 100644 --- a/onnxruntime/python/tools/transformers/models/llama/convert_to_onnx.py +++ b/onnxruntime/python/tools/transformers/models/llama/convert_to_onnx.py @@ -1,9 +1,10 @@ +from __future__ import annotations + import argparse import logging import os import shutil from itertools import chain -from typing import List import onnx import torch @@ -21,11 +22,12 @@ from onnxruntime import quantization as ort_quantization from onnxruntime.quantization.matmul_4bits_quantizer import MatMul4BitsQuantizer 
+torch_export_onnx_opset_version = 14 logger = logging.getLogger("") init_dist() -def get_model_dynamic_axes(input_names: List[str], output_names: List[str]): +def get_model_dynamic_axes(input_names: list[str], output_names: list[str]): dynamic_axes = {} for name in input_names + output_names: if name in input_names: @@ -42,7 +44,7 @@ def get_model_dynamic_axes(input_names: List[str], output_names: List[str]): return dynamic_axes -def get_model_with_past_kv_dynamic_axes(input_names: List[str], output_names: List[str]): +def get_model_with_past_kv_dynamic_axes(input_names: list[str], output_names: list[str]): dynamic_axes = {} for name in input_names + output_names: if name in {"input_ids", "position_ids"}: @@ -65,7 +67,7 @@ def get_model_with_past_kv_dynamic_axes(input_names: List[str], output_names: Li return dynamic_axes -def get_merged_model_dynamic_axes(input_names: List[str], output_names: List[str]): +def get_merged_model_dynamic_axes(input_names: list[str], output_names: list[str]): dynamic_axes = {} for name in input_names + output_names: if name in {"input_ids", "position_ids"}: @@ -229,7 +231,7 @@ def run_torchscript_separate_export( input_names=input_names, output_names=output_names, dynamic_axes=dynamic_axes, - opset_version=13, + opset_version=torch_export_onnx_opset_version, do_constant_folding=True, verbose=args.verbose, ) @@ -288,7 +290,7 @@ def run_torchscript_separate_export( input_names=input_names, output_names=output_names, dynamic_axes=dynamic_axes, - opset_version=13, + opset_version=torch_export_onnx_opset_version, do_constant_folding=True, verbose=args.verbose, ) @@ -368,7 +370,7 @@ def run_torchscript_merged_export( input_names=input_names, output_names=output_names, dynamic_axes=dynamic_axes, - opset_version=13, + opset_version=torch_export_onnx_opset_version, do_constant_folding=True, verbose=args.verbose, ) @@ -412,7 +414,7 @@ def optimize_export(config: AutoConfig, input_path: str, output_path: str, remov def convert_to_float16( - 
args: argparse.Namespace, config: AutoConfig, old_paths: List[str], rank: int = 0, world_size: int = 1 + args: argparse.Namespace, config: AutoConfig, old_paths: list[str], rank: int = 0, world_size: int = 1 ): decoder_model_fp16_path = os.path.join(args.output, f"rank_{rank}_{args.model_name}_decoder_model_fp16.onnx") decoder_with_past_model_fp16_path = os.path.join( @@ -635,7 +637,7 @@ def get_args(): help="Run a specific quantization algorithm (blockwise for int4, smooth_quant for int8, quantize_dynamic for int8). Blockwise is recommended. Need to install extra packages in `requirements-quant.txt` for SmoothQuant.", ) - blockwise_group = parser.add_argument_group("4-bit quantization") + blockwise_group = parser.add_argument_group("blockwise (4-bit quantization)") blockwise_group.add_argument( "--block_size", @@ -645,6 +647,15 @@ def get_args(): help="Block size to quantize with. See https://github.com/microsoft/onnxruntime/blob/main/onnxruntime/python/tools/quantization/matmul_4bits_quantizer.py for details.", ) + blockwise_group.add_argument( + "--int4_accuracy_level", + required=False, + type=int, + help="Accuracy level of the 4-bit quantized MatMul computation. 
" + "Refer to the MatMulNBits contrib op's 'accuracy_level' attribute for details " + "(https://github.com/microsoft/onnxruntime/blob/main/docs/ContribOperators.md#commicrosoftmatmulnbits).", + ) + smooth_quant_group = parser.add_argument_group("smooth_quant (8-bit quantization)") smooth_quant_group.add_argument( @@ -937,7 +948,13 @@ def main(): for fp_path, int4_path in zip(old_paths, new_paths): if os.path.exists(fp_path): model = onnx.load_model(fp_path, load_external_data=True) - quant = MatMul4BitsQuantizer(model, args.block_size, is_symmetric=True, nodes_to_exclude=[]) + quant = MatMul4BitsQuantizer( + model=model, + block_size=args.block_size, + is_symmetric=True, + accuracy_level=args.int4_accuracy_level, + nodes_to_exclude=[], + ) quant.process() quant.model.save_model_to_file(int4_path, use_external_data_format=True) del model diff --git a/onnxruntime/python/tools/transformers/models/llama/llama_inputs.py b/onnxruntime/python/tools/transformers/models/llama/llama_inputs.py index bae1ae82e8f7e..a329b73259dda 100644 --- a/onnxruntime/python/tools/transformers/models/llama/llama_inputs.py +++ b/onnxruntime/python/tools/transformers/models/llama/llama_inputs.py @@ -1,4 +1,4 @@ -from typing import List, Tuple +from __future__ import annotations import numpy as np import torch @@ -235,7 +235,7 @@ def get_past_kv_inputs(config: AutoConfig, batch_size: int, past_seq_len: int, u # Convert list of past_key_values to dict of past_key and past_value -def flatten_past_kv_inputs(past_key_values: List[Tuple[torch.Tensor, torch.Tensor]]): +def flatten_past_kv_inputs(past_key_values: list[tuple[torch.Tensor, torch.Tensor]]): past_kv = {} for i, (past_k, past_v) in enumerate(past_key_values): past_kv[f"past_key_values.{i}.key"] = past_k.detach().cpu().numpy() diff --git a/onnxruntime/python/tools/transformers/models/llama/llama_parity.py b/onnxruntime/python/tools/transformers/models/llama/llama_parity.py index 418a65325c8f0..25d7519769604 100644 --- 
a/onnxruntime/python/tools/transformers/models/llama/llama_parity.py +++ b/onnxruntime/python/tools/transformers/models/llama/llama_parity.py @@ -1,8 +1,9 @@ +from __future__ import annotations + import argparse import logging import os import time -from typing import List import numpy as np import torch @@ -139,7 +140,7 @@ def verify_parity( return kv_cache_ortvalues -def get_args(argv: List[str]): +def get_args(argv: list[str]): parser = argparse.ArgumentParser() parser.add_argument( @@ -232,7 +233,7 @@ def get_args(argv: List[str]): return args -def main(argv: List[str] = []): # noqa: B006 +def main(argv: list[str] = []): # noqa: B006 args = get_args(argv) setup_logger(args.verbose) logger.info(f"Arguments: {args}") diff --git a/onnxruntime/test/contrib_ops/decoder_masked_multihead_attention_op_test.cc b/onnxruntime/test/contrib_ops/decoder_masked_multihead_attention_op_test.cc index d9c870a7dc52a..6afb61bd1f0a1 100644 --- a/onnxruntime/test/contrib_ops/decoder_masked_multihead_attention_op_test.cc +++ b/onnxruntime/test/contrib_ops/decoder_masked_multihead_attention_op_test.cc @@ -738,10 +738,23 @@ TEST(DecoderMaskedSelfAttentionTest, Test_fp32) { tester.AddOutput("present", past_dims, present); - // Run - std::vector> execution_providers; - execution_providers.push_back(DefaultCudaExecutionProvider()); - tester.Run(OpTester::ExpectResult::kExpectSuccess, "", {}, nullptr, &execution_providers); + // Run - Regular kernel execution path + { + std::vector> execution_providers; + execution_providers.push_back(DefaultCudaExecutionProvider()); + tester.Run(OpTester::ExpectResult::kExpectSuccess, "", {}, nullptr, &execution_providers); + } + + // Test alternate kernel path of loading more KV data "in flight" + { + ScopedEnvironmentVariables scoped_env_vars{ + EnvVarMap{{onnxruntime::contrib::attention::kDecoderMaskedAttentionLoadKVDataInFlight, "1"}}}; + + std::vector> execution_providers; + execution_providers.push_back(DefaultCudaExecutionProvider()); + + 
tester.Run(OpTester::ExpectResult::kExpectSuccess, "", {}, nullptr, &execution_providers); + } } } } @@ -852,10 +865,22 @@ TEST(DecoderMaskedSelfAttentionTest, Test_fp16) { tester.AddOutput("present", past_dims, present); - // Run - std::vector> execution_providers; - execution_providers.push_back(DefaultCudaExecutionProvider()); - tester.Run(OpTester::ExpectResult::kExpectSuccess, "", {}, nullptr, &execution_providers); + // Run - Regular kernel execution path + { + std::vector> execution_providers; + execution_providers.push_back(DefaultCudaExecutionProvider()); + tester.Run(OpTester::ExpectResult::kExpectSuccess, "", {}, nullptr, &execution_providers); + } + + // Test alternate kernel path of loading more KV data "in flight" + { + ScopedEnvironmentVariables scoped_env_vars{ + EnvVarMap{{onnxruntime::contrib::attention::kDecoderMaskedAttentionLoadKVDataInFlight, "1"}}}; + + std::vector> execution_providers; + execution_providers.push_back(DefaultCudaExecutionProvider()); + tester.Run(OpTester::ExpectResult::kExpectSuccess, "", {}, nullptr, &execution_providers); + } } } } diff --git a/onnxruntime/test/framework/inference_session_test.cc b/onnxruntime/test/framework/inference_session_test.cc index 486ec37d1eebd..2522ee3b496f6 100644 --- a/onnxruntime/test/framework/inference_session_test.cc +++ b/onnxruntime/test/framework/inference_session_test.cc @@ -578,6 +578,9 @@ TEST(InferenceSessionTests, ModelMetadata) { } #endif TEST(InferenceSessionTests, CheckRunLogger) { + if constexpr (!SessionOptions::DEFAULT_USE_PER_SESSION_THREADS) { + GTEST_SKIP() << "Skipping the test"; + } SessionOptions so; so.session_logid = "CheckRunLogger"; @@ -837,6 +840,9 @@ TEST(InferenceSessionTests, PreAllocateOutputVector) { } TEST(InferenceSessionTests, ConfigureVerbosityLevel) { + if constexpr (!SessionOptions::DEFAULT_USE_PER_SESSION_THREADS) { + GTEST_SKIP() << "Skipping the test"; + } SessionOptions so; so.session_logid = "ConfigureVerbosityLevel"; @@ -2661,6 +2667,9 @@ class 
InferenceSessionTestSharingAllocator : public InferenceSessionWrapper { // Ensure sessions use the same allocator. It uses ORT created allocator. TEST(InferenceSessionTests, AllocatorSharing_EnsureSessionsUseSameOrtCreatedAllocator) { + if constexpr (!SessionOptions::DEFAULT_USE_PER_SESSION_THREADS) { + GTEST_SKIP() << "Skipping the test"; + } auto logging_manager = std::make_unique( std::unique_ptr(new CLogSink()), logging::Severity::kVERBOSE, false, LoggingManager::InstanceType::Temporal); @@ -2706,6 +2715,9 @@ TEST(InferenceSessionTests, AllocatorSharing_EnsureSessionsUseSameOrtCreatedAllo // Ensure sessions don't use the same allocator. It uses ORT created allocator. TEST(InferenceSessionTests, AllocatorSharing_EnsureSessionsDontUseSameOrtCreatedAllocator) { + if constexpr (!SessionOptions::DEFAULT_USE_PER_SESSION_THREADS) { + GTEST_SKIP() << "Skipping the test"; + } auto logging_manager = std::make_unique( std::unique_ptr(new CLogSink()), logging::Severity::kVERBOSE, false, LoggingManager::InstanceType::Temporal); @@ -2758,6 +2770,9 @@ class InferenceSessionTestSharingInitializer : public InferenceSessionWrapper { }; TEST(InferenceSessionTests, InitializerSharing_EnsureSessionsUseUserAddedInitializer) { + if constexpr (!SessionOptions::DEFAULT_USE_PER_SESSION_THREADS) { + GTEST_SKIP() << "Skipping the test"; + } auto logging_manager = std::make_unique( std::unique_ptr(new CLogSink()), logging::Severity::kVERBOSE, false, LoggingManager::InstanceType::Temporal); @@ -2942,6 +2957,9 @@ TEST(InferenceSessionTests, GlobalThreadPoolWithDenormalAsZero) { // test inter thread pool with setting denormal as zero TEST(InferenceSessionTests, InterThreadPoolWithDenormalAsZero) { + if constexpr (!SessionOptions::DEFAULT_USE_PER_SESSION_THREADS) { + GTEST_SKIP() << "Skipping the test"; + } // test if denormal-as-zero mode is supported if (!SetDenormalAsZero(false)) { return; diff --git a/onnxruntime/test/optimizer/ensure_unique_dq_for_node_unit_test.cc 
b/onnxruntime/test/optimizer/ensure_unique_dq_for_node_unit_test.cc index 7a67747f7cf4c..89ffb8ec87dcb 100644 --- a/onnxruntime/test/optimizer/ensure_unique_dq_for_node_unit_test.cc +++ b/onnxruntime/test/optimizer/ensure_unique_dq_for_node_unit_test.cc @@ -234,4 +234,44 @@ TEST(EnsureUniqueDQForNodeUnitTests, QDQWithMultiConsumerDQNodes) { EXPECT_EQ(OpCount(op_count_before, "DequantizeLinear") + 4, OpCount(op_count_after, "DequantizeLinear")); } +TEST(EnsureUniqueDQForNodeUnitTests, QDQWithMultiConsumerDQNodesPreservingAttributes) { + constexpr auto model_uri = ORT_TSTR("testdata/qdq_with_multi_consumer_q_dq_axis.onnx"); + + SessionOptions session_options{}; + // test interaction with level 1 transformers + session_options.graph_optimization_level = TransformerLevel::Level1; + + InferenceSessionWrapper session{session_options, GetEnvironment()}; + + ASSERT_STATUS_OK(session.Load(model_uri)); + + const auto op_count_before = CountOpsInGraph(session.GetGraph()); + + ASSERT_STATUS_OK(session.Initialize()); + + const auto op_count_after = CountOpsInGraph(session.GetGraph()); + + EXPECT_EQ(OpCount(op_count_before, "DequantizeLinear") + 8, OpCount(op_count_after, "DequantizeLinear")); + + int64_t given_axis = 0; // all the following 4 DQ nodes and their duplicated one should have axis = 0 + std::string axis_dq_name0 = "Convolution28_Output_0/fusedmuladd_B/DequantizeLinear"; + std::string axis_dq_name1 = "Parameter5/DequantizeLinear"; + std::string axis_dq_name2 = "Convolution110_Output_0/fusedmuladd_B/DequantizeLinear"; + std::string axis_dq_name3 = "Parameter87/DequantizeLinear"; + for (const auto& node : session.GetGraph().Nodes()) { + if (node.OpType() == "DequantizeLinear") { + if (node.Name().find(axis_dq_name0) == 0 || + node.Name().find(axis_dq_name1) == 0 || + node.Name().find(axis_dq_name2) == 0 || + node.Name().find(axis_dq_name3) == 0) { + const auto& attrs = node.GetAttributes(); + ASSERT_TRUE(attrs.find("axis") != attrs.end()); + const auto& axis_attr = 
attrs.at("axis"); + int64_t axis = axis_attr.i(); + EXPECT_EQ(axis, given_axis); + } + } + } +} + } // namespace onnxruntime::test diff --git a/onnxruntime/test/perftest/ort_test_session.cc b/onnxruntime/test/perftest/ort_test_session.cc index ac25c98b15758..13082fe69cf48 100644 --- a/onnxruntime/test/perftest/ort_test_session.cc +++ b/onnxruntime/test/perftest/ort_test_session.cc @@ -170,6 +170,8 @@ OnnxRuntimeTestSession::OnnxRuntimeTestSession(Ort::Env& env, std::random_device const auto& api = Ort::GetApi(); OrtTensorRTProviderOptionsV2* tensorrt_options; Ort::ThrowOnError(api.CreateTensorRTProviderOptions(&tensorrt_options)); + std::unique_ptr rel_trt_options( + tensorrt_options, api.ReleaseTensorRTProviderOptions); std::vector option_keys, option_values; // used to keep all option keys and value strings alive std::list buffer; diff --git a/onnxruntime/test/providers/cpu/activation/activation_op_test.cc b/onnxruntime/test/providers/cpu/activation/activation_op_test.cc index 7ec9e0f345187..ddb0a6620619c 100644 --- a/onnxruntime/test/providers/cpu/activation/activation_op_test.cc +++ b/onnxruntime/test/providers/cpu/activation/activation_op_test.cc @@ -588,6 +588,9 @@ TEST_F(ActivationOpTest, Softplus) { } TEST_F(ActivationOpNoInfTest, Softsign) { + if constexpr (!SessionOptions::DEFAULT_USE_PER_SESSION_THREADS) { + GTEST_SKIP() << "Skipping the test"; + } // TODO: Unskip when fixed #41968513 if (DefaultDmlExecutionProvider().get() != nullptr) { GTEST_SKIP() << "Skipping because of the following error: The difference between expected[i] and output[i] is 1, which exceeds threshold"; diff --git a/onnxruntime/test/providers/cpu/text/string_concat_test.cc b/onnxruntime/test/providers/cpu/text/string_concat_test.cc new file mode 100644 index 0000000000000..2bfa3dc5615e1 --- /dev/null +++ b/onnxruntime/test/providers/cpu/text/string_concat_test.cc @@ -0,0 +1,76 @@ +#include "gtest/gtest.h" +#include "test/providers/provider_test_utils.h" + +namespace onnxruntime { 
+namespace test { + +static void RunTest(const std::vector& dims, const std::vector& input1, + const std::vector& input2, const std::vector& output) { + OpTester test("StringConcat", 20, onnxruntime::kOnnxDomain); + test.AddInput("X", dims, input1); + test.AddInput("Y", dims, input2); + test.AddOutput("Z", dims, output); + test.Run(); +} + +TEST(StringConcat, BasicConcatenation) { + RunTest({1, 2}, {"Hello", "World"}, {"Hello", "World"}, {"HelloHello", "WorldWorld"}); +} + +TEST(StringConcat, TwoDimensionalConcatenation) { + RunTest({2, 2}, {"Hello", "World", "ONNX", "onnxruntime"}, {"Hello", "World", "ONNX", "onnxruntime"}, + {"HelloHello", "WorldWorld", "ONNXONNX", "onnxruntimeonnxruntime"}); +} + +TEST(StringConcat, LeftToRightBroadcastingConcatenation) { + OpTester test("StringConcat", 20, onnxruntime::kOnnxDomain); + test.AddInput("X", {2, 2}, {"Hello", "World", "ONNX", "onnxruntime"}); + test.AddInput("Y", {1}, {"!"}); + test.AddOutput("Z", {2, 2}, {"Hello!", "World!", "ONNX!", "onnxruntime!"}); + test.Run(); +} + +TEST(StringConcat, RightToLeftBroadcastingConcatenation) { + OpTester test("StringConcat", 20, onnxruntime::kOnnxDomain); + test.AddInput("X", {1}, {"!"}); + test.AddInput("Y", {2, 2}, {"Hello", "World", "ONNX", "onnxruntime"}); + test.AddOutput("Z", {2, 2}, {"!Hello", "!World", "!ONNX", "!onnxruntime"}); + test.Run(); +} + +TEST(StringConcat, BidirectionalBroadcastingConcatenation) { + OpTester test("StringConcat", 20, onnxruntime::kOnnxDomain); + test.AddInput("X", {2, 1, 3}, {"a", "b", "c", "d", "e", "f"}); + test.AddInput("Y", {1, 4, 3}, {"a", "b", "c", "d", "e", "f", "g", "h", "i", "k", "l", "m"}); + test.AddOutput("Z", {2, 4, 3}, + { + "aa", + "bb", + "cc", + "ad", + "be", + "cf", + "ag", + "bh", + "ci", + "ak", + "bl", + "cm", + "da", + "eb", + "fc", + "dd", + "ee", + "ff", + "dg", + "eh", + "fi", + "dk", + "el", + "fm", + }); + test.Run(); +} + +} // namespace test +} // namespace onnxruntime diff --git 
a/onnxruntime/test/python/quantization/test_op_matmul_4bits.py b/onnxruntime/test/python/quantization/test_op_matmul_4bits.py index 02f51cc4fa809..73dae08af8ece 100644 --- a/onnxruntime/test/python/quantization/test_op_matmul_4bits.py +++ b/onnxruntime/test/python/quantization/test_op_matmul_4bits.py @@ -71,13 +71,16 @@ def construct_model_matmul(self, output_model_path: str, symmetric: bool) -> Non output_name = "output" initializers = [] - def make_matmul(input_name, weight_shape: Union[int, Tuple[int, ...]], weight_name: str, output_name: str): + def make_matmul( + input_name, weight_shape: Union[int, Tuple[int, ...]], weight_name: str, output_name: str, node_name: str + ): weight_data = self.fill_int4_data(weight_shape, symmetric).astype(np.float32) initializers.append(onnx.numpy_helper.from_array(weight_data, name=weight_name)) return onnx.helper.make_node( "MatMul", [input_name, weight_name], [output_name], + node_name, ) in_features = 52 @@ -88,6 +91,7 @@ def make_matmul(input_name, weight_shape: Union[int, Tuple[int, ...]], weight_na [in_features, out_features], "linear1.weight", output_name, + "MatMul_0", ) # make graph @@ -139,6 +143,48 @@ def quant_test( else: raise exception + def quant_test_with_algo( + self, + algorithm: str, + model_fp32_path: str, + data_reader: TestDataFeeds, + block_size: int, + is_symmetric: bool, + ): + model_int4_path = str( + Path(self._tmp_model_dir.name).joinpath(f"MatMulNBits_{block_size}_{is_symmetric}.onnx").absolute() + ) + + # Quantize fp32 model to int4 model + from onnxruntime.quantization import matmul_4bits_quantizer + + algo_config = None + if algorithm == "RTN": + # test RTN algorithm + algo_config = matmul_4bits_quantizer.RTNWeightOnlyQuantConfig() + elif algorithm == "GPTQ": + # test GPTQ algorithm + algo_config = matmul_4bits_quantizer.GPTQWeightOnlyQuantConfig(calibration_data_reader=data_reader) + + model = quant_utils.load_model_with_shape_infer(Path(model_fp32_path)) + quant = 
matmul_4bits_quantizer.MatMul4BitsQuantizer(model, block_size, is_symmetric, algo_config=algo_config) + quant.process() + quant.model.save_model_to_file(model_int4_path, False) + + quant_nodes = {"MatMulNBits": 1} + check_op_type_count(self, model_int4_path, **quant_nodes) + + data_reader.rewind() + + try: + check_model_correctness(self, model_fp32_path, model_int4_path, data_reader.get_next()) + except Exception as exception: + if "4b quantization not yet supported on this hardware platform!" in exception.args[0]: + # Currently we don't have int4 quantization support on all platforms, has to tolerate this exception + pass + else: + raise exception + @unittest.skipIf( find_spec("onnxruntime.training"), "Skip because training package doesn't has quantize_matmul_4bits" ) @@ -159,6 +205,28 @@ def test_quantize_matmul_int4_offsets(self): data_reader = self.input_feeds(1, {"input": [100, 52]}) self.quant_test(model_fp32_path, data_reader, 32, False) + @unittest.skipIf( + find_spec("onnxruntime.training"), "Skip because training package doesn't has quantize_matmul_4bits" + ) + def test_quantize_matmul_int4_using_rtn_algo(self): + if not find_spec("neural_compressor"): + self.skipTest("skip test_smooth_quant since neural_compressor is not installed") + model_fp32_path = str(Path(self._tmp_model_dir.name).joinpath("matmul_fp32_offset.onnx").absolute()) + self.construct_model_matmul(model_fp32_path, symmetric=False) + data_reader = self.input_feeds(1, {"input": [100, 52]}) + self.quant_test_with_algo("RTN", model_fp32_path, data_reader, 32, False) + + @unittest.skipIf( + find_spec("onnxruntime.training"), "Skip because training package doesn't has quantize_matmul_4bits" + ) + def test_quantize_matmul_int4_using_gptq_algo(self): + if not find_spec("neural_compressor"): + self.skipTest("skip test_smooth_quant since neural_compressor is not installed") + model_fp32_path = str(Path(self._tmp_model_dir.name).joinpath("matmul_fp32_offset.onnx").absolute()) + 
self.construct_model_matmul(model_fp32_path, symmetric=False) + data_reader = self.input_feeds(1, {"input": [100, 52]}) + self.quant_test_with_algo("GPTQ", model_fp32_path, data_reader, 32, False) + if __name__ == "__main__": unittest.main() diff --git a/onnxruntime/test/testdata/custom_op_library/cpu/cpu_ops.cc b/onnxruntime/test/testdata/custom_op_library/cpu/cpu_ops.cc index 85edfa0e59f1d..ebef441350d4c 100644 --- a/onnxruntime/test/testdata/custom_op_library/cpu/cpu_ops.cc +++ b/onnxruntime/test/testdata/custom_op_library/cpu/cpu_ops.cc @@ -49,16 +49,45 @@ struct KernelOne { } }; +struct DataI { + const float* from = {}; + float* to = {}; +}; + +struct DataII { + const float* from = {}; + int32_t* to = {}; +}; + +// floats to floats +void CopyI(void* raw_data, size_t ith) { + auto data = reinterpret_cast(raw_data); + data->to[ith] = data->from[ith]; +} + +// floats to int32_t +void CopyII(void* raw_data, size_t ith) { + auto data = reinterpret_cast(raw_data); + data->to[ith] = static_cast(round(data->from[ith])); +} + // lite custom op as a function -void KernelTwo(const Ort::Custom::Tensor& X, +void KernelTwo(OrtKernelContext* context, + const Ort::Custom::Tensor& X, Ort::Custom::Tensor& Y) { const auto& shape = X.Shape(); auto X_raw = X.Data(); auto Y_raw = Y.Allocate(shape); + std::vector floats(static_cast(X.NumberOfElement()), 0.f); + + DataI data_i = {X_raw, floats.data()}; auto total = std::accumulate(shape.begin(), shape.end(), 1LL, std::multiplies()); - for (int64_t i = 0; i < total; i++) { - Y_raw[i] = static_cast(round(X_raw[i])); - } + + Ort::KernelContext ctx(context); + ctx.ParallelFor(CopyI, static_cast(total), 0, &data_i); // test simple parallel for + + DataII data_ii = {floats.data(), Y_raw}; + ctx.ParallelFor(CopyII, static_cast(total), 2, &data_ii); // test batch parallel for } template diff --git a/onnxruntime/test/testdata/custom_op_local_function/custom_ops_type_inference_fails_0.onnx 
b/onnxruntime/test/testdata/custom_op_local_function/custom_ops_type_inference_fails_0.onnx index 8116ec3380645..3a43a7378a912 100644 Binary files a/onnxruntime/test/testdata/custom_op_local_function/custom_ops_type_inference_fails_0.onnx and b/onnxruntime/test/testdata/custom_op_local_function/custom_ops_type_inference_fails_0.onnx differ diff --git a/onnxruntime/test/testdata/onnx_backend_test_series_filters.jsonc b/onnxruntime/test/testdata/onnx_backend_test_series_filters.jsonc index 3db497fa92315..c2ca5f860a107 100644 --- a/onnxruntime/test/testdata/onnx_backend_test_series_filters.jsonc +++ b/onnxruntime/test/testdata/onnx_backend_test_series_filters.jsonc @@ -248,11 +248,6 @@ "^test_image_decoder_decode_pnm_rgb", "^test_image_decoder_decode_tiff_rgb", "^test_image_decoder_decode_webp_rgb", - "^test_string_concat_broadcasting", - "^test_string_concat", - "^test_string_concat_empty_string", - "^test_string_concat_utf8", - "^test_string_concat_zero_dimensional", "^test_string_split_basic", "^test_string_split_consecutive_delimiters", "^test_string_split_empty_string_delimiter", diff --git a/onnxruntime/test/testdata/qdq_with_multi_consumer_q_dq_axis.onnx b/onnxruntime/test/testdata/qdq_with_multi_consumer_q_dq_axis.onnx new file mode 100644 index 0000000000000..4f575ebb2841a Binary files /dev/null and b/onnxruntime/test/testdata/qdq_with_multi_consumer_q_dq_axis.onnx differ diff --git a/orttraining/orttraining/core/optimizer/memory_optimizer/memory_insight.cc b/orttraining/orttraining/core/optimizer/memory_optimizer/memory_insight.cc index 9b77832abb6f1..3fbdd5da7b768 100644 --- a/orttraining/orttraining/core/optimizer/memory_optimizer/memory_insight.cc +++ b/orttraining/orttraining/core/optimizer/memory_optimizer/memory_insight.cc @@ -485,12 +485,15 @@ void ListAllCombinations(const InlinedVector> new_combination = current_combination; - new_combination.push_back(plan); - ListAllCombinations(all_possible_node_optimization_plans, index + 1, new_combination, 
logger, all_combinations); - } + const InlinedVector>>& + plan_combination_list_at_cur_index = all_possible_node_optimization_plans[index]; + // For the index-th reused buffer, iterate all possible complete plans. + for (size_t i = 0; i < plan_combination_list_at_cur_index.size(); ++i) { + const auto& plan_combination = plan_combination_list_at_cur_index[i]; + InlinedVector> new_combination = current_combination; + // Append the chosen complete plan and continue exploring the next reused buffer by index + 1. + new_combination.insert(new_combination.end(), plan_combination.begin(), plan_combination.end()); + ListAllCombinations(all_possible_node_optimization_plans, index + 1, new_combination, logger, all_combinations); } MO_LOG_DEBUG_INFO(logger, "Exit ListAllCombinations"); @@ -520,17 +523,28 @@ void IterateNodeOptimizationPlan(const std::shared_ptr } InlinedVector>>> - all_possible_node_optimization_plans; - all_possible_node_optimization_plans.resize(plan->reuse_buffers.size()); + all_possible_node_optimization_plans(plan->reuse_buffers.size()); size_t i = 0; for (const auto& p : plan->reuse_buffers) { MO_LOG_DEBUG_INFO(logger, ">>>reuse buffer: " + std::to_string(p.first)); - IterateNode(p.second.first, node_to_optimization_plans_map, {}, logger, all_possible_node_optimization_plans[i]); + // If the resued node is part of current node optimization plan, then we just add current combination to the result. 
+ if (plan->GetOptimizationType() == OptimizationType::RecomputeWithCompromise || plan->GetOptimizationType() == OptimizationType::Recompute) { + const auto& recompute_subgraph = + dynamic_cast(plan.get())->GetNodesInTopoOrder(); + if (std::find(recompute_subgraph.begin(), recompute_subgraph.end(), p.second.first) != recompute_subgraph.end()) { + all_possible_node_optimization_plans[i].push_back(current_combination); + } + } + + if (all_possible_node_optimization_plans[i].size() == 0) { + IterateNode(p.second.first, node_to_optimization_plans_map, current_combination, logger, all_possible_node_optimization_plans[i]); + } + ++i; } - ListAllCombinations(all_possible_node_optimization_plans, 0, current_combination, logger, all_combinations); + ListAllCombinations(all_possible_node_optimization_plans, 0, {}, logger, all_combinations); MO_LOG_DEBUG_INFO(logger, "Exit IterateNodeOptimizationPlan: " + plan->GetClusterId()); } diff --git a/orttraining/orttraining/core/optimizer/memory_optimizer/optimization_planner.cc b/orttraining/orttraining/core/optimizer/memory_optimizer/optimization_planner.cc index 64e99a4a0bca5..4ce896c5350b0 100644 --- a/orttraining/orttraining/core/optimizer/memory_optimizer/optimization_planner.cc +++ b/orttraining/orttraining/core/optimizer/memory_optimizer/optimization_planner.cc @@ -15,35 +15,6 @@ namespace onnxruntime::optimizer::memory_optimizer { -std::string NodeOptimizationPlanBase::GetMemorySavingSymbolicString() const { - std::string saving_str; - for (auto output_index : activation_output_indices_) { - // If the output is reusing other node's buffer, then no memory saving. 
- if (reuse_buffers.find(output_index) != reuse_buffers.end()) { - continue; - } - - const auto& output_def = node->OutputDefs()[output_index]; - MLDataType ml_data_type = DataTypeImpl::TypeFromProto(*output_def->TypeAsProto()); - ORT_ENFORCE(ml_data_type->IsTensorType(), "ml_type must be a tensor type, but it is ", - DataTypeImpl::ToString(ml_data_type)); - const TensorTypeBase* tensor_type_base = ml_data_type->AsTensorType(); - ORT_ENFORCE(nullptr != tensor_type_base); - MLDataType elt_type = tensor_type_base->GetElementType(); - const auto byte_count_per_element = elt_type->Size(); - if (!saving_str.empty()) { - saving_str += " + "; - } - saving_str = "(" + GetActivationOutputDimParamString(output_index) + " * " + - std::to_string(byte_count_per_element) + " * " + - std::to_string(GetSaveRatio()) + ")"; - } - if (saving_str.empty()) { - return saving_str; - } - return "(" + saving_str + ")"; -} - Status MemoryOptimizationPlanner::UpdateNodePlansFromExecutionPlan(const GraphViewer& graph_viewer, const OrtValueNameIdxMap& ortvalue_name_to_idx_map, const SequentialExecutionPlan& p_seq_exec_plan) { diff --git a/orttraining/orttraining/core/optimizer/memory_optimizer/optimization_planner.h b/orttraining/orttraining/core/optimizer/memory_optimizer/optimization_planner.h index c585b2810b39d..789f530b29f1d 100644 --- a/orttraining/orttraining/core/optimizer/memory_optimizer/optimization_planner.h +++ b/orttraining/orttraining/core/optimizer/memory_optimizer/optimization_planner.h @@ -83,7 +83,7 @@ class NodeOptimizationPlanBase { /** * Get a symbolic string to represent the memory saving for this optimization plan. 
*/ - std::string GetMemorySavingSymbolicString() const; + virtual std::string GetMemorySavingSymbolicString() const = 0; std::string GetActivationOutputDimParamString(size_t index) const { ORT_ENFORCE(activation_output_dim_params_.find(index) != activation_output_dim_params_.end(), diff --git a/orttraining/orttraining/core/optimizer/memory_optimizer/recompute_analysis.cc b/orttraining/orttraining/core/optimizer/memory_optimizer/recompute_analysis.cc index 52dea571a1eaf..12c83591c0036 100644 --- a/orttraining/orttraining/core/optimizer/memory_optimizer/recompute_analysis.cc +++ b/orttraining/orttraining/core/optimizer/memory_optimizer/recompute_analysis.cc @@ -72,12 +72,14 @@ const InlinedHashMap& GetAllowedRecompu {"Add", AllowedRecomputeNodeConfig{{0, 1}}}, {"BiasGelu", AllowedRecomputeNodeConfig{{0, 1}}}, {"Div", AllowedRecomputeNodeConfig{{0, 1}}}, + {"Equal", AllowedRecomputeNodeConfig{{0, 1}}}, {"Mul", AllowedRecomputeNodeConfig{{0, 1}}}, {"Sub", AllowedRecomputeNodeConfig{{0, 1}}}, // Data layout /// The shape input is trivial whether it exists or not in backward. 
{"Reshape", AllowedRecomputeNodeConfig{{0}}}, + {"Shape", AllowedRecomputeNodeConfig{{0}}}, {"Squeeze", AllowedRecomputeNodeConfig{{0}}}, {"Transpose", AllowedRecomputeNodeConfig{{0}}}, {"Unsqueeze", AllowedRecomputeNodeConfig{{0}}}, @@ -92,6 +94,7 @@ const InlinedHashMap& GetAllowedRecompu {"Expand", AllowedRecomputeNodeConfig{{0}}}, {"FastGelu", AllowedRecomputeNodeConfig{{0}}}, {"Gelu", AllowedRecomputeNodeConfig{{0}}}, + {"QuickGelu", AllowedRecomputeNodeConfig{{0}}}, // Ternary elementwise {"Where", AllowedRecomputeNodeConfig{{0, 1, 2}}}, diff --git a/orttraining/orttraining/core/optimizer/memory_optimizer/recompute_analysis.h b/orttraining/orttraining/core/optimizer/memory_optimizer/recompute_analysis.h index d9693835313b8..ab114d970191e 100644 --- a/orttraining/orttraining/core/optimizer/memory_optimizer/recompute_analysis.h +++ b/orttraining/orttraining/core/optimizer/memory_optimizer/recompute_analysis.h @@ -86,6 +86,51 @@ class NodeRecomputePlan : public NodeOptimizationPlanBase { std::string GetNodesInTopoOrderStr() const; + std::string GetMemorySavingSymbolicString() const override { + std::string saving_str; + for (auto output_index : GetActivationOutputIndices()) { + // If the output is reusing other node's buffer, then no memory saving. + std::string cur_output_saving_str; + + bool is_reused = reuse_buffers.find(output_index) != reuse_buffers.end(); + bool is_src_node_in_cur_node_subgraph = false; + if (is_reused) { + // Here we assume the src_node is the real owner of the buffer, so we don't need trace further. + const auto* src_node = reuse_buffers.at(output_index).first; + is_src_node_in_cur_node_subgraph = std::find(nodes_in_topological_order_.begin(), + nodes_in_topological_order_.end(), + src_node) != nodes_in_topological_order_.end(); + } + + if (!is_reused || is_src_node_in_cur_node_subgraph) { + // For is_src_node_in_cur_node_subgraph is True, still use the output to calculate the saving, because + // reusing buffer is the same size. 
+ const auto& output_def = node->OutputDefs()[output_index]; + MLDataType ml_data_type = DataTypeImpl::TypeFromProto(*output_def->TypeAsProto()); + ORT_ENFORCE(ml_data_type->IsTensorType(), "ml_type must be a tensor type, but it is ", + DataTypeImpl::ToString(ml_data_type)); + const TensorTypeBase* tensor_type_base = ml_data_type->AsTensorType(); + ORT_ENFORCE(nullptr != tensor_type_base); + MLDataType elt_type = tensor_type_base->GetElementType(); + const auto byte_count_per_element = elt_type->Size(); + cur_output_saving_str = GetActivationOutputDimParamString(output_index) + " * " + + std::to_string(byte_count_per_element) + " * " + + std::to_string(GetSaveRatio()); + } else { + cur_output_saving_str = "0"; + } + + if (!saving_str.empty()) { + saving_str += " + "; + } + + saving_str = "(" + cur_output_saving_str + ")"; + } + + ORT_ENFORCE(!saving_str.empty(), "saving_str should not be empty for node: ", node->OpType(), " ", node->Name()); + return "(" + saving_str + ")"; + } + private: bool compromise_recompute_; InlinedVector nodes_in_topological_order_; diff --git a/orttraining/orttraining/python/training/ortmodule/_graph_execution_manager.py b/orttraining/orttraining/python/training/ortmodule/_graph_execution_manager.py index 76943b954837b..853eab61b4bd6 100755 --- a/orttraining/orttraining/python/training/ortmodule/_graph_execution_manager.py +++ b/orttraining/orttraining/python/training/ortmodule/_graph_execution_manager.py @@ -243,7 +243,7 @@ def _get_session_config(self): # requires PRIORITY_BASED order to work properly. So we use PRIORITY_BASED order when recompute is enabled. session_options.execution_order = ( onnxruntime.ExecutionOrder.PRIORITY_BASED - if self._runtime_options.memory_optimizer_config != "" + if self._runtime_options.memory_optimizer_is_enabled() else onnxruntime.ExecutionOrder.DEFAULT ) # 0:Verbose, 1:Info, 2:Warning. 3:Error, 4:Fatal. Default is 2. 
diff --git a/orttraining/orttraining/python/training/ortmodule/options.py b/orttraining/orttraining/python/training/ortmodule/options.py index a93f6413b7ab4..bfa38efb349ae 100644 --- a/orttraining/orttraining/python/training/ortmodule/options.py +++ b/orttraining/orttraining/python/training/ortmodule/options.py @@ -399,3 +399,12 @@ def _override_from_env_vars(self): if "ORTMODULE_DEEPCOPY_BEFORE_MODEL_EXPORT" in os.environ: self.deepcopy_before_model_export = int(os.getenv("ORTMODULE_DEEPCOPY_BEFORE_MODEL_EXPORT")) == 1 + + def memory_optimizer_is_enabled(self) -> bool: + """Check whether memory optimizer is enabled.""" + if self.memory_optimization_level == _MemoryOptimizationLevel.USER_SPECIFIED: + return len(self.memory_optimizer_config) > 0 + elif self.memory_optimization_level == _MemoryOptimizationLevel.TRANSFORMER_LAYERWISE_RECOMPUTE: + return True + + return False diff --git a/orttraining/orttraining/test/gradient/gradient_ops_test.cc b/orttraining/orttraining/test/gradient/gradient_ops_test.cc index 6fb42dd59b6a0..feca94ae27c13 100644 --- a/orttraining/orttraining/test/gradient/gradient_ops_test.cc +++ b/orttraining/orttraining/test/gradient/gradient_ops_test.cc @@ -2218,12 +2218,6 @@ TEST(GradientUtilsTest, InPlaceAccumulatorV2_GPU) { {3072, 768}, {4096, 768}, {8192, 768}, - {16384, 768}, - {32768, 768}, - {65536, 768}, - {131072, 768}, - {250002, 768}, - {500004, 768}, }; for (const auto& test_dim : test_dims) { diff --git a/setup.py b/setup.py index 685f0612e3762..e94165fdf9b05 100644 --- a/setup.py +++ b/setup.py @@ -451,6 +451,7 @@ def finalize_options(self): "Programming Language :: Python :: 3.9", "Programming Language :: Python :: 3.10", "Programming Language :: Python :: 3.11", + "Programming Language :: Python :: 3.12", "Operating System :: Microsoft :: Windows", "Operating System :: MacOS", ] diff --git a/tools/ci_build/build.py b/tools/ci_build/build.py index c655100fbf475..3d0ec92a7bd23 100644 --- a/tools/ci_build/build.py +++ 
b/tools/ci_build/build.py @@ -606,7 +606,7 @@ def convert_arg_line_to_args(self, arg_line): "--use_acl", nargs="?", const="ACL_1905", - choices=["ACL_1902", "ACL_1905", "ACL_1908", "ACL_2002"], + choices=["ACL_1902", "ACL_1905", "ACL_1908", "ACL_2002", "ACL_2308"], help="Build with ACL for ARM architectures.", ) parser.add_argument("--acl_home", help="Path to ACL home dir") @@ -1031,6 +1031,7 @@ def generate_build_tree( "-Donnxruntime_USE_ACL_1905=" + ("ON" if args.use_acl == "ACL_1905" else "OFF"), "-Donnxruntime_USE_ACL_1908=" + ("ON" if args.use_acl == "ACL_1908" else "OFF"), "-Donnxruntime_USE_ACL_2002=" + ("ON" if args.use_acl == "ACL_2002" else "OFF"), + "-Donnxruntime_USE_ACL_2308=" + ("ON" if args.use_acl == "ACL_2308" else "OFF"), "-Donnxruntime_USE_ARMNN=" + ("ON" if args.use_armnn else "OFF"), "-Donnxruntime_ARMNN_RELU_USE_CPU=" + ("OFF" if args.armnn_relu else "ON"), "-Donnxruntime_ARMNN_BN_USE_CPU=" + ("OFF" if args.armnn_bn else "ON"), diff --git a/tools/ci_build/github/azure-pipelines/c-api-noopenmp-packaging-pipelines.yml b/tools/ci_build/github/azure-pipelines/c-api-noopenmp-packaging-pipelines.yml index badee79fd78b3..172a0dc1866ab 100644 --- a/tools/ci_build/github/azure-pipelines/c-api-noopenmp-packaging-pipelines.yml +++ b/tools/ci_build/github/azure-pipelines/c-api-noopenmp-packaging-pipelines.yml @@ -92,6 +92,9 @@ stages: vmImage: ubuntu-latest steps: - checkout: none + - task: mspremier.PostBuildCleanup.PostBuildCleanup-task.PostBuildCleanup@3 + displayName: 'Clean Agent Directories' + condition: always() - bash: | # Do not output ##vso[] commands with `set -x` or they may be parsed again and include a trailing quote. 
set +x @@ -105,6 +108,10 @@ stages: echo "##vso[task.setvariable variable=ReleaseVersionSuffix;isOutput=true]" fi name: Set_Release_Version_Suffix + - template: templates/component-governance-component-detection-steps.yml + parameters : + condition : 'succeeded' + - stage: Debug dependsOn: Setup @@ -116,7 +123,14 @@ stages: MyVar: $[stageDependencies.Setup.Set_Variables.outputs['Set_Release_Version_Suffix.ReleaseVersionSuffix']] steps: - checkout: none + - task: mspremier.PostBuildCleanup.PostBuildCleanup-task.PostBuildCleanup@3 + displayName: 'Clean Agent Directories' + condition: always() - bash: echo $(MyVar) + - template: templates/component-governance-component-detection-steps.yml + parameters : + condition : 'succeeded' + - stage: Download_Java_Tools dependsOn: [] @@ -126,6 +140,9 @@ stages: vmImage: ubuntu-latest steps: - checkout: none + - task: mspremier.PostBuildCleanup.PostBuildCleanup-task.PostBuildCleanup@3 + displayName: 'Clean Agent Directories' + condition: always() - task: CmdLine@2 displayName: Download Java Tools inputs: @@ -141,6 +158,9 @@ stages: inputs: targetPath: '$(Agent.TempDirectory)/java-tools' artifact: 'onnxruntime-java-tools' + - template: templates/component-governance-component-detection-steps.yml + parameters : + condition : 'succeeded' - template: templates/c-api-cpu.yml parameters: @@ -525,6 +545,9 @@ stages: submodules: false - checkout: manylinux # due to checkout multiple repos, the root directory is $(Build.SourcesDirectory)/manylinux submodules: false + - task: mspremier.PostBuildCleanup.PostBuildCleanup-task.PostBuildCleanup@3 + displayName: 'Clean Agent Directories' + condition: always() - script: | set -e -x @@ -603,6 +626,10 @@ stages: inputs: targetPath: '$(Build.ArtifactStagingDirectory)/onnxruntime-linux-x64-gpu-$(OnnxRuntimeVersion).tgz' artifactName: 'onnxruntime-linux-x64-gpu' + - template: templates/component-governance-component-detection-steps.yml + parameters : + condition : 'succeeded' + - stage: 
Windows_Packaging_combined_GPU dependsOn: @@ -619,6 +646,10 @@ stages: - checkout: self # due to checkout multiple repos, the root directory is $(Build.SourcesDirectory)/onnxruntime - checkout: onnxruntime-inference-examples # due to checkout multiple repos, the root directory is $(Build.SourcesDirectory)/onnxruntime-inference-examples submodules: false + - task: mspremier.PostBuildCleanup.PostBuildCleanup-task.PostBuildCleanup@3 + displayName: 'Clean Agent Directories' + condition: always() + - script: dir $(Build.SourcesDirectory) - task: BatchScript@1 displayName: 'setup env' @@ -688,7 +719,9 @@ stages: inputs: artifactName: 'onnxruntime-win-x64-gpu' targetPath: '$(Build.ArtifactStagingDirectory)' - + - template: templates/component-governance-component-detection-steps.yml + parameters : + condition : 'succeeded' - stage: NuGet_Packaging_GPU dependsOn: @@ -1246,45 +1279,21 @@ stages: mkdir $(Build.ArtifactStagingDirectory)\testdata copy $(Build.BinariesDirectory)\RelWithDebInfo\RelWithDebInfo\custom_op_library.* $(Build.ArtifactStagingDirectory)\testdata -- template: nuget/templates/dml-vs-2022.yml - parameters: - AgentPool : 'onnxruntime-Win-CPU-2022' - IsReleaseBuild: ${{ parameters.IsReleaseBuild }} - ArtifactName: 'drop-win-dml-arm-zip' - StageName: 'Windows_CI_GPU_DML_Dev_arm' - BuildCommand: --build_dir $(Build.BinariesDirectory) --arm --skip_submodule_sync --build_shared_lib --enable_onnx_tests --enable_wcos --use_telemetry --use_dml --use_winml --cmake_generator "Visual Studio 17 2022" - BuildArch: 'x64' - EnvSetupScript: 'setup_env.bat' - sln_platform: 'arm' - DoDebugBuild: 'false' - DoNugetPack : 'true' - DoCompliance: ${{ parameters.DoCompliance }} - DoEsrp: ${{ parameters.DoEsrp }} - RunTests: 'false' - BuildNodejs: 'false' - NuPackScript: | - msbuild $(Build.SourcesDirectory)\csharp\OnnxRuntime.CSharp.proj /p:Configuration=RelWithDebInfo /p:TargetArchitecture=arm /t:CreatePackage /p:OrtPackageId=Microsoft.ML.OnnxRuntime.DirectML 
/p:IsReleaseBuild=${{ parameters.IsReleaseBuild }} - cd $(Build.BinariesDirectory)\RelWithDebInfo\RelWithDebInfo\ - ren Microsoft.ML.OnnxRuntime.DirectML.* win-dml-arm.zip - copy $(Build.BinariesDirectory)\RelWithDebInfo\RelWithDebInfo\win-dml-arm.zip $(Build.ArtifactStagingDirectory) - mkdir $(Build.ArtifactStagingDirectory)\testdata - copy $(Build.BinariesDirectory)\RelWithDebInfo\RelWithDebInfo\custom_op_library.* $(Build.ArtifactStagingDirectory)\testdata - - stage: NuGet_Packaging_DML dependsOn: - Windows_CI_GPU_DML_Dev - Windows_CI_GPU_DML_Dev_x86 - Windows_CI_GPU_DML_Dev_arm64 - - Windows_CI_GPU_DML_Dev_arm condition: succeeded() jobs: - job: workspace: clean: all - pool: 'onnxruntime-Win2022-GPU-T4' - + pool: 'onnxruntime-Win2022-GPU-dml-A10' steps: - + - task: mspremier.PostBuildCleanup.PostBuildCleanup-task.PostBuildCleanup@3 + displayName: 'Clean Agent Directories' + condition: always() - task: DownloadPipelineArtifact@0 displayName: 'Download Pipeline Artifact - NuGet DirectML' inputs: @@ -1303,12 +1312,6 @@ stages: artifactName: 'drop-win-dml-arm64-zip' targetPath: '$(Build.BinariesDirectory)/nuget-artifact-dml' - - task: DownloadPipelineArtifact@0 - displayName: 'Download Pipeline Artifact - NuGet DirectML arm' - inputs: - artifactName: 'drop-win-dml-arm-zip' - targetPath: '$(Build.BinariesDirectory)/nuget-artifact-dml' - - script: | pushd $(Build.BinariesDirectory)\nuget-artifact-dml dir @@ -1339,13 +1342,6 @@ stages: move win-arm64\runtimes\win-arm64\native\onnxruntime.lib %%~ni\runtimes\win-arm64\native\onnxruntime.lib move win-arm64\runtimes\win-arm64\native\onnxruntime.pdb %%~ni\runtimes\win-arm64\native\onnxruntime.pdb - unzip win-dml-arm.zip -d win-arm - mkdir %%~ni\runtimes\win-arm - mkdir %%~ni\runtimes\win-arm\native - - move win-arm\runtimes\win-arm\native\onnxruntime.dll %%~ni\runtimes\win-arm\native\onnxruntime.dll - move win-arm\runtimes\win-arm\native\onnxruntime.lib %%~ni\runtimes\win-arm\native\onnxruntime.lib - move 
win-arm\runtimes\win-arm\native\onnxruntime.pdb %%~ni\runtimes\win-arm\native\onnxruntime.pdb pushd %%~ni zip -r ..\%%~ni.zip . @@ -1368,7 +1364,7 @@ stages: PackageType: 'nuget' PackagePath: '$(Build.ArtifactStagingDirectory)' PackageName: 'Microsoft.ML.OnnxRuntime.DirectML*nupkg' - PlatformsSupported: 'win-x64,win-x86,win-arm64,win-arm' + PlatformsSupported: 'win-x64,win-x86,win-arm64' VerifyNugetSigning: ${{ parameters.DoEsrp }} - task: PublishPipelineArtifact@0 @@ -1376,3 +1372,6 @@ stages: inputs: artifactName: 'drop-signed-nuget-dml' targetPath: '$(Build.ArtifactStagingDirectory)' + - template: templates/component-governance-component-detection-steps.yml + parameters : + condition : 'succeeded' diff --git a/tools/ci_build/github/azure-pipelines/orttraining-py-packaging-pipeline-cpu.yml b/tools/ci_build/github/azure-pipelines/orttraining-py-packaging-pipeline-cpu.yml index 9755e1f0771ba..07b233590bcf5 100644 --- a/tools/ci_build/github/azure-pipelines/orttraining-py-packaging-pipeline-cpu.yml +++ b/tools/ci_build/github/azure-pipelines/orttraining-py-packaging-pipeline-cpu.yml @@ -13,7 +13,7 @@ stages: jobs: - job: Linux_Training_CPU_Wheels - timeoutInMinutes: 120 + timeoutInMinutes: 180 workspace: clean: all pool: onnxruntime-Ubuntu2004-AMD-CPU @@ -28,6 +28,8 @@ stages: PythonVersion: '3.10' Python311: PythonVersion: '3.11' + Python312: + PythonVersion: '3.12' steps: - checkout: self diff --git a/tools/ci_build/github/azure-pipelines/qnn-ep-nuget-packaging-pipeline.yml b/tools/ci_build/github/azure-pipelines/qnn-ep-nuget-packaging-pipeline.yml index d9aff36c4ad34..f6fcbd08ff03a 100644 --- a/tools/ci_build/github/azure-pipelines/qnn-ep-nuget-packaging-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/qnn-ep-nuget-packaging-pipeline.yml @@ -9,11 +9,6 @@ parameters: type: string default: qnn-v2.17.0.231124_win -- name: ort_package_version - displayName: OnnxRuntime Nuget package version - type: string - default: 1.15.0 - - name: build_config displayName: 
Build Configuration type: string @@ -47,7 +42,7 @@ jobs: buildArch: x64 setVcvars: true ALLOW_RELEASED_ONNX_OPSET_ONLY: '1' - commonBuildArgs: '--compile_no_warning_as_error --disable_ml_ops --build_dir $(Build.BinariesDirectory)\Windows --skip_submodule_sync --build_shared_lib --cmake_generator "Visual Studio 17 2022" --config ${{ parameters.build_config }} --use_qnn --qnn_home ${{parameters.qnn_sdk_path_win}}' + commonBuildArgs: '--compile_no_warning_as_error --build_dir $(Build.BinariesDirectory)\Windows --skip_submodule_sync --build_shared_lib --cmake_generator "Visual Studio 17 2022" --config ${{ parameters.build_config }} --use_qnn --qnn_home ${{parameters.qnn_sdk_path_win}}' steps: - template: templates/set-version-number-variables-step.yml @@ -90,7 +85,7 @@ jobs: displayName: 'Generating nuspec for the native Nuget package x64' inputs: script: | - python "$(Build.SourcesDirectory)\tools\nuget\generate_nuspec_for_native_nuget.py" --package_version ${{ parameters.ort_package_version }} --package_name Microsoft.ML.OnnxRuntime.QNN --target_architecture x64 --build_config ${{ parameters.build_config }} --native_build_path=$(Build.BinariesDirectory)\Windows\${{ parameters.build_config }}\${{ parameters.build_config }} --packages_path $(Build.BinariesDirectory)\Windows\packages --ort_build_path $(Build.BinariesDirectory)\Windows --sources_path $(Build.SourcesDirectory) --commit_id $(OnnxRuntimeGitCommitHash) --is_release_build ${{ parameters.IsReleaseBuild }} --sdk_info ${{ parameters.qnn_sdk_info }} + python "$(Build.SourcesDirectory)\tools\nuget\generate_nuspec_for_native_nuget.py" --package_version $(OnnxRuntimeVersion) --package_name Microsoft.ML.OnnxRuntime.QNN --target_architecture x64 --build_config ${{ parameters.build_config }} --native_build_path=$(Build.BinariesDirectory)\Windows\${{ parameters.build_config }}\${{ parameters.build_config }} --packages_path $(Build.BinariesDirectory)\Windows\packages --ort_build_path $(Build.BinariesDirectory)\Windows 
--sources_path $(Build.SourcesDirectory) --commit_id $(OnnxRuntimeGitCommitHash) --is_release_build ${{ parameters.IsReleaseBuild }} --sdk_info ${{ parameters.qnn_sdk_info }} cd $(Build.BinariesDirectory)\Windows\${{ parameters.build_config }}\${{ parameters.build_config }} nuget pack NativeNuget.nuspec mkdir $(Build.ArtifactStagingDirectory)\x64 @@ -130,7 +125,7 @@ jobs: displayName: 'Generate CMake Configuration for arm64' inputs: scriptPath: '$(Build.SourcesDirectory)\tools\ci_build\build.py' - arguments: '--update --arm64 --disable_ml_ops --build_dir $(Build.BinariesDirectory)\Win_arm64 --skip_submodule_sync --skip_tests --build_shared_lib --cmake_generator "Visual Studio 17 2022" --config ${{ parameters.build_config }} --use_qnn --qnn_home ${{parameters.qnn_sdk_path_win}}' + arguments: '--update --arm64 --build_dir $(Build.BinariesDirectory)\Win_arm64 --skip_submodule_sync --skip_tests --build_shared_lib --cmake_generator "Visual Studio 17 2022" --config ${{ parameters.build_config }} --use_qnn --qnn_home ${{parameters.qnn_sdk_path_win}}' - task: VSBuild@1 displayName: 'Build onnxruntime arm64' @@ -178,7 +173,7 @@ jobs: displayName: 'Generating nuspec for the native Nuget package arm64' inputs: script: | - python "$(Build.SourcesDirectory)\tools\nuget\generate_nuspec_for_native_nuget.py" --package_version ${{ parameters.ort_package_version }} --package_name Microsoft.ML.OnnxRuntime.QNN --target_architecture arm64 --build_config ${{ parameters.build_config }} --native_build_path=$(Build.BinariesDirectory)\Win_arm64\${{ parameters.build_config }}\${{ parameters.build_config }} --packages_path $(Build.BinariesDirectory)\Win_arm64\packages --ort_build_path $(Build.BinariesDirectory)\Win_arm64 --sources_path $(Build.SourcesDirectory) --commit_id $(OnnxRuntimeGitCommitHash) --is_release_build ${{ parameters.IsReleaseBuild }} --sdk_info ${{ parameters.qnn_sdk_info }} + python "$(Build.SourcesDirectory)\tools\nuget\generate_nuspec_for_native_nuget.py" 
--package_version $(OnnxRuntimeVersion) --package_name Microsoft.ML.OnnxRuntime.QNN --target_architecture arm64 --build_config ${{ parameters.build_config }} --native_build_path=$(Build.BinariesDirectory)\Win_arm64\${{ parameters.build_config }}\${{ parameters.build_config }} --packages_path $(Build.BinariesDirectory)\Win_arm64\packages --ort_build_path $(Build.BinariesDirectory)\Win_arm64 --sources_path $(Build.SourcesDirectory) --commit_id $(OnnxRuntimeGitCommitHash) --is_release_build ${{ parameters.IsReleaseBuild }} --sdk_info ${{ parameters.qnn_sdk_info }} cd $(Build.BinariesDirectory)\Win_arm64\${{ parameters.build_config }}\${{ parameters.build_config }} nuget pack NativeNuget.nuspec mkdir $(Build.ArtifactStagingDirectory)\arm64 diff --git a/tools/ci_build/github/azure-pipelines/stages/py-cuda-packaging-stage.yml b/tools/ci_build/github/azure-pipelines/stages/py-cuda-packaging-stage.yml index f3d68957d649c..e6d8ee35e75e3 100644 --- a/tools/ci_build/github/azure-pipelines/stages/py-cuda-packaging-stage.yml +++ b/tools/ci_build/github/azure-pipelines/stages/py-cuda-packaging-stage.yml @@ -92,6 +92,14 @@ stages: EP_NAME: gpu CudaVersion: ${{ parameters.cuda_version }} + - template: ../templates/py-win-gpu.yml + parameters: + MACHINE_POOL: 'onnxruntime-Win2022-GPU-T4' + PYTHON_VERSION: '3.12' + EP_BUILD_FLAGS: --use_tensorrt --tensorrt_home=${{ variables.win_trt_home }} --cuda_home=${{ variables.win_cuda_home }} --cmake_extra_defines "CMAKE_CUDA_ARCHITECTURES=52;60;61;70;75;80" + EP_NAME: gpu + CudaVersion: ${{ parameters.cuda_version }} + - ${{ if eq(parameters.enable_linux_gpu, true) }}: - template: ../templates/py-linux-gpu.yml diff --git a/tools/ci_build/github/azure-pipelines/templates/c-api-cpu.yml b/tools/ci_build/github/azure-pipelines/templates/c-api-cpu.yml index 37b4bdc43afcd..e6025ae1b56bd 100644 --- a/tools/ci_build/github/azure-pipelines/templates/c-api-cpu.yml +++ b/tools/ci_build/github/azure-pipelines/templates/c-api-cpu.yml @@ -161,20 +161,6 @@ 
stages: buildJava: false buildNodejs: false -- template: win-ci.yml - parameters: - DoCompliance: ${{ parameters.DoCompliance }} - DoEsrp: ${{ parameters.DoEsrp }} - stage_name_suffix: CPU_arm_${{ parameters.BuildVariant }} - buildArch: x64 - msbuildPlatform: arm - packageName: arm - buildparameter: --arm ${{ parameters.AdditionalBuildFlags }} ${{ parameters.AdditionalWinBuildFlags}} --path_to_protoc_exe $(Build.BinariesDirectory)\RelWithDebInfo\installed\bin\protoc.exe - runTests: false - buildJava: false - buildNodejs: false - ort_build_pool_name: onnxruntime-Win-CPU-2022 - - template: win-ci.yml parameters: DoCompliance: ${{ parameters.DoCompliance }} @@ -205,10 +191,7 @@ stages: dependsOn: - Linux_C_API_Packaging_CPU - MacOS_C_API_Package_Publish - - Windows_Packaging_CPU_x86_${{ parameters.BuildVariant }} - Windows_Packaging_CPU_x64_${{ parameters.BuildVariant }} - - Windows_Packaging_CPU_arm_${{ parameters.BuildVariant }} - - Windows_Packaging_CPU_arm64_${{ parameters.BuildVariant }} - Download_Java_Tools condition: succeeded() jobs: @@ -297,7 +280,6 @@ stages: - MacOS_C_API_Package_Publish - Windows_Packaging_CPU_x86_${{ parameters.BuildVariant }} - Windows_Packaging_CPU_x64_${{ parameters.BuildVariant }} - - Windows_Packaging_CPU_arm_${{ parameters.BuildVariant }} - Windows_Packaging_CPU_arm64_${{ parameters.BuildVariant }} - Android_Java_API_AAR_Packaging_Full - iOS_Full_xcframework @@ -340,14 +322,6 @@ stages: SpecificArtifact: ${{ parameters.specificArtifact }} BuildId: ${{ parameters.BuildId }} - - template: flex-downloadPipelineArtifact.yml - parameters: - StepName: 'Download win-arm Pipeline Artifact' - ArtifactName: 'onnxruntime-win-arm' - TargetPath: '$(Build.BinariesDirectory)/nuget-artifact' - SpecificArtifact: ${{ parameters.specificArtifact }} - BuildId: ${{ parameters.BuildId }} - - template: flex-downloadPipelineArtifact.yml parameters: StepName: 'Download osx-x64 Pipeline Artifact' diff --git 
a/tools/ci_build/github/azure-pipelines/templates/py-linux.yml b/tools/ci_build/github/azure-pipelines/templates/py-linux.yml index db3782c69cf62..2adcbb13dbeb8 100644 --- a/tools/ci_build/github/azure-pipelines/templates/py-linux.yml +++ b/tools/ci_build/github/azure-pipelines/templates/py-linux.yml @@ -106,3 +106,7 @@ jobs: inputs: artifactName: 'drop-linux-cpu-${{ parameters.arch }}' targetPath: '$(Build.BinariesDirectory)/${{ parameters.cmake_build_type }}' + + - template: component-governance-component-detection-steps.yml + parameters : + condition : 'succeeded' \ No newline at end of file diff --git a/tools/ci_build/github/azure-pipelines/templates/py-package-smoking-test.yml b/tools/ci_build/github/azure-pipelines/templates/py-package-smoking-test.yml index 8d5ca19a73535..0cb438c71066e 100644 --- a/tools/ci_build/github/azure-pipelines/templates/py-package-smoking-test.yml +++ b/tools/ci_build/github/azure-pipelines/templates/py-package-smoking-test.yml @@ -30,6 +30,8 @@ jobs: PythonVersion: '3.10' Python311: PythonVersion: '3.11' + Python312: + PythonVersion: '3.12' steps: - checkout: none diff --git a/tools/ci_build/github/azure-pipelines/templates/py-packaging-stage.yml b/tools/ci_build/github/azure-pipelines/templates/py-packaging-stage.yml index 44904f9248b10..a3c2983b755d0 100644 --- a/tools/ci_build/github/azure-pipelines/templates/py-packaging-stage.yml +++ b/tools/ci_build/github/azure-pipelines/templates/py-packaging-stage.yml @@ -77,6 +77,10 @@ stages: PythonVersion: '3.11' MsbuildPlatform: x64 buildArch: x64 + Python312_x64: + PythonVersion: '3.12' + MsbuildPlatform: x64 + buildArch: x64 # Training build cannot support Win32 for now because one or more of its python # dependencies does not support Win32. 
So, don't build a training package for Win32 ${{ if not(contains(parameters.build_py_parameters, '--enable_training')) }}: @@ -96,13 +100,17 @@ stages: PythonVersion: '3.11' MsbuildPlatform: Win32 buildArch: x86 + Python312_x86: + PythonVersion: '3.12' + MsbuildPlatform: Win32 + buildArch: x86 variables: OnnxRuntimeBuildDirectory: '$(Build.BinariesDirectory)' EnvSetupScript: setup_env.bat setVcvars: true BuildConfig: 'RelWithDebInfo' ExtraParam: ${{ parameters.build_py_parameters }} - timeoutInMinutes: 120 + timeoutInMinutes: 180 workspace: clean: all @@ -295,6 +303,14 @@ stages: ENV_SETUP_SCRIPT: setup_env_gpu.bat EP_NAME: gpu + - template: py-win-gpu.yml + parameters: + MACHINE_POOL: 'onnxruntime-Win2022-GPU-T4' + PYTHON_VERSION: '3.12' + EP_BUILD_FLAGS: --use_tensorrt --tensorrt_home="C:\local\TensorRT-8.6.1.6.Windows10.x86_64.cuda-11.8" --cuda_home="$(Agent.TempDirectory)\v11.8" --cmake_extra_defines "CMAKE_CUDA_ARCHITECTURES=52;60;61;70;75;80" + ENV_SETUP_SCRIPT: setup_env_gpu.bat + EP_NAME: gpu + - template: py-win-gpu.yml parameters: MACHINE_POOL: 'onnxruntime-Win2022-GPU-dml-A10' @@ -327,9 +343,17 @@ stages: ENV_SETUP_SCRIPT: setup_env.bat EP_NAME: directml + - template: py-win-gpu.yml + parameters: + MACHINE_POOL: 'onnxruntime-Win2022-GPU-dml-A10' + PYTHON_VERSION: '3.12' + EP_BUILD_FLAGS: --use_dml --cmake_extra_defines CMAKE_SYSTEM_VERSION=10.0.18362.0 --enable_wcos + ENV_SETUP_SCRIPT: setup_env.bat + EP_NAME: directml + - ${{ if eq(parameters.enable_mac_cpu, true) }}: - job: MacOS_py_Wheels - timeoutInMinutes: 120 + timeoutInMinutes: 180 workspace: clean: all pool: @@ -346,6 +370,8 @@ stages: PythonVersion: '3.10' Python311: PythonVersion: '3.11' + Python312: + PythonVersion: '3.12' steps: - checkout: self clean: true diff --git a/tools/ci_build/github/azure-pipelines/templates/py-packaging-training-cuda-stage.yml b/tools/ci_build/github/azure-pipelines/templates/py-packaging-training-cuda-stage.yml index 7fdd7e54e752d..e7b935712ac6c 100644 --- 
a/tools/ci_build/github/azure-pipelines/templates/py-packaging-training-cuda-stage.yml +++ b/tools/ci_build/github/azure-pipelines/templates/py-packaging-training-cuda-stage.yml @@ -98,6 +98,12 @@ stages: OpsetVersion: ${{ parameters.opset_version }} CudaVersion: ${{ parameters.cuda_version }} UploadWheel: ${{ parameters.upload_wheel }} + Python312: + PythonVersion: '3.12' + TorchVersion: ${{ parameters.torch_version }} + OpsetVersion: ${{ parameters.opset_version }} + CudaVersion: ${{ parameters.cuda_version }} + UploadWheel: ${{ parameters.upload_wheel }} steps: - task: CmdLine@2 diff --git a/tools/ci_build/github/azure-pipelines/templates/set-python-manylinux-variables-step.yml b/tools/ci_build/github/azure-pipelines/templates/set-python-manylinux-variables-step.yml index 110eaff46f460..1fe58a7239369 100644 --- a/tools/ci_build/github/azure-pipelines/templates/set-python-manylinux-variables-step.yml +++ b/tools/ci_build/github/azure-pipelines/templates/set-python-manylinux-variables-step.yml @@ -30,6 +30,10 @@ steps: variables = { "PythonManylinuxDir": "/opt/python/cp311-cp311" } + elif version == "3.12": + variables = { + "PythonManylinuxDir": "/opt/python/cp312-cp312" + } else: raise ValueError("Unsupported Python version: '{}'".format(version)) diff --git a/tools/ci_build/github/azure-pipelines/win-gpu-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/win-gpu-ci-pipeline.yml index fdb9238071c9e..eee38ac04b355 100644 --- a/tools/ci_build/github/azure-pipelines/win-gpu-ci-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/win-gpu-ci-pipeline.yml @@ -59,15 +59,14 @@ stages: BuildConfig: 'RelWithDebInfo' EnvSetupScript: setup_env_cuda.bat buildArch: x64 - additionalBuildFlags: --enable_pybind --enable_training --use_cuda --cuda_home="$(Agent.TempDirectory)\v11.8" --skip_onnx_tests --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=75 + additionalBuildFlags: --enable_pybind --enable_training --use_cuda --cuda_home="$(Agent.TempDirectory)\v11.8" 
--skip_onnx_tests --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=86 msbuildPlatform: x64 isX86: false job_name_suffix: x64_RelWithDebInfo RunOnnxRuntimeTests: ${{ parameters.RunOnnxRuntimeTests }} ORT_EP_NAME: CUDA WITH_CACHE: true - # Some unit tests crash on A10 GPUs. So this job still needs to use T4. - MachinePool: onnxruntime-Win2022-GPU-T4 + MachinePool: onnxruntime-Win2022-GPU-A10 isTraining: true - stage: dml diff --git a/tools/ci_build/github/linux/build_linux_python_package.sh b/tools/ci_build/github/linux/build_linux_python_package.sh index 3c1c65c9a6862..4c0a39fdc512e 100755 --- a/tools/ci_build/github/linux/build_linux_python_package.sh +++ b/tools/ci_build/github/linux/build_linux_python_package.sh @@ -9,7 +9,7 @@ EXTRA_ARG="" # Put 3.8 at the last because Ubuntu 20.04 use python 3.8 and we will upload the intermediate build files of this # config to Azure DevOps Artifacts and download them to a Ubuntu 20.04 machine to run the tests. -PYTHON_EXES=("/opt/python/cp39-cp39/bin/python3.9" "/opt/python/cp310-cp310/bin/python3.10" "/opt/python/cp311-cp311/bin/python3.11" "/opt/python/cp38-cp38/bin/python3.8") +PYTHON_EXES=("/opt/python/cp39-cp39/bin/python3.9" "/opt/python/cp310-cp310/bin/python3.10" "/opt/python/cp311-cp311/bin/python3.11" "/opt/python/cp312-cp312/bin/python3.12" "/opt/python/cp38-cp38/bin/python3.8") while getopts "d:p:x:c:" parameter_Option do case "${parameter_Option}" in diff --git a/tools/ci_build/github/linux/docker/Dockerfile.manylinux2_28_cpu b/tools/ci_build/github/linux/docker/Dockerfile.manylinux2_28_cpu index af87852561e0a..546fca69201a1 100644 --- a/tools/ci_build/github/linux/docker/Dockerfile.manylinux2_28_cpu +++ b/tools/ci_build/github/linux/docker/Dockerfile.manylinux2_28_cpu @@ -116,6 +116,10 @@ FROM build_cpython AS build_cpython311 COPY build_scripts/cpython-pubkey-310-311.txt /build_scripts/cpython-pubkeys.txt RUN manylinux-entrypoint /build_scripts/build-cpython.sh 3.11.2 +FROM build_cpython AS build_cpython312 +COPY 
build_scripts/cpython-pubkey-312-313.txt /build_scripts/cpython-pubkeys.txt +RUN manylinux-entrypoint /build_scripts/build-cpython.sh 3.12.1 + FROM build_cpython AS all_python COPY build_scripts/install-pypy.sh \ build_scripts/pypy.sha256 \ @@ -127,6 +131,7 @@ COPY --from=build_cpython38 /opt/_internal /opt/_internal/ COPY --from=build_cpython39 /opt/_internal /opt/_internal/ COPY --from=build_cpython310 /opt/_internal /opt/_internal/ COPY --from=build_cpython311 /opt/_internal /opt/_internal/ +COPY --from=build_cpython312 /opt/_internal /opt/_internal/ RUN manylinux-entrypoint /build_scripts/finalize-python.sh @@ -140,6 +145,7 @@ COPY build_scripts/finalize.sh \ build_scripts/requirements3.9.txt \ build_scripts/requirements3.10.txt \ build_scripts/requirements3.11.txt \ + build_scripts/requirements3.12.txt \ build_scripts/requirements-base-tools.txt \ /build_scripts/ COPY build_scripts/requirements-tools/* /build_scripts/requirements-tools/ diff --git a/tools/ci_build/github/linux/docker/Dockerfile.manylinux2_28_cuda b/tools/ci_build/github/linux/docker/Dockerfile.manylinux2_28_cuda index 8f265b208cd47..0c95083d614ed 100644 --- a/tools/ci_build/github/linux/docker/Dockerfile.manylinux2_28_cuda +++ b/tools/ci_build/github/linux/docker/Dockerfile.manylinux2_28_cuda @@ -119,6 +119,10 @@ FROM build_cpython AS build_cpython311 COPY build_scripts/cpython-pubkey-310-311.txt /build_scripts/cpython-pubkeys.txt RUN manylinux-entrypoint /build_scripts/build-cpython.sh 3.11.2 +FROM build_cpython AS build_cpython312 +COPY build_scripts/cpython-pubkey-312-313.txt /build_scripts/cpython-pubkeys.txt +RUN manylinux-entrypoint /build_scripts/build-cpython.sh 3.12.1 + FROM build_cpython AS all_python COPY build_scripts/install-pypy.sh \ build_scripts/pypy.sha256 \ @@ -131,6 +135,7 @@ COPY --from=build_cpython38 /opt/_internal /opt/_internal/ COPY --from=build_cpython39 /opt/_internal /opt/_internal/ COPY --from=build_cpython310 /opt/_internal /opt/_internal/ COPY 
--from=build_cpython311 /opt/_internal /opt/_internal/ +COPY --from=build_cpython312 /opt/_internal /opt/_internal/ RUN manylinux-entrypoint /build_scripts/finalize-python.sh FROM runtime_base @@ -143,6 +148,7 @@ COPY build_scripts/finalize.sh \ build_scripts/requirements3.9.txt \ build_scripts/requirements3.10.txt \ build_scripts/requirements3.11.txt \ + build_scripts/requirements3.12.txt \ build_scripts/requirements-base-tools.txt \ /build_scripts/ COPY build_scripts/requirements-tools/* /build_scripts/requirements-tools/ diff --git a/tools/ci_build/github/linux/docker/Dockerfile.manylinux2_28_rocm b/tools/ci_build/github/linux/docker/Dockerfile.manylinux2_28_rocm index b9fd88083f218..dd7c669c37885 100644 --- a/tools/ci_build/github/linux/docker/Dockerfile.manylinux2_28_rocm +++ b/tools/ci_build/github/linux/docker/Dockerfile.manylinux2_28_rocm @@ -135,6 +135,10 @@ FROM build_cpython AS build_cpython311 COPY build_scripts/cpython-pubkey-310-311.txt /build_scripts/cpython-pubkeys.txt RUN manylinux-entrypoint /build_scripts/build-cpython.sh 3.11.2 +FROM build_cpython AS build_cpython312 +COPY build_scripts/cpython-pubkey-312-313.txt /build_scripts/cpython-pubkeys.txt +RUN manylinux-entrypoint /build_scripts/build-cpython.sh 3.12.1 + FROM build_cpython AS all_python COPY build_scripts/install-pypy.sh \ @@ -147,6 +151,7 @@ COPY --from=build_cpython38 /opt/_internal /opt/_internal/ COPY --from=build_cpython39 /opt/_internal /opt/_internal/ COPY --from=build_cpython310 /opt/_internal /opt/_internal/ COPY --from=build_cpython311 /opt/_internal /opt/_internal/ +COPY --from=build_cpython312 /opt/_internal /opt/_internal/ RUN manylinux-entrypoint /build_scripts/finalize-python.sh @@ -160,6 +165,7 @@ COPY build_scripts/finalize.sh \ build_scripts/requirements3.9.txt \ build_scripts/requirements3.10.txt \ build_scripts/requirements3.11.txt \ + build_scripts/requirements3.12.txt \ build_scripts/requirements-base-tools.txt \ /build_scripts/ COPY 
build_scripts/requirements-tools/* /build_scripts/requirements-tools/ diff --git a/tools/ci_build/github/linux/docker/Dockerfile.manylinux2_28_training_cuda11_8 b/tools/ci_build/github/linux/docker/Dockerfile.manylinux2_28_training_cuda11_8 index 09ab7951552a0..a6a75afb0f4c3 100644 --- a/tools/ci_build/github/linux/docker/Dockerfile.manylinux2_28_training_cuda11_8 +++ b/tools/ci_build/github/linux/docker/Dockerfile.manylinux2_28_training_cuda11_8 @@ -119,6 +119,10 @@ FROM build_cpython AS build_cpython311 COPY build_scripts/cpython-pubkey-310-311.txt /build_scripts/cpython-pubkeys.txt RUN manylinux-entrypoint /build_scripts/build-cpython.sh 3.11.2 +FROM build_cpython AS build_cpython312 +COPY build_scripts/cpython-pubkey-312-313.txt /build_scripts/cpython-pubkeys.txt +RUN manylinux-entrypoint /build_scripts/build-cpython.sh 3.12.1 + FROM build_cpython AS all_python COPY build_scripts/install-pypy.sh \ build_scripts/pypy.sha256 \ @@ -130,6 +134,7 @@ COPY --from=build_cpython38 /opt/_internal /opt/_internal/ COPY --from=build_cpython39 /opt/_internal /opt/_internal/ COPY --from=build_cpython310 /opt/_internal /opt/_internal/ COPY --from=build_cpython311 /opt/_internal /opt/_internal/ +COPY --from=build_cpython312 /opt/_internal /opt/_internal/ RUN manylinux-entrypoint /build_scripts/finalize-python.sh @@ -143,6 +148,7 @@ COPY build_scripts/finalize.sh \ build_scripts/requirements3.9.txt \ build_scripts/requirements3.10.txt \ build_scripts/requirements3.11.txt \ + build_scripts/requirements3.12.txt \ build_scripts/requirements-base-tools.txt \ /build_scripts/ COPY build_scripts/requirements-tools/* /build_scripts/requirements-tools/ diff --git a/tools/ci_build/github/linux/docker/Dockerfile.manylinux2_28_training_cuda12_2 b/tools/ci_build/github/linux/docker/Dockerfile.manylinux2_28_training_cuda12_2 index a36f60b87768d..d29157daef611 100644 --- a/tools/ci_build/github/linux/docker/Dockerfile.manylinux2_28_training_cuda12_2 +++ 
b/tools/ci_build/github/linux/docker/Dockerfile.manylinux2_28_training_cuda12_2 @@ -119,6 +119,10 @@ FROM build_cpython AS build_cpython311 COPY build_scripts/cpython-pubkey-310-311.txt /build_scripts/cpython-pubkeys.txt RUN manylinux-entrypoint /build_scripts/build-cpython.sh 3.11.2 +FROM build_cpython AS build_cpython312 +COPY build_scripts/cpython-pubkey-312-313.txt /build_scripts/cpython-pubkeys.txt +RUN manylinux-entrypoint /build_scripts/build-cpython.sh 3.12.1 + FROM build_cpython AS all_python COPY build_scripts/install-pypy.sh \ build_scripts/pypy.sha256 \ @@ -130,6 +134,7 @@ COPY --from=build_cpython38 /opt/_internal /opt/_internal/ COPY --from=build_cpython39 /opt/_internal /opt/_internal/ COPY --from=build_cpython310 /opt/_internal /opt/_internal/ COPY --from=build_cpython311 /opt/_internal /opt/_internal/ +COPY --from=build_cpython312 /opt/_internal /opt/_internal/ RUN manylinux-entrypoint /build_scripts/finalize-python.sh @@ -143,6 +148,7 @@ COPY build_scripts/finalize.sh \ build_scripts/requirements3.9.txt \ build_scripts/requirements3.10.txt \ build_scripts/requirements3.11.txt \ + build_scripts/requirements3.12.txt \ build_scripts/requirements-base-tools.txt \ /build_scripts/ COPY build_scripts/requirements-tools/* /build_scripts/requirements-tools/ diff --git a/tools/ci_build/github/linux/docker/inference/x64/python/cpu/Dockerfile.manylinux2_28_cpu b/tools/ci_build/github/linux/docker/inference/x64/python/cpu/Dockerfile.manylinux2_28_cpu index 06e75ee1a39f6..66fe0cafd945b 100644 --- a/tools/ci_build/github/linux/docker/inference/x64/python/cpu/Dockerfile.manylinux2_28_cpu +++ b/tools/ci_build/github/linux/docker/inference/x64/python/cpu/Dockerfile.manylinux2_28_cpu @@ -114,6 +114,10 @@ FROM build_cpython AS build_cpython311 COPY build_scripts/cpython-pubkey-310-311.txt /build_scripts/cpython-pubkeys.txt RUN manylinux-entrypoint /build_scripts/build-cpython.sh 3.11.2 +FROM build_cpython AS build_cpython312 +COPY 
build_scripts/cpython-pubkey-312-313.txt /build_scripts/cpython-pubkeys.txt +RUN manylinux-entrypoint /build_scripts/build-cpython.sh 3.12.1 + FROM build_cpython AS all_python COPY build_scripts/finalize-python.sh \ /build_scripts/ @@ -122,6 +126,7 @@ COPY --from=build_cpython38 /opt/_internal /opt/_internal/ COPY --from=build_cpython39 /opt/_internal /opt/_internal/ COPY --from=build_cpython310 /opt/_internal /opt/_internal/ COPY --from=build_cpython311 /opt/_internal /opt/_internal/ +COPY --from=build_cpython312 /opt/_internal /opt/_internal/ RUN manylinux-entrypoint /build_scripts/finalize-python.sh @@ -135,6 +140,7 @@ COPY build_scripts/finalize.sh \ build_scripts/requirements3.9.txt \ build_scripts/requirements3.10.txt \ build_scripts/requirements3.11.txt \ + build_scripts/requirements3.12.txt \ build_scripts/requirements-base-tools.txt \ /build_scripts/ COPY build_scripts/requirements-tools/* /build_scripts/requirements-tools/ diff --git a/tools/ci_build/github/linux/docker/inference/x64/python/cpu/scripts/install_deps.sh b/tools/ci_build/github/linux/docker/inference/x64/python/cpu/scripts/install_deps.sh index 7bf031ee78485..f576b867da73b 100755 --- a/tools/ci_build/github/linux/docker/inference/x64/python/cpu/scripts/install_deps.sh +++ b/tools/ci_build/github/linux/docker/inference/x64/python/cpu/scripts/install_deps.sh @@ -1,7 +1,7 @@ #!/bin/bash set -e -x pushd . -PYTHON_EXES=("/opt/python/cp38-cp38/bin/python3.8" "/opt/python/cp39-cp39/bin/python3.9" "/opt/python/cp310-cp310/bin/python3.10" "/opt/python/cp311-cp311/bin/python3.11") +PYTHON_EXES=("/opt/python/cp38-cp38/bin/python3.8" "/opt/python/cp39-cp39/bin/python3.9" "/opt/python/cp310-cp310/bin/python3.10" "/opt/python/cp311-cp311/bin/python3.11" "/opt/python/cp312-cp312/bin/python3.12") CURRENT_DIR=$(pwd) if ! 
[ -x "$(command -v protoc)" ]; then $CURRENT_DIR/install_protobuf.sh diff --git a/tools/ci_build/github/linux/docker/inference/x64/python/cpu/scripts/requirements.txt b/tools/ci_build/github/linux/docker/inference/x64/python/cpu/scripts/requirements.txt index aa0ad05b42dbf..7249fd2331321 100644 --- a/tools/ci_build/github/linux/docker/inference/x64/python/cpu/scripts/requirements.txt +++ b/tools/ci_build/github/linux/docker/inference/x64/python/cpu/scripts/requirements.txt @@ -1,5 +1,6 @@ numpy==1.21.6 ; python_version < '3.11' -numpy==1.24.2 ; python_version >= '3.11' +numpy==1.24.2 ; python_version == '3.11' +numpy==1.26.0 ; python_version >= '3.12' mypy pytest setuptools>=68.2.2 diff --git a/tools/ci_build/github/linux/docker/migraphx-ci-pipeline-env.Dockerfile b/tools/ci_build/github/linux/docker/migraphx-ci-pipeline-env.Dockerfile index 85d738d2167e1..6c71631368822 100644 --- a/tools/ci_build/github/linux/docker/migraphx-ci-pipeline-env.Dockerfile +++ b/tools/ci_build/github/linux/docker/migraphx-ci-pipeline-env.Dockerfile @@ -65,9 +65,6 @@ RUN wget --quiet https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86 conda update --all && \ rm ~/miniconda.sh && conda clean -ya -# Conda base patch -RUN pip install cryptography==41.0.4 - # Create migraphx-ci environment ENV CONDA_ENVIRONMENT_PATH /opt/miniconda/envs/migraphx-ci ENV CONDA_DEFAULT_ENV migraphx-ci diff --git a/tools/ci_build/github/linux/docker/scripts/install_python_deps.sh b/tools/ci_build/github/linux/docker/scripts/install_python_deps.sh index 86585b75d43fe..1ac1d226deec6 100755 --- a/tools/ci_build/github/linux/docker/scripts/install_python_deps.sh +++ b/tools/ci_build/github/linux/docker/scripts/install_python_deps.sh @@ -46,6 +46,8 @@ elif [[ "$PYTHON_VER" = "3.10" && -d "/opt/python/cp310-cp310" ]]; then PYTHON_EXE="/opt/python/cp310-cp310/bin/python3.10" elif [[ "$PYTHON_VER" = "3.11" && -d "/opt/python/cp311-cp311" ]]; then PYTHON_EXE="/opt/python/cp311-cp311/bin/python3.11" +elif [[ 
"$PYTHON_VER" = "3.12" && -d "/opt/python/cp312-cp312" ]]; then + PYTHON_EXE="/opt/python/cp312-cp312/bin/python3.12" else PYTHON_EXE="/usr/bin/python${PYTHON_VER}" fi diff --git a/tools/ci_build/github/linux/docker/scripts/manylinux/install_deps.sh b/tools/ci_build/github/linux/docker/scripts/manylinux/install_deps.sh index 8c79918120d8d..5b181a484a607 100755 --- a/tools/ci_build/github/linux/docker/scripts/manylinux/install_deps.sh +++ b/tools/ci_build/github/linux/docker/scripts/manylinux/install_deps.sh @@ -19,7 +19,7 @@ PARENT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." &> /dev/null && pwd)" source "$PARENT_DIR/install_dotnet.sh" if [ ! -d "/opt/conda/bin" ]; then - PYTHON_EXES=("/opt/python/cp38-cp38/bin/python3.8" "/opt/python/cp39-cp39/bin/python3.9" "/opt/python/cp310-cp310/bin/python3.10" "/opt/python/cp311-cp311/bin/python3.11") + PYTHON_EXES=("/opt/python/cp38-cp38/bin/python3.8" "/opt/python/cp39-cp39/bin/python3.9" "/opt/python/cp310-cp310/bin/python3.10" "/opt/python/cp311-cp311/bin/python3.11" "/opt/python/cp312-cp312/bin/python3.12") else PYTHON_EXES=("/opt/conda/bin/python") fi diff --git a/tools/ci_build/github/linux/docker/scripts/manylinux/install_deps_eager.sh b/tools/ci_build/github/linux/docker/scripts/manylinux/install_deps_eager.sh index ad3366b0bb3b6..d8d2fbc06a00b 100755 --- a/tools/ci_build/github/linux/docker/scripts/manylinux/install_deps_eager.sh +++ b/tools/ci_build/github/linux/docker/scripts/manylinux/install_deps_eager.sh @@ -6,7 +6,7 @@ yum -y install \ graphviz if [ ! 
-d "/opt/conda/bin" ]; then - PYTHON_EXES=("/opt/python/cp38-cp38/bin/python3.8" "/opt/python/cp39-cp39/bin/python3.9" "/opt/python/cp310-cp310/bin/python3.10" "/opt/python/cp311-cp311/bin/python3.11") + PYTHON_EXES=("/opt/python/cp38-cp38/bin/python3.8" "/opt/python/cp39-cp39/bin/python3.9" "/opt/python/cp310-cp310/bin/python3.10" "/opt/python/cp311-cp311/bin/python3.11" "/opt/python/cp312-cp312/bin/python3.12") else PYTHON_EXES=("/opt/conda/bin/python") fi diff --git a/tools/ci_build/github/linux/docker/scripts/manylinux/requirements.txt b/tools/ci_build/github/linux/docker/scripts/manylinux/requirements.txt index d6912bfb05efe..94f52f476579b 100644 --- a/tools/ci_build/github/linux/docker/scripts/manylinux/requirements.txt +++ b/tools/ci_build/github/linux/docker/scripts/manylinux/requirements.txt @@ -1,5 +1,6 @@ numpy==1.21.6 ; python_version < '3.11' -numpy==1.24.2 ; python_version >= '3.11' +numpy==1.24.2 ; python_version == '3.11' +numpy==1.26.0 ; python_version >= '3.12' mypy pytest setuptools>=68.2.2 diff --git a/tools/ci_build/github/linux/docker/scripts/requirements.txt b/tools/ci_build/github/linux/docker/scripts/requirements.txt index 0fc80b30c1b3a..58a342277fc2d 100644 --- a/tools/ci_build/github/linux/docker/scripts/requirements.txt +++ b/tools/ci_build/github/linux/docker/scripts/requirements.txt @@ -1,6 +1,7 @@ cerberus numpy==1.21.6 ; python_version < '3.11' -numpy==1.24.2 ; python_version >= '3.11' +numpy==1.24.2 ; python_version == '3.11' +numpy==1.26.0 ; python_version >= '3.12' mypy pytest setuptools==69.0.3 diff --git a/tools/ci_build/github/linux/docker/scripts/training/ortmodule/stage1/requirements_rocm/requirements.txt b/tools/ci_build/github/linux/docker/scripts/training/ortmodule/stage1/requirements_rocm/requirements.txt index 9c52aff960d6e..57331d6df97d9 100644 --- a/tools/ci_build/github/linux/docker/scripts/training/ortmodule/stage1/requirements_rocm/requirements.txt +++ 
b/tools/ci_build/github/linux/docker/scripts/training/ortmodule/stage1/requirements_rocm/requirements.txt @@ -1,2 +1,3 @@ numpy==1.21.6 ; python_version < '3.11' -numpy==1.24.2 ; python_version >= '3.11' \ No newline at end of file +numpy==1.24.2 ; python_version == '3.11' +numpy==1.26.0 ; python_version >= '3.12' \ No newline at end of file diff --git a/tools/ci_build/github/linux/docker/scripts/training/ortmodule/stage1/requirements_torch_nightly/requirements.txt b/tools/ci_build/github/linux/docker/scripts/training/ortmodule/stage1/requirements_torch_nightly/requirements.txt index 0cd5e5c5d5c46..01fa7b0ff956e 100644 --- a/tools/ci_build/github/linux/docker/scripts/training/ortmodule/stage1/requirements_torch_nightly/requirements.txt +++ b/tools/ci_build/github/linux/docker/scripts/training/ortmodule/stage1/requirements_torch_nightly/requirements.txt @@ -1,5 +1,5 @@ scikit-learn packaging==21.3 -transformers==v4.30.0 -accelerate==0.20.1 +transformers==v4.36.0 +accelerate==0.25.0 wget diff --git a/tools/ci_build/github/linux/docker/scripts/training/ortmodule/stage2/requirements.txt b/tools/ci_build/github/linux/docker/scripts/training/ortmodule/stage2/requirements.txt index b4b265f65b69f..47f64568f424a 100644 --- a/tools/ci_build/github/linux/docker/scripts/training/ortmodule/stage2/requirements.txt +++ b/tools/ci_build/github/linux/docker/scripts/training/ortmodule/stage2/requirements.txt @@ -1,9 +1,10 @@ pandas scikit-learn numpy==1.21.6 ; python_version < '3.11' -numpy==1.24.2 ; python_version >= '3.11' -transformers==v4.30.0 -accelerate +numpy==1.24.2 ; python_version == '3.11' +numpy==1.26.0 ; python_version >= '3.12' +transformers==v4.36.0 +accelerate==0.25.0 rsa==4.9 tensorboard==2.13.0 h5py diff --git a/tools/ci_build/github/pai/rocm-ci-pipeline-env.Dockerfile b/tools/ci_build/github/pai/rocm-ci-pipeline-env.Dockerfile index 29048b79d4b81..4db9df80ed187 100644 --- a/tools/ci_build/github/pai/rocm-ci-pipeline-env.Dockerfile +++ 
b/tools/ci_build/github/pai/rocm-ci-pipeline-env.Dockerfile @@ -67,9 +67,6 @@ ENV CONDA_DEFAULT_ENV rocm-ci RUN conda create -y -n ${CONDA_DEFAULT_ENV} python=3.9 ENV PATH ${CONDA_ENVIRONMENT_PATH}/bin:${PATH} -# Conda base patch -RUN pip install cryptography==41.0.4 - # Enable rocm-ci environment SHELL ["conda", "run", "-n", "rocm-ci", "/bin/bash", "-c"] diff --git a/tools/ci_build/requirements.txt b/tools/ci_build/requirements.txt index aaca45b3e17e1..57fc8f08336d2 100644 --- a/tools/ci_build/requirements.txt +++ b/tools/ci_build/requirements.txt @@ -1,7 +1,8 @@ # packages used by transformers python unittest (only enabled in Linux CPU CI Pipeline) packaging protobuf==3.20.2 -numpy==1.24.0 +numpy==1.24.0 ; python_version < '3.12' +numpy==1.26.0 ; python_version >= '3.12' coloredlogs==15.0 transformers==4.36.0 psutil