diff --git a/.config/1espt/PipelineAutobaseliningConfig.yml b/.config/1espt/PipelineAutobaseliningConfig.yml
new file mode 100644
index 0000000000000..183d52d5c1d44
--- /dev/null
+++ b/.config/1espt/PipelineAutobaseliningConfig.yml
@@ -0,0 +1,77 @@
+## DO NOT MODIFY THIS FILE MANUALLY. This is part of auto-baselining from 1ES Pipeline Templates. Go to [https://aka.ms/1espt-autobaselining] for more details.
+
+pipelines:
+  1624:
+    retail:
+      source:
+        credscan:
+          lastModifiedDate: 2024-10-25
+        policheck:
+          lastModifiedDate: 2024-10-25
+        eslint:
+          lastModifiedDate: 2024-10-25
+        psscriptanalyzer:
+          lastModifiedDate: 2024-10-25
+        armory:
+          lastModifiedDate: 2024-10-25
+    usedNonDefaultBranch: true
+  1299:
+    retail:
+      source:
+        credscan:
+          lastModifiedDate: 2024-10-25
+        eslint:
+          lastModifiedDate: 2024-10-25
+        psscriptanalyzer:
+          lastModifiedDate: 2024-10-25
+        armory:
+          lastModifiedDate: 2024-10-25
+        policheck:
+          lastModifiedDate: 2024-10-29
+      binary:
+        credscan:
+          lastModifiedDate: 2024-10-25
+        binskim:
+          lastModifiedDate: 2024-10-25
+        spotbugs:
+          lastModifiedDate: 2024-10-25
+  1625:
+    retail:
+      source:
+        credscan:
+          lastModifiedDate: 2024-11-05
+        policheck:
+          lastModifiedDate: 2024-11-05
+        eslint:
+          lastModifiedDate: 2024-11-05
+        psscriptanalyzer:
+          lastModifiedDate: 2024-11-05
+        armory:
+          lastModifiedDate: 2024-11-05
+      binary:
+        credscan:
+          lastModifiedDate: 2024-11-13
+        binskim:
+          lastModifiedDate: 2024-11-13
+        spotbugs:
+          lastModifiedDate: 2024-11-13
+  1626:
+    retail:
+      source:
+        credscan:
+          lastModifiedDate: 2024-11-13
+        policheck:
+          lastModifiedDate: 2024-11-13
+        eslint:
+          lastModifiedDate: 2024-11-13
+        psscriptanalyzer:
+          lastModifiedDate: 2024-11-13
+        armory:
+          lastModifiedDate: 2024-11-13
+      binary:
+        credscan:
+          lastModifiedDate: 2024-11-13
+        binskim:
+          lastModifiedDate: 2024-11-13
+        spotbugs:
+          lastModifiedDate: 2024-11-13
diff --git a/.config/guardian/.gdnbaselines b/.config/guardian/.gdnbaselines
new file mode 100644
index 0000000000000..a7ee2a4b69dda
--- /dev/null
+++ b/.config/guardian/.gdnbaselines
@@ -0,0 +1,43 @@
+{
+  "properties": {
+    "helpUri": "https://eng.ms/docs/microsoft-security/security/azure-security/cloudai-security-fundamentals-engineering/security-integration/guardian-wiki/microsoft-guardian/general/baselines"
+  },
+  "version": "1.0.0",
+  "baselines": {
+    "default": {
+      "name": "default",
+      "createdDate": "2024-11-13 00:40:35Z",
+      "lastUpdatedDate": "2024-11-13 00:40:35Z"
+    }
+  },
+  "results": {
+    "48f03e2797fc40ecea50f878a0268947c7e13db1b2fa51aa3981246844fc4c68": {
+      "signature": "48f03e2797fc40ecea50f878a0268947c7e13db1b2fa51aa3981246844fc4c68",
+      "alternativeSignatures": [],
+      "target": "ScanTelemetry_20241113003616898.json",
+      "line": 1,
+      "memberOf": [
+        "default"
+      ],
+      "tool": "credscan",
+      "ruleId": "CSCAN-AZURE0130",
+      "createdDate": "2024-11-13 00:40:35Z",
+      "expirationDate": "2025-05-02 01:29:47Z",
+      "justification": "This error is baselined with an expiration date of 180 days from 2024-11-13 01:29:47Z"
+    },
+    "9cb6eddb3f3e886ad06cae65f5886412ff0c5fb0b96d4e943e4efa237be617b1": {
+      "signature": "9cb6eddb3f3e886ad06cae65f5886412ff0c5fb0b96d4e943e4efa237be617b1",
+      "alternativeSignatures": [],
+      "target": "ScanTelemetry_20241113111547065.json",
+      "line": 1,
+      "memberOf": [
+        "default"
+      ],
+      "tool": "credscan",
+      "ruleId": "CSCAN-AZURE0130",
+      "createdDate": "2024-11-13 11:20:17Z",
+      "expirationDate": "2025-05-02 11:55:15Z",
+      "justification": "This error is baselined with an expiration date of 180 days from 2024-11-13 11:55:15Z"
+    }
+  }
+}
\ No newline at end of file
diff --git a/.github/codeql/codeql-config.yml b/.github/codeql/codeql-config.yml
new file mode 100644
index 0000000000000..6a76f7bcdbcb0
--- /dev/null
+++ b/.github/codeql/codeql-config.yml
@@ -0,0 +1,7 @@
+name: "CodeQL config"
+queries:
+ - uses: security-extended
+ - uses: security-and-quality
+paths-ignore:
+ - tests
+ - build
\ No newline at end of file
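
This config file is consumed by `github/codeql-action/init` through its `config-file` input: the two query suites broaden the default query pack, and `paths-ignore` drops alerts under `tests` and `build`. The new `linux_training.yml` workflow later in this diff wires it up like this (excerpted from that workflow, not new code):

```yaml
- name: Initialize CodeQL
  uses: github/codeql-action/init@v3
  with:
    config-file: ./.github/codeql/codeql-config.yml
    languages: 'cpp'
```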
diff --git a/.github/workflows/cffconvert.yml b/.github/workflows/cffconvert.yml
index 7144363717749..0cbaf24059390 100644
--- a/.github/workflows/cffconvert.yml
+++ b/.github/workflows/cffconvert.yml
@@ -8,7 +8,7 @@ on:
jobs:
validate:
name: "validate"
- runs-on: ubuntu-latest
+ runs-on: ["self-hosted", "1ES.Pool=onnxruntime-github-Ubuntu2204-AMD-CPU"]
steps:
- name: Check out a copy of the repository
uses: actions/checkout@v4
diff --git a/.github/workflows/codeql.yml b/.github/workflows/codeql.yml
index e4d1b91bab736..d1dc717c2a9c9 100644
--- a/.github/workflows/codeql.yml
+++ b/.github/workflows/codeql.yml
@@ -15,10 +15,14 @@ on:
schedule:
- cron: '41 13 * * 0'
+concurrency:
+ group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
+ cancel-in-progress: true
+
jobs:
analyze:
name: Analyze
- runs-on: ubuntu-latest
+ runs-on: ["self-hosted", "1ES.Pool=onnxruntime-github-Ubuntu2204-AMD-CPU"]
permissions:
actions: read
contents: read
@@ -55,6 +59,11 @@ jobs:
java-version: '11'
distribution: 'microsoft'
+ - if: ${{ matrix.language == 'javascript' }}
+ uses: actions/setup-node@v4
+ with:
+ node-version: 20
+
# Autobuild attempts to build any compiled languages (C/C++, C#, or Java).
# If this step fails, then you should remove it and run the build manually (see below)
- if: ${{ matrix.language != 'cpp' }}
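
The concurrency block added to this workflow is the usual GitHub Actions de-duplication idiom: on pull requests `github.head_ref` is set, so runs for the same PR branch share a group and the superseded run is cancelled; on push and schedule events `head_ref` is empty and `github.run_id` makes every group unique, so nothing is cancelled. A minimal self-contained sketch of the same pattern (workflow name and job are illustrative):

```yaml
name: concurrency-demo
on: [push, pull_request]

concurrency:
  # PR events: head_ref (the PR source branch) groups runs, so an in-flight
  # run is cancelled when a newer commit arrives on the same branch.
  # push/schedule events: head_ref is empty, so run_id gives each run its
  # own group and cancel-in-progress never fires.
  group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
  cancel-in-progress: true

jobs:
  demo:
    runs-on: ubuntu-latest
    steps:
      - run: echo "only the newest run per PR branch survives"
```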
diff --git a/.github/workflows/gradle-wrapper-validation.yml b/.github/workflows/gradle-wrapper-validation.yml
index 32aed81092774..cf3bc598d02bb 100644
--- a/.github/workflows/gradle-wrapper-validation.yml
+++ b/.github/workflows/gradle-wrapper-validation.yml
@@ -8,7 +8,7 @@ on: [push, pull_request]
jobs:
validation:
name: "Validation"
- runs-on: ubuntu-latest
+ runs-on: ["self-hosted", "1ES.Pool=onnxruntime-github-Ubuntu2204-AMD-CPU"]
steps:
- uses: actions/checkout@v4
- uses: gradle/actions/wrapper-validation@v4
diff --git a/.github/workflows/labeler.yml b/.github/workflows/labeler.yml
index a196226a4b836..00960c848b107 100644
--- a/.github/workflows/labeler.yml
+++ b/.github/workflows/labeler.yml
@@ -8,7 +8,7 @@ permissions:
jobs:
triage:
- runs-on: ubuntu-latest
+ runs-on: ["self-hosted", "1ES.Pool=onnxruntime-github-Ubuntu2204-AMD-CPU"]
steps:
- uses: github/issue-labeler@v3.4
with:
diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml
index 2edbe2d814533..8d966d358de01 100644
--- a/.github/workflows/lint.yml
+++ b/.github/workflows/lint.yml
@@ -7,6 +7,10 @@ on:
- rel-*
pull_request:
+concurrency:
+ group: ${{ github.workflow }}-${{ github.ref }}-${{ github.event_name == 'workflow_dispatch' }}
+ cancel-in-progress: true
+
jobs:
optional-lint:
name: Optional Lint
@@ -32,23 +36,29 @@ jobs:
lint-python-format:
# Required workflow
name: Python format
- runs-on: ubuntu-latest
+ runs-on: ["self-hosted", "1ES.Pool=onnxruntime-github-Ubuntu2204-AMD-CPU"]
+ permissions:
+ contents: read
+ security-events: write
steps:
- uses: actions/checkout@v4
- name: Setup Python
uses: actions/setup-python@v5
with:
- # Version range or exact version of Python to use, using SemVer's version range syntax. Reads from .python-version if unset.
+ # Use the version configured in target-version of [tool.black] section in pyproject.toml.
python-version: "3.10"
- name: Setup Rust
uses: actions-rs/toolchain@v1
with:
toolchain: stable
components: rustfmt
+ - name: Update PATH
+ run: |
+ echo "$HOME/.local/bin" >> "$GITHUB_PATH"
- name: Install dependencies
run: |
- python -m pip install -r requirements-dev.txt
- python -m pip install lintrunner lintrunner-adapters
+ set -e -x
+ python -m pip install --user -r requirements-dev.txt
lintrunner init
- name: Run lintrunner on all files
run: |
@@ -77,8 +87,12 @@ jobs:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@master
+ - name: Update PATH
+ run: |
+ echo "$HOME/.local/bin" >> "$GITHUB_PATH"
+
- name: Install ninja
- run: python -m pip install --upgrade ninja
+ run: python -m pip install --user --upgrade ninja
- name: Generate compile_commands.json
run: |
python tools/ci_build/build.py \
@@ -110,9 +124,12 @@ jobs:
lint-js:
name: Lint JavaScript
- runs-on: ubuntu-latest
+ runs-on: ["self-hosted", "1ES.Pool=onnxruntime-github-Ubuntu2204-AMD-CPU"]
steps:
- uses: actions/checkout@v4
+ - uses: actions/setup-node@v4
+ with:
+ node-version: 20
- uses: reviewdog/action-eslint@v1
with:
reporter: github-pr-check
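
A pattern that recurs throughout this diff: on the shared 1ES self-hosted pools, Python tools are installed with `pip install --user` so they stay out of the system site-packages, and `~/.local/bin` is appended to `GITHUB_PATH` so the user-level console scripts resolve in later steps. The idiom in isolation (job name illustrative):

```yaml
jobs:
  user-site-install:
    runs-on: ["self-hosted", "1ES.Pool=onnxruntime-github-Ubuntu2204-AMD-CPU"]
    steps:
      - uses: actions/checkout@v4
      - name: Update PATH
        # Lines appended to $GITHUB_PATH are prepended to PATH for every
        # subsequent step of the job (not the step that writes them).
        run: echo "$HOME/.local/bin" >> "$GITHUB_PATH"
      - name: Install into the user site, not system site-packages
        run: python -m pip install --user lintrunner lintrunner-adapters
      - name: Entry points are now resolvable
        run: lintrunner --version
```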
diff --git a/.github/workflows/linux_training.yml b/.github/workflows/linux_training.yml
new file mode 100644
index 0000000000000..d382cdf476283
--- /dev/null
+++ b/.github/workflows/linux_training.yml
@@ -0,0 +1,55 @@
+name: orttraining-linux-ci-pipeline
+on:
+  push:
+    branches:
+      - main
+      - rel-*
+  pull_request:
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.ref }}
+  cancel-in-progress: true
+
+jobs:
+  orttraining-linux-ci-pipeline:
+    runs-on: ["self-hosted", "1ES.Pool=onnxruntime-github-Ubuntu2204-AMD-CPU"]
+    permissions:
+      actions: read
+      contents: read
+      security-events: write
+    steps:
+      - uses: actions/checkout@v4
+      - run: |
+          python3 -m pip install --user -r tools/ci_build/github/linux/python/requirements.txt
+      - name: Initialize CodeQL
+        uses: github/codeql-action/init@v3
+        with:
+          config-file: ./.github/codeql/codeql-config.yml
+          languages: 'cpp'
+      - run: |
+          set -e -x
+          rm -rf build
+          python3 tools/ci_build/build.py --build_dir build --config Release --enable_training --skip_submodule_sync --parallel --update --build
+
+      - name: Perform CodeQL Analysis
+        uses: github/codeql-action/analyze@v3
+        with:
+          category: "/language:cpp"
+          output: sarif-results
+          upload: failure-only
+
+      - name: filter-sarif
+        uses: advanced-security/filter-sarif@v1
+        with:
+          patterns: |
+            +**/*.cc
+            +**/*.h
+            -tests/**/*.*
+            -build/**/*.*
+          input: sarif-results/cpp.sarif
+          output: sarif-results/cpp.sarif
+
+      - name: Upload SARIF
+        uses: github/codeql-action/upload-sarif@v3
+        with:
+          sarif_file: sarif-results/cpp.sarif
\ No newline at end of file
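
The analyze/filter/upload split in this workflow is the standard recipe for trimming CodeQL results before they reach code scanning: `upload: failure-only` stops the analyze step from uploading on success, `filter-sarif` rewrites the log with include (`+`) and exclude (`-`) globs, and the explicit upload step sends the filtered file. An annotated copy of the filter step (the comments are editorial, not part of the diff):

```yaml
- name: filter-sarif
  uses: advanced-security/filter-sarif@v1
  with:
    # "+" globs keep matching alerts (C++ sources and headers);
    # "-" globs drop them (tests/ and generated build/ output).
    patterns: |
      +**/*.cc
      +**/*.h
      -tests/**/*.*
      -build/**/*.*
    input: sarif-results/cpp.sarif
    # Same path as input: the log is filtered in place before the upload step.
    output: sarif-results/cpp.sarif
```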
diff --git a/.github/workflows/mac.yml b/.github/workflows/mac.yml
index d1a4366da45e2..b36b0aa555940 100644
--- a/.github/workflows/mac.yml
+++ b/.github/workflows/mac.yml
@@ -20,7 +20,7 @@ env:
jobs:
ARM64-Xcode16:
- runs-on: macos-14
+ runs-on: macos-15
env:
xcode_version: 16
@@ -60,12 +60,16 @@ jobs:
--use_xnnpack \
--use_binskim_compliant_compile_flags
- ARM64-Xcode16-targeting-iphonesimulator-x86_64:
- runs-on: macos-14
+ ARM64-Xcode16-targeting-iphonesimulator:
+ runs-on: macos-15
env:
xcode_version: 16
+ strategy:
+ matrix:
+ target_arch: [x86_64, arm64]
+
timeout-minutes: 60
steps:
@@ -87,16 +91,14 @@ jobs:
- uses: actions/checkout@v4
- # Note: Setting onnxruntime_BUILD_UNIT_TESTS=OFF as a workaround for
- # https://github.com/microsoft/onnxruntime/issues/22245.
- - name: Build
+ - name: Build for iphonesimulator ${{ matrix.target_arch }}
shell: bash
run: |
python ./tools/ci_build/build.py \
--build_dir ./build \
--update \
--build --parallel \
- --skip_tests \
+ --test \
--build_apple_framework \
--use_xcode \
--use_coreml \
@@ -105,8 +107,7 @@ jobs:
--ios \
--apple_deploy_target=13.0 \
--apple_sysroot=iphonesimulator \
- --osx_arch=x86_64 \
- --cmake_extra_defines=onnxruntime_BUILD_UNIT_TESTS=OFF
+ --osx_arch=${{ matrix.target_arch }}
Vcpkg:
runs-on: macos-13
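
The simulator job rework above is a straight single-job-to-matrix conversion: the `target_arch` matrix fans one job definition out to an x86_64 and an arm64 build, and dropping `--skip_tests` plus the `onnxruntime_BUILD_UNIT_TESTS=OFF` workaround in favour of `--test` re-enables the unit tests the old job disabled. A minimal sketch of the fan-out (job body illustrative):

```yaml
jobs:
  sim-build:
    runs-on: macos-15
    strategy:
      matrix:
        target_arch: [x86_64, arm64]   # one job instance per listed arch
    steps:
      - name: Build for iphonesimulator ${{ matrix.target_arch }}
        run: echo "would pass --osx_arch=${{ matrix.target_arch }} to build.py"
```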
diff --git a/.github/workflows/pr_checks.yml b/.github/workflows/pr_checks.yml
new file mode 100644
index 0000000000000..af890d88995be
--- /dev/null
+++ b/.github/workflows/pr_checks.yml
@@ -0,0 +1,52 @@
+# Copyright (c) ONNX Project Contributors
+#
+# SPDX-License-Identifier: Apache-2.0
+
+name: PR Checks
+
+on:
+  pull_request:
+    branches:
+      - main
+
+permissions: # set top-level default permissions as security best practice
+  contents: read
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.ref }}-${{ github.event_name == 'workflow_dispatch' }}
+  cancel-in-progress: true
+
+jobs:
+  auto-apply-fixes:
+    name: Suggest fixes
+    runs-on: ["self-hosted", "1ES.Pool=onnxruntime-github-Ubuntu2204-AMD-CPU"]
+    permissions:
+      contents: read
+      pull-requests: write
+    steps:
+      - uses: actions/checkout@v4
+      - name: Setup Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: "3.10"
+      - name: Setup Rust
+        uses: actions-rs/toolchain@v1
+        with:
+          toolchain: stable
+          components: rustfmt
+
+      - name: Update PATH
+        run: |
+          echo "$HOME/.local/bin" >> "$GITHUB_PATH"
+
+      - name: Install dependencies and run lintrunner on all files
+        run: |
+          python -m pip install --user -r requirements-dev.txt
+          python -m pip install --user lintrunner lintrunner-adapters
+          lintrunner init
+          set +e
+          lintrunner f --all-files -v
+          exit 0
+      - uses: parkerbxyz/suggest-changes@v2
+        with:
+          comment: 'You can commit the suggested changes from lintrunner.'
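
The `set +e` / `exit 0` pair in the lint step is deliberate: `lintrunner f` exits non-zero when it rewrites files, but here a dirty working tree is the desired outcome, and `parkerbxyz/suggest-changes` turns those uncommitted edits into inline review suggestions on the PR. The "format but never fail" idiom in isolation:

```yaml
- name: Format, keep changes in the working tree, never fail
  run: |
    set +e                       # do not abort the script on non-zero exits
    lintrunner f --all-files -v  # exits non-zero when it modifies files
    exit 0                       # report success; the next step reads the diff
```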
diff --git a/.github/workflows/publish-c-apidocs.yml b/.github/workflows/publish-c-apidocs.yml
index 6c4dc43847d0b..6d3e593d8694e 100644
--- a/.github/workflows/publish-c-apidocs.yml
+++ b/.github/workflows/publish-c-apidocs.yml
@@ -9,7 +9,7 @@ on:
- include/onnxruntime/core/session/**
- orttraining/orttraining/training_api/include/**
schedule:
- - cron: '0 0 1 * *'
+ - cron: '0 0 1,15 * *'
workflow_dispatch:
concurrency:
@@ -22,7 +22,7 @@ permissions:
jobs:
build:
name: Generate C/C++ API docs
- runs-on: ubuntu-latest
+ runs-on: ["self-hosted", "1ES.Pool=onnxruntime-github-Ubuntu2204-AMD-CPU"]
steps:
- uses: actions/checkout@v4
- name: Install doxygen and dependencies
diff --git a/.github/workflows/publish-csharp-apidocs.yml b/.github/workflows/publish-csharp-apidocs.yml
index 862a7a70e33a2..7cca0969a168b 100644
--- a/.github/workflows/publish-csharp-apidocs.yml
+++ b/.github/workflows/publish-csharp-apidocs.yml
@@ -8,7 +8,7 @@ on:
paths:
- csharp/**
schedule:
- - cron: '0 0 1 * *'
+ - cron: '0 0 1,15 * *'
workflow_dispatch:
concurrency:
@@ -20,18 +20,17 @@ permissions:
jobs:
build:
- runs-on: ubuntu-latest
+ runs-on: ["self-hosted", "1ES.Pool=onnxruntime-github-vs2022-mms"]
env:
DOCFXVERSION: 2.62.2
steps:
- uses: actions/checkout@v4
- - name: Setup .NET
- uses: actions/setup-dotnet@v4
- with:
- dotnet-version: 8.0.x
- name: Install DocFX
run: |
dotnet tool update -g docfx
+ - name: Update PATH
+ run: |
+ Add-Content -Value "$env:USERPROFILE\.dotnet\tools" -Encoding utf8 -Path $env:GITHUB_PATH
# NOTE: We need to restore Microsoft.ML.OnnxRuntime.csproj manually to set IncludeMobileTargets=false
# docfx doesn't seem to be able to do that properly resulting in build errors
- name: Restore dependencies
@@ -50,10 +49,12 @@ jobs:
- name: Log source commit
run: git rev-parse --short HEAD > csharp/ApiDocs/csharp/source-version.txt
- name: Move C# docs into site
+ shell: pwsh
run: |
- mkdir -p _site/docs/api
- rm -rf _site/docs/api/csharp
- mv csharp/ApiDocs/csharp _site/docs/api/csharp
+ New-Item -Path _site/docs/api -Force -ItemType "Directory" | Out-Null
+ $OutputDirectory="_site/docs/api/csharp"
+ if (Test-Path $OutputDirectory) { Remove-Item -Recurse -Force $OutputDirectory }
+ Move-Item -Path csharp\ApiDocs\csharp -Destination $OutputDirectory
- name: Upload docs artifact
uses: actions/upload-artifact@v4
with:
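
The removed `setup-dotnet` step is redundant on the `vs2022-mms` pool, but `dotnet tool update -g` installs DocFX into the per-user tools directory, which is not on PATH by default; the PowerShell `Add-Content` step is the Windows counterpart of the `echo >> "$GITHUB_PATH"` pattern the Linux jobs use. In isolation:

```yaml
- name: Install DocFX
  run: |
    dotnet tool update -g docfx
- name: Update PATH
  # Windows equivalent of `echo "$HOME/.local/bin" >> "$GITHUB_PATH"`:
  # global dotnet tools land in %USERPROFILE%\.dotnet\tools.
  run: |
    Add-Content -Value "$env:USERPROFILE\.dotnet\tools" -Encoding utf8 -Path $env:GITHUB_PATH
```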
diff --git a/.github/workflows/publish-gh-pages.yml b/.github/workflows/publish-gh-pages.yml
index 1818261b4b766..11745ce24f9e5 100644
--- a/.github/workflows/publish-gh-pages.yml
+++ b/.github/workflows/publish-gh-pages.yml
@@ -8,7 +8,7 @@ on:
jobs:
placeholder:
- runs-on: ubuntu-latest
+ runs-on: ["self-hosted", "1ES.Pool=onnxruntime-github-Ubuntu2204-AMD-CPU"]
steps:
- name: Placeholder step to have workflow included in the GitHub web UI
run: |
diff --git a/.github/workflows/publish-java-apidocs.yml b/.github/workflows/publish-java-apidocs.yml
index 9e42dca708a17..d04669a13aab7 100644
--- a/.github/workflows/publish-java-apidocs.yml
+++ b/.github/workflows/publish-java-apidocs.yml
@@ -8,7 +8,7 @@ on:
paths:
- java/**
schedule:
- - cron: '0 0 1 * *'
+ - cron: '0 0 1,15 * *'
workflow_dispatch:
concurrency:
@@ -21,7 +21,7 @@ permissions:
jobs:
build:
name: Generate Java docs
- runs-on: ubuntu-latest
+ runs-on: ["self-hosted", "1ES.Pool=onnxruntime-github-Ubuntu2204-AMD-CPU"]
steps:
- uses: actions/checkout@v4
- name: Set up JDK 11
diff --git a/.github/workflows/publish-js-apidocs.yml b/.github/workflows/publish-js-apidocs.yml
index cec4a52d39c93..a6749b42adc35 100644
--- a/.github/workflows/publish-js-apidocs.yml
+++ b/.github/workflows/publish-js-apidocs.yml
@@ -8,7 +8,7 @@ on:
paths:
- js/common/**
schedule:
- - cron: '0 0 1 * *'
+ - cron: '0 0 1,15 * *'
workflow_dispatch:
concurrency:
@@ -21,7 +21,7 @@ permissions:
jobs:
build:
name: Generate JS API docs
- runs-on: ubuntu-latest
+ runs-on: ["self-hosted", "1ES.Pool=onnxruntime-github-Ubuntu2204-AMD-CPU"]
steps:
- uses: actions/checkout@v4
- name: Setup Node.js
diff --git a/.github/workflows/publish-objectivec-apidocs.yml b/.github/workflows/publish-objectivec-apidocs.yml
index a8b81c8d5cf84..deef64f73f15a 100644
--- a/.github/workflows/publish-objectivec-apidocs.yml
+++ b/.github/workflows/publish-objectivec-apidocs.yml
@@ -8,7 +8,7 @@ on:
paths:
- objectivec/**
schedule:
- - cron: '0 0 1 * *'
+ - cron: '0 0 1,15 * *'
workflow_dispatch:
concurrency:
diff --git a/.github/workflows/publish-python-apidocs.yml b/.github/workflows/publish-python-apidocs.yml
index 8b2f72d80bacf..adc2346d1bf1b 100644
--- a/.github/workflows/publish-python-apidocs.yml
+++ b/.github/workflows/publish-python-apidocs.yml
@@ -9,7 +9,7 @@ on:
- onnxruntime/python/**
- docs/python/**
schedule:
- - cron: '0 0 1 * *'
+ - cron: '0 0 1,15 * *'
workflow_dispatch:
concurrency:
@@ -22,7 +22,7 @@ permissions:
jobs:
build:
name: Generate Python API docs
- runs-on: ubuntu-latest
+ runs-on: ["self-hosted", "1ES.Pool=onnxruntime-github-Ubuntu2204-AMD-CPU"]
steps:
- uses: actions/checkout@v4
- name: Install tools
@@ -32,10 +32,10 @@ jobs:
sudo apt-get install graphviz
- name: Install dependencies
run: |
- python3 -m pip install --upgrade pip
+ python3 -m pip install --user --upgrade pip
cd docs/python
- python3 -m pip install -r requirements.txt
- python3 -m pip install --pre onnxruntime-training -f https://download.onnxruntime.ai/onnxruntime_nightly_cpu.html
+ python3 -m pip install --user -r requirements.txt
+ python3 -m pip install --user --pre onnxruntime-training -f https://download.onnxruntime.ai/onnxruntime_nightly_cpu.html
python3 -m pip list
- name: Generate Python docs with Sphinx
run: |
diff --git a/.github/workflows/sca.yml b/.github/workflows/sca.yml
index 0867d4c343e91..51166293f06ac 100644
--- a/.github/workflows/sca.yml
+++ b/.github/workflows/sca.yml
@@ -30,7 +30,7 @@ jobs:
- uses: actions/setup-node@v4
with:
- node-version: 18
+ node-version: 20
- name: Download cuda
run: azcopy.exe cp --recursive "https://lotusscus.blob.core.windows.net/models/cuda_sdk/v11.8" cuda_sdk
@@ -57,6 +57,45 @@ jobs:
sarif_file: ${{ github.workspace }}\output\MergeResult.sarif
category: VS_SCA
+  # With WebGPU, Without python
+  Onnxruntime-SCA-win32-WebGPU-x64:
+    permissions:
+      security-events: write
+    runs-on: ["self-hosted", "1ES.Pool=onnxruntime-github-vs2022-mms"]
+    steps:
+      - uses: actions/checkout@v4
+        with:
+          submodules: false
+      - uses: actions/setup-python@v5
+        with:
+          python-version: '3.11.x'
+          architecture: 'x64'
+
+      - uses: actions/setup-node@v4
+        with:
+          node-version: 20
+
+      - name: Delete build folder
+        run: |
+          if (Test-Path D:\b) { Remove-Item -Recurse -Force D:\b }
+
+
+      - name: Build code
+        env:
+          CAExcludePath: 'C:\Program Files;D:\b;${{ github.workspace }}\cmake'
+        run: python tools\ci_build\build.py --compile_no_warning_as_error --config Debug --build_dir D:\b --skip_submodule_sync --update --build --parallel --cmake_generator "Visual Studio 17 2022" --build_shared_lib --cmake_extra_defines onnxruntime_USE_CUSTOM_STATIC_ANALYSIS_RULES=ON --cmake_extra_defines onnxruntime_ENABLE_STATIC_ANALYSIS=ON --cmake_extra_defines onnxruntime_REDIRECT_STATIC_ANALYSIS_OUTPUTS_TO_FILE=ON --use_webgpu
+
+      - name: Generate sarif
+        working-directory: D:\b
+        run: npx @microsoft/sarif-multitool merge *.sarif --recurse --output-directory=${{ github.workspace }}\output --output-file=MergeResult.sarif --merge-runs && dir ${{ github.workspace }}\output
+
+      - name: Upload SARIF to GitHub
+        uses: github/codeql-action/upload-sarif@v3
+        continue-on-error: true
+        with:
+          sarif_file: ${{ github.workspace }}\output\MergeResult.sarif
+          category: VS_SCA_WIN32_WEBGPU_X64
+
# No python
Onnxruntime-SCA-win32-WINML-x64:
permissions:
@@ -73,7 +112,7 @@ jobs:
- uses: actions/setup-node@v4
with:
- node-version: 18
+ node-version: 20
- name: Delete build folder
run: |
@@ -113,7 +152,7 @@ jobs:
- uses: actions/setup-node@v4
with:
- node-version: 18
+ node-version: 20
- name: Delete build folder
run: |
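
Each static-analysis target in these jobs writes its own `.sarif` log under the build directory; the `Generate sarif` step folds them into one file so a single `upload-sarif` call (with its own `category`) covers the whole job. The merge step in isolation, with the flags as used in this diff:

```yaml
- name: Generate sarif
  working-directory: D:\b
  # --recurse picks up every *.sarif under the build directory;
  # --merge-runs collapses runs from the same tool into one consolidated run,
  # so the upload is a single log per category.
  run: npx @microsoft/sarif-multitool merge *.sarif --recurse --output-directory=${{ github.workspace }}\output --output-file=MergeResult.sarif --merge-runs
```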
diff --git a/.github/workflows/skip-doc-change.yml.j2 b/.github/workflows/skip-doc-change.yml.j2
index 58f048122a87e..04f77e5d28713 100644
--- a/.github/workflows/skip-doc-change.yml.j2
+++ b/.github/workflows/skip-doc-change.yml.j2
@@ -14,7 +14,7 @@ jobs:
{%- for name in job_names %}
job{{ loop.index }}:
name: {{ name }}
- runs-on: ubuntu-latest
+ runs-on: ["self-hosted", "1ES.Pool=onnxruntime-github-Ubuntu2204-AMD-CPU"]
steps:
- run: 'echo "No build required, only documentation changed"'
{% endfor %}
diff --git a/.github/workflows/stale.yml b/.github/workflows/stale.yml
index 181f3fb17d332..14cf0825873a0 100644
--- a/.github/workflows/stale.yml
+++ b/.github/workflows/stale.yml
@@ -8,7 +8,7 @@ on:
jobs:
close-stale-issues:
- runs-on: ubuntu-latest
+ runs-on: ["self-hosted", "1ES.Pool=onnxruntime-github-Ubuntu2204-AMD-CPU"]
permissions:
issues: write
pull-requests: write
diff --git a/.github/workflows/title-only-labeler.yml b/.github/workflows/title-only-labeler.yml
index e0af2dd06b1b7..7ee9f3917a901 100644
--- a/.github/workflows/title-only-labeler.yml
+++ b/.github/workflows/title-only-labeler.yml
@@ -8,7 +8,7 @@ permissions:
jobs:
triage:
- runs-on: ubuntu-latest
+ runs-on: ["self-hosted", "1ES.Pool=onnxruntime-github-Ubuntu2204-AMD-CPU"]
steps:
- uses: github/issue-labeler@v3.4
with:
diff --git a/.lintrunner.toml b/.lintrunner.toml
index be46ba0baabdb..5ef9ad9337f57 100644
--- a/.lintrunner.toml
+++ b/.lintrunner.toml
@@ -2,31 +2,23 @@
# You can install the dependencies and initialize with
#
# ```sh
-# pip install lintrunner lintrunner-adapters
+# pip install -r requirements-lintrunner.txt
# lintrunner init
# ```
#
# This will install lintrunner on your system and download all the necessary
# dependencies to run linters locally.
-# If you want to see what lintrunner init will install, run
-# `lintrunner init --dry-run`.
#
-# To lint local changes:
+# To format local changes:
#
# ```bash
-# lintrunner
+# lintrunner -a
# ```
#
-# To lint all files:
+# To format all files:
#
# ```bash
-# lintrunner --all-files
-# ```
-#
-# To format files:
-#
-# ```bash
-# lintrunner f --all-files
+# lintrunner -a --all-files
# ```
#
# To read more about lintrunner, see [wiki](https://github.com/pytorch/pytorch/wiki/lintrunner).
diff --git a/.pipelines/nuget_config/x64/packages.config b/.pipelines/nuget_config/x64/packages.config
index 294bd926a34cb..b9932eb563b83 100644
--- a/.pipelines/nuget_config/x64/packages.config
+++ b/.pipelines/nuget_config/x64/packages.config
@@ -1,6 +1,6 @@
-
+
diff --git a/.pipelines/nuget_config/x86/packages.config b/.pipelines/nuget_config/x86/packages.config
index 3528545dfb06e..37fe2d378b7fd 100644
--- a/.pipelines/nuget_config/x86/packages.config
+++ b/.pipelines/nuget_config/x86/packages.config
@@ -1,6 +1,6 @@
-
+
diff --git a/CODEOWNERS b/CODEOWNERS
index f7dfa419500d0..a55067ed798d8 100644
--- a/CODEOWNERS
+++ b/CODEOWNERS
@@ -9,10 +9,6 @@
/onnxruntime/core/graph/contrib_ops/quantization_defs.* @microsoft/onnxruntime-mlas
/onnxruntime/core/mlas/** @microsoft/onnxruntime-mlas
-# build pipelines and workflows
-/tools/ci_build/github/azure-pipelines @microsoft/onnxruntime-es
-/.github/workflows @microsoft/onnxruntime-es
-
# Dependencies
requirements-dev.txt @microsoft/onnxruntime-admin
requirements-doc.txt @microsoft/onnxruntime-admin
diff --git a/CPPLINT.cfg b/CPPLINT.cfg
new file mode 100644
index 0000000000000..12c1c7be0d773
--- /dev/null
+++ b/CPPLINT.cfg
@@ -0,0 +1 @@
+filter=-whitespace
diff --git a/README.md b/README.md
index cde039cec52a8..f1817282b61a0 100644
--- a/README.md
+++ b/README.md
@@ -14,7 +14,7 @@
* **YouTube video tutorials**: [youtube.com/@ONNXRuntime](https://www.youtube.com/@ONNXRuntime)
-* [**Upcoming Release Roadmap**](https://github.com/microsoft/onnxruntime/wiki/Upcoming-Release-Roadmap)
+* [**Upcoming Release Roadmap**](https://onnxruntime.ai/roadmap)
* **Companion sample repositories**:
- ONNX Runtime Inferencing: [microsoft/onnxruntime-inference-examples](https://github.com/microsoft/onnxruntime-inference-examples)
@@ -24,8 +24,8 @@
|System|Inference|Training|
|---|---|---|
-|Windows|[![Build Status](https://dev.azure.com/onnxruntime/onnxruntime/_apis/build/status/Windows%20CPU%20CI%20Pipeline?label=Windows+CPU)](https://dev.azure.com/onnxruntime/onnxruntime/_build/latest?definitionId=9)<br>[![Build Status](https://dev.azure.com/onnxruntime/onnxruntime/_apis/build/status/Windows%20GPU%20CI%20Pipeline?label=Windows+GPU)](https://dev.azure.com/onnxruntime/onnxruntime/_build/latest?definitionId=10)<br>[![Build Status](https://dev.azure.com/onnxruntime/onnxruntime/_apis/build/status/Windows%20GPU%20TensorRT%20CI%20Pipeline?label=Windows+GPU+TensorRT)](https://dev.azure.com/onnxruntime/onnxruntime/_build/latest?definitionId=47)||
-|Linux|[![Build Status](https://dev.azure.com/onnxruntime/onnxruntime/_apis/build/status/Linux%20CPU%20CI%20Pipeline?label=Linux+CPU)](https://dev.azure.com/onnxruntime/onnxruntime/_build/latest?definitionId=11)<br>[![Build Status](https://dev.azure.com/onnxruntime/onnxruntime/_apis/build/status/Linux%20CPU%20Minimal%20Build%20E2E%20CI%20Pipeline?label=Linux+CPU+Minimal+Build)](https://dev.azure.com/onnxruntime/onnxruntime/_build/latest?definitionId=64)<br>[![Build Status](https://dev.azure.com/onnxruntime/onnxruntime/_apis/build/status/Linux%20GPU%20CI%20Pipeline?label=Linux+GPU)](https://dev.azure.com/onnxruntime/onnxruntime/_build/latest?definitionId=12)<br>[![Build Status](https://dev.azure.com/onnxruntime/onnxruntime/_apis/build/status/Linux%20GPU%20TensorRT%20CI%20Pipeline?label=Linux+GPU+TensorRT)](https://dev.azure.com/onnxruntime/onnxruntime/_build/latest?definitionId=45)<br>[![Build Status](https://dev.azure.com/onnxruntime/onnxruntime/_apis/build/status/Linux%20OpenVINO%20CI%20Pipeline?label=Linux+OpenVINO)](https://dev.azure.com/onnxruntime/onnxruntime/_build/latest?definitionId=55)|[![Build Status](https://dev.azure.com/onnxruntime/onnxruntime/_apis/build/status/orttraining-linux-ci-pipeline?label=Linux+CPU+Training)](https://dev.azure.com/onnxruntime/onnxruntime/_build/latest?definitionId=86)<br>[![Build Status](https://dev.azure.com/onnxruntime/onnxruntime/_apis/build/status/orttraining-linux-gpu-ci-pipeline?label=Linux+GPU+Training)](https://dev.azure.com/onnxruntime/onnxruntime/_build/latest?definitionId=84)<br>[![Build Status](https://dev.azure.com/onnxruntime/onnxruntime/_apis/build/status/orttraining/orttraining-ortmodule-distributed?label=Training+Distributed)](https://dev.azure.com/onnxruntime/onnxruntime/_build/latest?definitionId=148)|
+|Windows|[![Build Status](https://dev.azure.com/onnxruntime/onnxruntime/_apis/build/status/Windows%20CPU%20CI%20Pipeline?label=Windows+CPU)](https://dev.azure.com/onnxruntime/onnxruntime/_build/latest?definitionId=9)<br>[![Build Status](https://dev.azure.com/onnxruntime/onnxruntime/_apis/build/status/Windows%20GPU%20CUDA%20CI%20Pipeline?label=Windows+GPU+CUDA)](https://dev.azure.com/onnxruntime/onnxruntime/_build/latest?definitionId=218)<br>[![Build Status](https://dev.azure.com/onnxruntime/onnxruntime/_apis/build/status/Windows%20GPU%20TensorRT%20CI%20Pipeline?label=Windows+GPU+TensorRT)](https://dev.azure.com/onnxruntime/onnxruntime/_build/latest?definitionId=47)<br>[![Build Status](https://dev.azure.com/onnxruntime/onnxruntime/_apis/build/status/Windows%20GPU%20WebGPU%20CI%20Pipeline?label=Windows+GPU+WebGPU)](https://dev.azure.com/onnxruntime/onnxruntime/_build/latest?definitionId=228)||
+|Linux|[![Build Status](https://dev.azure.com/onnxruntime/onnxruntime/_apis/build/status/Linux%20CPU%20CI%20Pipeline?label=Linux+CPU)](https://dev.azure.com/onnxruntime/onnxruntime/_build/latest?definitionId=11)<br>[![Build Status](https://dev.azure.com/onnxruntime/onnxruntime/_apis/build/status/Linux%20CPU%20Minimal%20Build%20E2E%20CI%20Pipeline?label=Linux+CPU+Minimal+Build)](https://dev.azure.com/onnxruntime/onnxruntime/_build/latest?definitionId=64)<br>[![Build Status](https://dev.azure.com/onnxruntime/onnxruntime/_apis/build/status/Linux%20GPU%20CI%20Pipeline?label=Linux+GPU)](https://dev.azure.com/onnxruntime/onnxruntime/_build/latest?definitionId=12)<br>[![Build Status](https://dev.azure.com/onnxruntime/onnxruntime/_apis/build/status/Linux%20GPU%20TensorRT%20CI%20Pipeline?label=Linux+GPU+TensorRT)](https://dev.azure.com/onnxruntime/onnxruntime/_build/latest?definitionId=45)<br>[![Build Status](https://dev.azure.com/onnxruntime/onnxruntime/_apis/build/status/Linux%20OpenVINO%20CI%20Pipeline?label=Linux+OpenVINO)](https://dev.azure.com/onnxruntime/onnxruntime/_build/latest?definitionId=55)|[![Build Status](https://dev.azure.com/onnxruntime/onnxruntime/_apis/build/status/orttraining-linux-ci-pipeline?label=Linux+CPU+Training)](https://dev.azure.com/onnxruntime/onnxruntime/_build/latest?definitionId=86)<br>[![Build Status](https://dev.azure.com/onnxruntime/onnxruntime/_apis/build/status/orttraining-linux-gpu-ci-pipeline?label=Linux+GPU+Training)](https://dev.azure.com/onnxruntime/onnxruntime/_build/latest?definitionId=84)|
|Mac|[![Build Status](https://dev.azure.com/onnxruntime/onnxruntime/_apis/build/status/MacOS%20CI%20Pipeline?label=MacOS+CPU)](https://dev.azure.com/onnxruntime/onnxruntime/_build/latest?definitionId=13)||
|Android|[![Build Status](https://dev.azure.com/onnxruntime/onnxruntime/_apis/build/status/Android%20CI%20Pipeline?label=Android)](https://dev.azure.com/onnxruntime/onnxruntime/_build/latest?definitionId=53)||
|iOS|[![Build Status](https://dev.azure.com/onnxruntime/onnxruntime/_apis/build/status/iOS%20CI%20Pipeline?label=iOS)](https://dev.azure.com/onnxruntime/onnxruntime/_build/latest?definitionId=134)||
@@ -40,6 +40,12 @@ This project is tested with [BrowserStack](https://www.browserstack.com/home).
|---|---|---|
|Linux|[![Build Status](https://github.com/Ascend/onnxruntime/actions/workflows/build-and-test.yaml/badge.svg)](https://github.com/Ascend/onnxruntime/actions/workflows/build-and-test.yaml)||
+## Releases
+
+The current release and past releases can be found here: https://github.com/microsoft/onnxruntime/releases.
+
+For details on the upcoming release, including release dates, announcements, features, and guidance on submitting feature requests, please visit the release roadmap: https://onnxruntime.ai/roadmap.
+
## Data/Telemetry
Windows distributions of this project may collect usage data and send it to Microsoft to help improve our products and services. See the [privacy statement](docs/Privacy.md) for more details.
diff --git a/ThirdPartyNotices.txt b/ThirdPartyNotices.txt
index 6a11f414361bd..26084ab42ec1c 100644
--- a/ThirdPartyNotices.txt
+++ b/ThirdPartyNotices.txt
@@ -2108,261 +2108,6 @@ SOFTWARE.
_____
-TVM Open Deep Learning Compiler Stack
-
- Apache License
- Version 2.0, January 2004
- http://www.apache.org/licenses/
-
- TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
-
- 1. Definitions.
-
- "License" shall mean the terms and conditions for use, reproduction,
- and distribution as defined by Sections 1 through 9 of this document.
-
- "Licensor" shall mean the copyright owner or entity authorized by
- the copyright owner that is granting the License.
-
- "Legal Entity" shall mean the union of the acting entity and all
- other entities that control, are controlled by, or are under common
- control with that entity. For the purposes of this definition,
- "control" means (i) the power, direct or indirect, to cause the
- direction or management of such entity, whether by contract or
- otherwise, or (ii) ownership of fifty percent (50%) or more of the
- outstanding shares, or (iii) beneficial ownership of such entity.
-
- "You" (or "Your") shall mean an individual or Legal Entity
- exercising permissions granted by this License.
-
- "Source" form shall mean the preferred form for making modifications,
- including but not limited to software source code, documentation
- source, and configuration files.
-
- "Object" form shall mean any form resulting from mechanical
- transformation or translation of a Source form, including but
- not limited to compiled object code, generated documentation,
- and conversions to other media types.
-
- "Work" shall mean the work of authorship, whether in Source or
- Object form, made available under the License, as indicated by a
- copyright notice that is included in or attached to the work
- (an example is provided in the Appendix below).
-
- "Derivative Works" shall mean any work, whether in Source or Object
- form, that is based on (or derived from) the Work and for which the
- editorial revisions, annotations, elaborations, or other modifications
- represent, as a whole, an original work of authorship. For the purposes
- of this License, Derivative Works shall not include works that remain
- separable from, or merely link (or bind by name) to the interfaces of,
- the Work and Derivative Works thereof.
-
- "Contribution" shall mean any work of authorship, including
- the original version of the Work and any modifications or additions
- to that Work or Derivative Works thereof, that is intentionally
- submitted to Licensor for inclusion in the Work by the copyright owner
- or by an individual or Legal Entity authorized to submit on behalf of
- the copyright owner. For the purposes of this definition, "submitted"
- means any form of electronic, verbal, or written communication sent
- to the Licensor or its representatives, including but not limited to
- communication on electronic mailing lists, source code control systems,
- and issue tracking systems that are managed by, or on behalf of, the
- Licensor for the purpose of discussing and improving the Work, but
- excluding communication that is conspicuously marked or otherwise
- designated in writing by the copyright owner as "Not a Contribution."
-
- "Contributor" shall mean Licensor and any individual or Legal Entity
- on behalf of whom a Contribution has been received by Licensor and
- subsequently incorporated within the Work.
-
- 2. Grant of Copyright License. Subject to the terms and conditions of
- this License, each Contributor hereby grants to You a perpetual,
- worldwide, non-exclusive, no-charge, royalty-free, irrevocable
- copyright license to reproduce, prepare Derivative Works of,
- publicly display, publicly perform, sublicense, and distribute the
- Work and such Derivative Works in Source or Object form.
-
- 3. Grant of Patent License. Subject to the terms and conditions of
- this License, each Contributor hereby grants to You a perpetual,
- worldwide, non-exclusive, no-charge, royalty-free, irrevocable
- (except as stated in this section) patent license to make, have made,
- use, offer to sell, sell, import, and otherwise transfer the Work,
- where such license applies only to those patent claims licensable
- by such Contributor that are necessarily infringed by their
- Contribution(s) alone or by combination of their Contribution(s)
- with the Work to which such Contribution(s) was submitted. If You
- institute patent litigation against any entity (including a
- cross-claim or counterclaim in a lawsuit) alleging that the Work
- or a Contribution incorporated within the Work constitutes direct
- or contributory patent infringement, then any patent licenses
- granted to You under this License for that Work shall terminate
- as of the date such litigation is filed.
-
- 4. Redistribution. You may reproduce and distribute copies of the
- Work or Derivative Works thereof in any medium, with or without
- modifications, and in Source or Object form, provided that You
- meet the following conditions:
-
- (a) You must give any other recipients of the Work or
- Derivative Works a copy of this License; and
-
- (b) You must cause any modified files to carry prominent notices
- stating that You changed the files; and
-
- (c) You must retain, in the Source form of any Derivative Works
- that You distribute, all copyright, patent, trademark, and
- attribution notices from the Source form of the Work,
- excluding those notices that do not pertain to any part of
- the Derivative Works; and
-
- (d) If the Work includes a "NOTICE" text file as part of its
- distribution, then any Derivative Works that You distribute must
- include a readable copy of the attribution notices contained
- within such NOTICE file, excluding those notices that do not
- pertain to any part of the Derivative Works, in at least one
- of the following places: within a NOTICE text file distributed
- as part of the Derivative Works; within the Source form or
- documentation, if provided along with the Derivative Works; or,
- within a display generated by the Derivative Works, if and
- wherever such third-party notices normally appear. The contents
- of the NOTICE file are for informational purposes only and
- do not modify the License. You may add Your own attribution
- notices within Derivative Works that You distribute, alongside
- or as an addendum to the NOTICE text from the Work, provided
- that such additional attribution notices cannot be construed
- as modifying the License.
-
- You may add Your own copyright statement to Your modifications and
- may provide additional or different license terms and conditions
- for use, reproduction, or distribution of Your modifications, or
- for any such Derivative Works as a whole, provided Your use,
- reproduction, and distribution of the Work otherwise complies with
- the conditions stated in this License.
-
- 5. Submission of Contributions. Unless You explicitly state otherwise,
- any Contribution intentionally submitted for inclusion in the Work
- by You to the Licensor shall be under the terms and conditions of
- this License, without any additional terms or conditions.
- Notwithstanding the above, nothing herein shall supersede or modify
- the terms of any separate license agreement you may have executed
- with Licensor regarding such Contributions.
-
- 6. Trademarks. This License does not grant permission to use the trade
- names, trademarks, service marks, or product names of the Licensor,
- except as required for reasonable and customary use in describing the
- origin of the Work and reproducing the content of the NOTICE file.
-
- 7. Disclaimer of Warranty. Unless required by applicable law or
- agreed to in writing, Licensor provides the Work (and each
- Contributor provides its Contributions) on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
- implied, including, without limitation, any warranties or conditions
- of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
- PARTICULAR PURPOSE. You are solely responsible for determining the
- appropriateness of using or redistributing the Work and assume any
- risks associated with Your exercise of permissions under this License.
-
- 8. Limitation of Liability. In no event and under no legal theory,
- whether in tort (including negligence), contract, or otherwise,
- unless required by applicable law (such as deliberate and grossly
- negligent acts) or agreed to in writing, shall any Contributor be
- liable to You for damages, including any direct, indirect, special,
- incidental, or consequential damages of any character arising as a
- result of this License or out of the use or inability to use the
- Work (including but not limited to damages for loss of goodwill,
- work stoppage, computer failure or malfunction, or any and all
- other commercial damages or losses), even if such Contributor
- has been advised of the possibility of such damages.
-
- 9. Accepting Warranty or Additional Liability. While redistributing
- the Work or Derivative Works thereof, You may choose to offer,
- and charge a fee for, acceptance of support, warranty, indemnity,
- or other liability obligations and/or rights consistent with this
- License. However, in accepting such obligations, You may act only
- on Your own behalf and on Your sole responsibility, not on behalf
- of any other Contributor, and only if You agree to indemnify,
- defend, and hold each Contributor harmless for any liability
- incurred by, or claims asserted against, such Contributor by reason
- of your accepting any such warranty or additional liability.
-
- END OF TERMS AND CONDITIONS
-
- APPENDIX: How to apply the Apache License to your work.
-
- To apply the Apache License to your work, attach the following
- boilerplate notice, with the fields enclosed by brackets "{}"
- replaced with your own identifying information. (Don't include
- the brackets!) The text should be enclosed in the appropriate
- comment syntax for the file format. We also recommend that a
- file or class name and description of purpose be included on the
- same "printed page" as the copyright notice for easier
- identification within third-party archives.
-
- Copyright {yyyy} {name of copyright owner}
-
- Licensed under the Apache License, Version 2.0 (the "License");
- you may not use this file except in compliance with the License.
- You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
-
-CONTRIBUTORS
-
-TVM Contributors
-================
-TVM adopts the Apache style model and governs by merit. We believe that it is important to create an inclusive community where everyone can use,
-contribute to, and influence the direction of the project. We actively invite contributors who have earned the merit to be part of the development community.
-
-See the [community structure document](http://docs.tvm.ai/contribute/community.html) for the explanation of community structure and contribution guidelines.
-
-## Committers
-- [Tianqi Chen](https://github.com/tqchen) (PMC)
-- [Thierry Moreau](http://homes.cs.washington.edu/~moreau/)
-- [Ziheng Jiang](https://github.com/ZihengJiang)
-- [Haichen Shen](http://homes.cs.washington.edu/~haichen/)
-- [Yizhi Liu](https://github.com/yzhliu)
-
-## Code Owners
-- [Aditya Atluri](https://github.com/adityaatluri) ROCM
-- [Leyuan Wang](https://github.com/Laurawly) TOPI
-- [Yuwei Hu](https://github.com/Huyuwei) TOPI
-- [Zhixun Tan](https://github.com/phisiart) OpenGL/WebGL backend
-- [Nick Hynes](https://github.com/nhynes) SGX and secured computing
-- [Lianmin Zheng](https://github.com/merrymercy) AutoTVM
-
-## Reviewers
-- [Zhi Chen](https://github.com/zhiics)
-- [Xiaoqiang Dan](https://github.com/xqdan)
-- [Liangfu Chen](https://github.com/liangfu)
-- [Masahiro Masuda](https://github.com/masahi)
-- [Kazutaka Morita](https://github.com/kazum)
-- [Tatsuya Nishiyama](https://github.com/nishi-t)
-- [Pariksheet Pinjari](https://github.com/PariksheetPinjari909)
-- [Jared Roesch](https://github.com/jroesch)
-- [Siva](https://github.com/srkreddy1238)
-- [Siju Samuel](https://github.com/siju-samuel)
-- [Alex Weaver](https://github.com/alex-weaver)
-- [Yao Wang](https://github.com/kevinthesun)
-- [Jian Weng](https://github.com/were)
-- [Eddie Yan](https://github.com/eqy)
-- [Joshua Z. Zhang](https://github.com/zhreshold)
-
-## List of Contributors
-- [Full List of Contributors](https://github.com/dmlc/tvm/graphs/contributors)
- - To contributors: please add your name to the list.
-- [Qiao Zhang](https://github.com/zhangqiaorjc)
-- [Haolong Zhang](https://github.com/haolongzhangm)
-- [Cody Hao Yu](https://github.com/comaniac)
-- [Chris Nuernberger](https://github.com/cnuernber)
-
-_____
-
FreeBSD: getopt.c file
Copyright (c) 1987, 1993, 1994
@@ -2492,212 +2237,6 @@ DAMAGE.
_____
-google/nsync
-
-Apache License
- Version 2.0, January 2004
- http://www.apache.org/licenses/
-
- TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
-
- 1. Definitions.
-
- "License" shall mean the terms and conditions for use, reproduction,
- and distribution as defined by Sections 1 through 9 of this document.
-
- "Licensor" shall mean the copyright owner or entity authorized by
- the copyright owner that is granting the License.
-
- "Legal Entity" shall mean the union of the acting entity and all
- other entities that control, are controlled by, or are under common
- control with that entity. For the purposes of this definition,
- "control" means (i) the power, direct or indirect, to cause the
- direction or management of such entity, whether by contract or
- otherwise, or (ii) ownership of fifty percent (50%) or more of the
- outstanding shares, or (iii) beneficial ownership of such entity.
-
- "You" (or "Your") shall mean an individual or Legal Entity
- exercising permissions granted by this License.
-
- "Source" form shall mean the preferred form for making modifications,
- including but not limited to software source code, documentation
- source, and configuration files.
-
- "Object" form shall mean any form resulting from mechanical
- transformation or translation of a Source form, including but
- not limited to compiled object code, generated documentation,
- and conversions to other media types.
-
- "Work" shall mean the work of authorship, whether in Source or
- Object form, made available under the License, as indicated by a
- copyright notice that is included in or attached to the work
- (an example is provided in the Appendix below).
-
- "Derivative Works" shall mean any work, whether in Source or Object
- form, that is based on (or derived from) the Work and for which the
- editorial revisions, annotations, elaborations, or other modifications
- represent, as a whole, an original work of authorship. For the purposes
- of this License, Derivative Works shall not include works that remain
- separable from, or merely link (or bind by name) to the interfaces of,
- the Work and Derivative Works thereof.
-
- "Contribution" shall mean any work of authorship, including
- the original version of the Work and any modifications or additions
- to that Work or Derivative Works thereof, that is intentionally
- submitted to Licensor for inclusion in the Work by the copyright owner
- or by an individual or Legal Entity authorized to submit on behalf of
- the copyright owner. For the purposes of this definition, "submitted"
- means any form of electronic, verbal, or written communication sent
- to the Licensor or its representatives, including but not limited to
- communication on electronic mailing lists, source code control systems,
- and issue tracking systems that are managed by, or on behalf of, the
- Licensor for the purpose of discussing and improving the Work, but
- excluding communication that is conspicuously marked or otherwise
- designated in writing by the copyright owner as "Not a Contribution."
-
- "Contributor" shall mean Licensor and any individual or Legal Entity
- on behalf of whom a Contribution has been received by Licensor and
- subsequently incorporated within the Work.
-
- 2. Grant of Copyright License. Subject to the terms and conditions of
- this License, each Contributor hereby grants to You a perpetual,
- worldwide, non-exclusive, no-charge, royalty-free, irrevocable
- copyright license to reproduce, prepare Derivative Works of,
- publicly display, publicly perform, sublicense, and distribute the
- Work and such Derivative Works in Source or Object form.
-
- 3. Grant of Patent License. Subject to the terms and conditions of
- this License, each Contributor hereby grants to You a perpetual,
- worldwide, non-exclusive, no-charge, royalty-free, irrevocable
- (except as stated in this section) patent license to make, have made,
- use, offer to sell, sell, import, and otherwise transfer the Work,
- where such license applies only to those patent claims licensable
- by such Contributor that are necessarily infringed by their
- Contribution(s) alone or by combination of their Contribution(s)
- with the Work to which such Contribution(s) was submitted. If You
- institute patent litigation against any entity (including a
- cross-claim or counterclaim in a lawsuit) alleging that the Work
- or a Contribution incorporated within the Work constitutes direct
- or contributory patent infringement, then any patent licenses
- granted to You under this License for that Work shall terminate
- as of the date such litigation is filed.
-
- 4. Redistribution. You may reproduce and distribute copies of the
- Work or Derivative Works thereof in any medium, with or without
- modifications, and in Source or Object form, provided that You
- meet the following conditions:
-
- (a) You must give any other recipients of the Work or
- Derivative Works a copy of this License; and
-
- (b) You must cause any modified files to carry prominent notices
- stating that You changed the files; and
-
- (c) You must retain, in the Source form of any Derivative Works
- that You distribute, all copyright, patent, trademark, and
- attribution notices from the Source form of the Work,
- excluding those notices that do not pertain to any part of
- the Derivative Works; and
-
- (d) If the Work includes a "NOTICE" text file as part of its
- distribution, then any Derivative Works that You distribute must
- include a readable copy of the attribution notices contained
- within such NOTICE file, excluding those notices that do not
- pertain to any part of the Derivative Works, in at least one
- of the following places: within a NOTICE text file distributed
- as part of the Derivative Works; within the Source form or
- documentation, if provided along with the Derivative Works; or,
- within a display generated by the Derivative Works, if and
- wherever such third-party notices normally appear. The contents
- of the NOTICE file are for informational purposes only and
- do not modify the License. You may add Your own attribution
- notices within Derivative Works that You distribute, alongside
- or as an addendum to the NOTICE text from the Work, provided
- that such additional attribution notices cannot be construed
- as modifying the License.
-
- You may add Your own copyright statement to Your modifications and
- may provide additional or different license terms and conditions
- for use, reproduction, or distribution of Your modifications, or
- for any such Derivative Works as a whole, provided Your use,
- reproduction, and distribution of the Work otherwise complies with
- the conditions stated in this License.
-
- 5. Submission of Contributions. Unless You explicitly state otherwise,
- any Contribution intentionally submitted for inclusion in the Work
- by You to the Licensor shall be under the terms and conditions of
- this License, without any additional terms or conditions.
- Notwithstanding the above, nothing herein shall supersede or modify
- the terms of any separate license agreement you may have executed
- with Licensor regarding such Contributions.
-
- 6. Trademarks. This License does not grant permission to use the trade
- names, trademarks, service marks, or product names of the Licensor,
- except as required for reasonable and customary use in describing the
- origin of the Work and reproducing the content of the NOTICE file.
-
- 7. Disclaimer of Warranty. Unless required by applicable law or
- agreed to in writing, Licensor provides the Work (and each
- Contributor provides its Contributions) on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
- implied, including, without limitation, any warranties or conditions
- of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
- PARTICULAR PURPOSE. You are solely responsible for determining the
- appropriateness of using or redistributing the Work and assume any
- risks associated with Your exercise of permissions under this License.
-
- 8. Limitation of Liability. In no event and under no legal theory,
- whether in tort (including negligence), contract, or otherwise,
- unless required by applicable law (such as deliberate and grossly
- negligent acts) or agreed to in writing, shall any Contributor be
- liable to You for damages, including any direct, indirect, special,
- incidental, or consequential damages of any character arising as a
- result of this License or out of the use or inability to use the
- Work (including but not limited to damages for loss of goodwill,
- work stoppage, computer failure or malfunction, or any and all
- other commercial damages or losses), even if such Contributor
- has been advised of the possibility of such damages.
-
- 9. Accepting Warranty or Additional Liability. While redistributing
- the Work or Derivative Works thereof, You may choose to offer,
- and charge a fee for, acceptance of support, warranty, indemnity,
- or other liability obligations and/or rights consistent with this
- License. However, in accepting such obligations, You may act only
- on Your own behalf and on Your sole responsibility, not on behalf
- of any other Contributor, and only if You agree to indemnify,
- defend, and hold each Contributor harmless for any liability
- incurred by, or claims asserted against, such Contributor by reason
- of your accepting any such warranty or additional liability.
-
- END OF TERMS AND CONDITIONS
-
- APPENDIX: How to apply the Apache License to your work.
-
- To apply the Apache License to your work, attach the following
- boilerplate notice, with the fields enclosed by brackets "[]"
- replaced with your own identifying information. (Don't include
- the brackets!) The text should be enclosed in the appropriate
- comment syntax for the file format. We also recommend that a
- file or class name and description of purpose be included on the
- same "printed page" as the copyright notice for easier
- identification within third-party archives.
-
- Copyright [yyyy] [name of copyright owner]
-
- Licensed under the Apache License, Version 2.0 (the "License");
- you may not use this file except in compliance with the License.
- You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
-
-_____
-
google/re2
Copyright (c) 2009 The RE2 Authors. All rights reserved.
diff --git a/VERSION_NUMBER b/VERSION_NUMBER
index 3989355915568..3500250a4b05b 100644
--- a/VERSION_NUMBER
+++ b/VERSION_NUMBER
@@ -1 +1 @@
-1.20.0
+1.21.0
diff --git a/cgmanifests/cgmanifest.json b/cgmanifests/cgmanifest.json
index 1432193ac9080..46349f43923e2 100644
--- a/cgmanifests/cgmanifest.json
+++ b/cgmanifests/cgmanifest.json
@@ -1,578 +1,508 @@
{
- "$schema": "https://json.schemastore.org/component-detection-manifest.json",
- "Registrations": [
- {
- "component": {
- "type": "git",
- "git": {
- "commitHash": "215105818dfde3174fe799600bb0f3cae233d0bf",
- "repositoryUrl": "https://github.com/abseil/abseil-cpp.git"
- }
- }
- },
- {
- "component": {
- "Type": "maven",
- "maven": {
- "GroupId": "org.junit.platform",
- "ArtifactId": "junit-platform-console-standalone",
- "Version": "1.6.2"
- },
- "DevelopmentDependency": true
- }
- },
- {
- "component": {
- "Type": "maven",
- "maven": {
- "GroupId": "com.google.protobuf",
- "ArtifactId": "protobuf-java",
- "Version": "3.21.7"
- },
- "DevelopmentDependency": true
- }
- },
- {
- "component": {
- "type": "git",
- "git": {
- "commitHash": "2379917985919ed3918dc12cad47f469f245be7a",
- "repositoryUrl": "https://github.com/apache/tvm.git"
- },
- "comments": "needed for TVM EP"
- }
- },
- {
- "component": {
- "type": "git",
- "git": {
- "commitHash": "cabe04d6d6b05356fa8f9741704924788f0dd762",
- "repositoryUrl": "https://github.com/agauniyal/rang.git"
- },
- "comments": "dependency from tvm"
- }
- },
- {
- "component": {
- "type": "git",
- "git": {
- "commitHash": "a3bcc6981d5dad3afb212689e2c7853d1b1ee45d",
- "repositoryUrl": "https://github.com/NVIDIA/cutlass.git"
- },
- "comments": "dependency from tvm"
- }
- },
- {
- "component": {
- "type": "git",
- "git": {
- "commitHash": "08f7c7e69f8ea61a0c4151359bc8023be8e9217b",
- "repositoryUrl": "https://github.com/tlc-pack/libbacktrace.git"
- },
- "comments": "dependency from tvm"
- }
- },
- {
- "component": {
- "type": "git",
- "git": {
- "commitHash": "36a91576edf633479c78649e050f18dd2ddc8103",
- "repositoryUrl": "https://github.com/apache/incubator-tvm-vta.git"
- },
- "comments": "dependency from tvm"
- }
- },
- {
- "component": {
- "type": "git",
- "git": {
- "commitHash": "111c9be5188f7350c2eac9ddaedd8cca3d7bf394",
- "repositoryUrl": "https://github.com/kazuho/picojson.git"
- },
- "comments": "dependency from tvm"
- }
- },
- {
- "component": {
- "type": "git",
- "git": {
- "commitHash": "b5e4186d7ab63458e79084842dced166be2ca5b5",
- "repositoryUrl": "https://github.com/lammertb/libcrc.git"
- },
- "comments": "dependency from tvm"
- }
- },
- {
- "component": {
- "type": "git",
- "git": {
- "commitHash": "e4a4c02764d37c9c3db0d64c4996651a3ef9513c",
- "repositoryUrl": "https://github.com/dmlc/HalideIR.git"
- }
- }
- },
- {
- "component": {
- "type": "git",
- "git": {
- "commitHash": "bee4d1dd8dc1ee4a1fd8fa6a96476c2f8b7492a3",
- "repositoryUrl": "https://github.com/dmlc/dlpack.git"
- }
- }
- },
- {
- "component": {
- "type": "git",
- "git": {
- "commitHash": "4d49691f1a9d944c3b0aa5e63f1db3cad1f941f8",
- "repositoryUrl": "https://github.com/dmlc/dmlc-core.git"
- }
- }
- },
- {
- "component": {
- "type": "git",
- "git": {
- "commitHash": "7de7e5d02bf687f971e7668963649728356e0c20",
- "repositoryUrl": "https://github.com/intel/mkl-dnn.git"
- }
- }
- },
- {
- "component": {
- "type": "git",
- "git": {
- "commitHash": "d860915b0198ddb96f93e9e97a789af156544dc6",
- "repositoryUrl": "https://github.com/tensorflow/tensorflow.git"
- }
- }
- },
- {
- "component": {
- "type": "git",
- "git": {
- "commitHash": "eddf9023206dc40974c26f589ee2ad63a4227a1e",
- "repositoryUrl": "https://github.com/glennrp/libpng.git"
- }
- }
- },
- {
- "component": {
- "type": "git",
- "git": {
- "commitHash": "217f52fb121ef92491e5d5f71394b07ce4ead1d0",
- "repositoryUrl": "https://github.com/KjellKod/g3log.git"
- }
- }
- },
- {
- "component": {
- "type": "git",
- "git": {
- "commitHash": "50893291621658f355bc5b4d450a8d06a563053d",
- "repositoryUrl": "https://github.com/madler/zlib.git"
- }
- }
- },
- {
- "component": {
- "type": "git",
- "git": {
- "commitHash": "d264a2603493fecda607c1d1cda87fedba77d36b",
- "repositoryUrl": "https://github.com/Microsoft/CNTK.git"
- }
- }
- },
- {
- "component": {
- "type": "git",
- "git": {
- "commitHash": "971e2e89d08deeae0139d3011d15646fdac13c92",
- "repositoryUrl": "https://github.com/numpy/numpy.git"
- }
- }
- },
- {
- "component": {
- "type": "git",
- "git": {
- "commitHash": "90537289a04ef5d572496240e2ac3a881be518d2",
- "repositoryUrl": "https://github.com/pytorch/pytorch.git"
- }
- }
- },
- {
- "component": {
- "type": "git",
- "git": {
- "commitHash": "b31f58de6fa8bbda5353b3c77d9be4914399724d",
- "repositoryUrl": "https://github.com/pytorch/pytorch.git"
- },
- "comments": "pytorch 1.6 used by onnxruntime training image"
- }
- },
- {
- "component": {
- "type": "git",
- "git": {
- "commitHash": "7389dbac82d362f296dc2746f10e43ffa1615660",
- "repositoryUrl": "https://github.com/scikit-learn/scikit-learn.git"
- }
- }
- },
- {
- "component": {
- "type": "git",
- "git": {
- "commitHash": "eeebdab16155d34ff8f5f42137da7df4d1c7eab0",
- "repositoryUrl": "https://github.com/BVLC/caffe.git"
- }
- }
- },
- {
- "component": {
- "Type": "other",
- "Other": {
- "Name": "LLVM",
- "Version": "9.0.0",
- "DownloadUrl": "https://releases.llvm.org/9.0.0/llvm-9.0.0.src.tar.xz"
- }
- }
- },
- {
- "component": {
- "Type": "other",
- "Other": {
- "Name": "FreeBSD GetOpt",
- "Version": "12.0.0",
- "DownloadUrl": "https://svnweb.freebsd.org/base/release/12.0.0/lib/libc/stdlib/getopt.c?revision=341707&view=co"
- }
- }
- },
- {
- "component": {
- "Type": "other",
- "Other": {
- "Name": "Boost",
- "Version": "1.69.0",
- "DownloadUrl": "https://boostorg.jfrog.io/artifactory/main/release/1.69.0/source/boost_1_69_0.tar.bz2"
- }
- }
- },
- {
- "component": {
- "git": {
- "commitHash": "02a2a458ac15912d7d87cc1171e811b0c5219ece",
- "repositoryUrl": "https://github.com/grpc/grpc"
- },
- "type": "git"
- }
- },
- {
- "component": {
- "git": {
- "commitHash": "b29b21a81b32ec273f118f589f46d56ad3332420",
- "repositoryUrl": "https://github.com/google/boringssl.git"
- },
- "type": "git"
- }
- },
- {
- "component": {
- "git": {
- "commitHash": "3be1924221e1326df520f8498d704a5c4c8d0cce",
- "repositoryUrl": "https://github.com/c-ares/c-ares.git"
- },
- "type": "git"
- }
- },
- {
- "component": {
- "git": {
- "commitHash": "6599cac0965be8e5a835ab7a5684bbef033d5ad0",
- "repositoryUrl": "https://github.com/llvm-mirror/libcxx.git"
- },
- "type": "git"
- }
- },
- {
- "component": {
- "git": {
- "commitHash": "9245d481eb3e890f708ff2d7dadf2a10c04748ba",
- "repositoryUrl": "https://github.com/llvm-mirror/libcxxabi.git"
- },
- "type": "git"
- }
- },
- {
- "component": {
- "git": {
- "commitHash": "9ce4a77f61c134bbed28bfd5be5cd7dc0e80f5e3",
- "repositoryUrl": "https://github.com/google/upb.git"
- },
- "type": "git"
- }
- },
- {
- "component": {
- "type": "other",
- "Other": {
- "Name": "Go",
- "Version": "1.12.6",
- "DownloadUrl": "https://dl.google.com/go/go1.12.6.linux-amd64.tar.gz"
- }
- }
- },
- {
- "component": {
- "Type": "other",
- "Other": {
- "Name": "OpenMPI",
- "Version": "4.0.0",
- "DownloadUrl": "https://download.open-mpi.org/release/open-mpi/v4.0/openmpi-4.0.0.tar.gz"
- }
- }
- },
- {
- "component": {
- "Type": "other",
- "Other": {
- "Name": "OpenMPI",
- "Version": "4.0.4",
- "DownloadUrl": "https://download.open-mpi.org/release/open-mpi/v4.0/openmpi-4.0.4.tar.gz"
- },
- "comments": "openmpi 4.0.4 used by onnxruntime training image"
- }
- },
- {
- "component": {
- "Type": "git",
- "git": {
- "commitHash": "7db3f9c741d3dfd8dda14ffb537ed251280d2025",
- "repositoryUrl": "https://github.com/mpi4py/mpi4py"
- },
- "comments": "mpi4py 3.0.3 used by onnxruntime training image"
- }
- },
- {
- "component": {
- "Type": "other",
- "Other": {
- "Name": "NCCL",
- "Version": "2.4.8",
- "DownloadUrl": "https://docs.nvidia.com/deeplearning/sdk/nccl-install-guide/index.html"
- }
- }
- },
- {
- "component": {
- "type": "git",
- "git": {
- "commitHash": "67afac65ce64fd4dce1494f43e565e8fe34bdffb",
- "repositoryUrl": "https://android.googlesource.com/platform/frameworks/ml"
- },
- "comments": "used by onnxruntime"
- }
- },
- {
- "component": {
- "type": "git",
- "git": {
- "commitHash": "c30b7da2301202da5f9f0529966944f110e5d6e7",
- "repositoryUrl": "https://github.com/openucx/ucx"
- },
- "comments": "middleware between IB verbs and OpenMPI used by onnxruntime training image"
- }
- },
- {
- "component": {
- "type": "git",
- "git": {
- "commitHash": "63d1e08e64e7e09408eb63cd8dd7c65ad766f277",
- "repositoryUrl": "https://github.com/nodejs/node"
- },
- "comments": "For Nodejs binding"
- }
- },
- {
- "component": {
- "type": "git",
- "git": {
- "commitHash": "aead4d751c2101e23336aa73f2380df83e7a13f3",
- "repositoryUrl": "https://github.com/pypa/manylinux"
- },
- "comments": "For building our CI build docker image"
- }
- },
- {
- "component": {
- "type": "git",
- "git": {
- "commitHash": "c974557598645360fbabac71352b083117e3cc17",
- "repositoryUrl": "https://gitlab.kitware.com/cmake/cmake"
- },
- "comments": "CMake 3.24.3. For building our CI build docker image"
- }
- },
- {
- "component": {
- "type": "git",
- "git": {
- "commitHash": "1e5d33e9b9b8631b36f061103a30208b206fd03a",
- "repositoryUrl": "https://github.com/python/cpython"
- },
- "comments": "Python 3.9.1"
- }
- },
- {
- "component": {
- "type": "git",
- "git": {
- "commitHash": "6503f05dd59e26a9986bdea097b3da9b3546f45b",
- "repositoryUrl": "https://github.com/python/cpython"
- },
- "comments": "Python 3.8.7"
- }
- },
- {
- "component": {
- "type": "git",
- "git": {
- "commitHash": "13c94747c74437e594b7fc242ff7da668e81887c",
- "repositoryUrl": "https://github.com/python/cpython"
- },
- "comments": "Python 3.7.9"
- }
- },
- {
- "component": {
- "type": "git",
- "git": {
- "commitHash": "c0a9afe2ac1820409e6173bd1893ebee2cf50270",
- "repositoryUrl": "https://github.com/python/cpython"
- },
- "comments": "Python 3.6.12"
- }
- },
- {
- "component": {
- "type": "git",
- "git": {
- "commitHash": "426b022776672fdf3d71ddd98d89af341c88080f",
- "repositoryUrl": "https://github.com/python/cpython"
- },
- "comments": "Python 3.5.10"
- }
- },
- {
- "component": {
- "type": "pip",
- "pip": {
- "Name": "transformers",
- "Version": "4.38.0"
- },
- "comments": "Installed in the training docker image"
- }
- },
- {
- "component": {
- "type": "pip",
- "pip": {
- "Name": "msgpack",
- "Version": "1.0.0"
- },
- "comments": "Installed in the training docker image"
- }
- },
- {
- "component": {
- "type": "pip",
- "pip": {
- "Name": "tensorboardX",
- "Version": "1.8"
- },
- "comments": "Installed in the training docker image"
- }
- },
- {
- "component": {
- "type": "pip",
- "pip": {
- "Name": "tensorboard",
- "Version": "2.3.0"
- },
- "comments": "Installed in the training docker image"
- }
- },
- {
- "component": {
- "type": "git",
- "git": {
- "commitHash": "92cf3702fcfaadc84eb7bef59825a23e0cd84f56",
- "repositoryUrl": "https://github.com/aappleby/smhasher"
- },
- "comments": "MurmurHash3"
- }
- },
- {
- "component": {
- "type": "git",
- "git": {
- "commitHash": "b89da3c5a0aa18fb2c6163ad9984f81ab65b22e3",
- "repositoryUrl": "https://github.com/mestevens/gtest-ios-framework"
- },
- "comments": "gtest-ios-framework"
- }
- },
- {
- "component": {
- "type": "git",
- "git": {
- "commitHash": "277508879878e0a5b5b43599b1bea11f66eb3c6c",
- "repositoryUrl": "https://github.com/dmlc/dlpack.git"
- },
- "comments": "dlpack"
- }
- },
- {
- "component": {
- "Type": "other",
- "Other": {
- "Name": "SQLite3",
- "Version": "3.22.0",
- "DownloadUrl": "http://security.ubuntu.com/ubuntu/pool/main/s/sqlite3/libsqlite3-dev_3.22.0-1ubuntu0.4_amd64.deb"
- }
- }
- },
- {
- "component": {
- "type": "git",
- "git": {
- "commitHash": "9d0ef119d9fcb9139f831adc224857b791c81140",
- "repositoryUrl": "https://github.com/dlfcn-win32/dlfcn-win32.git"
- },
- "comments": "dlfcn-win32"
- }
- },
- {
- "component": {
- "type": "git",
- "git": {
- "commitHash": "6812205f18ca4ef54372e87e1a13ce4a859434df",
- "repositoryUrl": "https://github.com/python-pillow/Pillow.git"
- },
- "comments": "python-pillow. Implementation logic for anti-aliasing copied by Resize CPU kernel."
- }
- },
- {
- "component": {
- "type": "git",
- "git": {
- "commitHash": "e7248b26a1ed53fa030c5c459f7ea095dfd276ac",
- "repositoryUrl": "https://gitlab.com/libeigen/eigen.git"
- }
- }
- }
- ],
- "Version": 1
+ "$schema": "https://json.schemastore.org/component-detection-manifest.json",
+ "Registrations": [
+ {
+ "component": {
+ "type": "git",
+ "git": {
+ "commitHash": "215105818dfde3174fe799600bb0f3cae233d0bf",
+ "repositoryUrl": "https://github.com/abseil/abseil-cpp.git"
+ }
+ }
+ },
+ {
+ "component": {
+ "Type": "maven",
+ "maven": {
+ "GroupId": "org.junit.platform",
+ "ArtifactId": "junit-platform-console-standalone",
+ "Version": "1.6.2"
+ },
+ "DevelopmentDependency": true
+ }
+ },
+ {
+ "component": {
+ "Type": "maven",
+ "maven": {
+ "GroupId": "com.google.protobuf",
+ "ArtifactId": "protobuf-java",
+ "Version": "3.21.7"
+ },
+ "DevelopmentDependency": true
+ }
+ },
+ {
+ "component": {
+ "type": "git",
+ "git": {
+ "commitHash": "e4a4c02764d37c9c3db0d64c4996651a3ef9513c",
+ "repositoryUrl": "https://github.com/dmlc/HalideIR.git"
+ }
+ }
+ },
+ {
+ "component": {
+ "type": "git",
+ "git": {
+ "commitHash": "bee4d1dd8dc1ee4a1fd8fa6a96476c2f8b7492a3",
+ "repositoryUrl": "https://github.com/dmlc/dlpack.git"
+ }
+ }
+ },
+ {
+ "component": {
+ "type": "git",
+ "git": {
+ "commitHash": "4d49691f1a9d944c3b0aa5e63f1db3cad1f941f8",
+ "repositoryUrl": "https://github.com/dmlc/dmlc-core.git"
+ }
+ }
+ },
+ {
+ "component": {
+ "type": "git",
+ "git": {
+ "commitHash": "7de7e5d02bf687f971e7668963649728356e0c20",
+ "repositoryUrl": "https://github.com/intel/mkl-dnn.git"
+ }
+ }
+ },
+ {
+ "component": {
+ "type": "git",
+ "git": {
+ "commitHash": "d860915b0198ddb96f93e9e97a789af156544dc6",
+ "repositoryUrl": "https://github.com/tensorflow/tensorflow.git"
+ }
+ }
+ },
+ {
+ "component": {
+ "type": "git",
+ "git": {
+ "commitHash": "eddf9023206dc40974c26f589ee2ad63a4227a1e",
+ "repositoryUrl": "https://github.com/glennrp/libpng.git"
+ }
+ }
+ },
+ {
+ "component": {
+ "type": "git",
+ "git": {
+ "commitHash": "217f52fb121ef92491e5d5f71394b07ce4ead1d0",
+ "repositoryUrl": "https://github.com/KjellKod/g3log.git"
+ }
+ }
+ },
+ {
+ "component": {
+ "type": "git",
+ "git": {
+ "commitHash": "50893291621658f355bc5b4d450a8d06a563053d",
+ "repositoryUrl": "https://github.com/madler/zlib.git"
+ }
+ }
+ },
+ {
+ "component": {
+ "type": "git",
+ "git": {
+ "commitHash": "d264a2603493fecda607c1d1cda87fedba77d36b",
+ "repositoryUrl": "https://github.com/Microsoft/CNTK.git"
+ }
+ }
+ },
+ {
+ "component": {
+ "type": "git",
+ "git": {
+ "commitHash": "971e2e89d08deeae0139d3011d15646fdac13c92",
+ "repositoryUrl": "https://github.com/numpy/numpy.git"
+ }
+ }
+ },
+ {
+ "component": {
+ "type": "git",
+ "git": {
+ "commitHash": "90537289a04ef5d572496240e2ac3a881be518d2",
+ "repositoryUrl": "https://github.com/pytorch/pytorch.git"
+ }
+ }
+ },
+ {
+ "component": {
+ "type": "git",
+ "git": {
+ "commitHash": "b31f58de6fa8bbda5353b3c77d9be4914399724d",
+ "repositoryUrl": "https://github.com/pytorch/pytorch.git"
+ },
+ "comments": "pytorch 1.6 used by onnxruntime training image"
+ }
+ },
+ {
+ "component": {
+ "type": "git",
+ "git": {
+ "commitHash": "7389dbac82d362f296dc2746f10e43ffa1615660",
+ "repositoryUrl": "https://github.com/scikit-learn/scikit-learn.git"
+ }
+ }
+ },
+ {
+ "component": {
+ "type": "git",
+ "git": {
+ "commitHash": "eeebdab16155d34ff8f5f42137da7df4d1c7eab0",
+ "repositoryUrl": "https://github.com/BVLC/caffe.git"
+ }
+ }
+ },
+ {
+ "component": {
+ "Type": "other",
+ "Other": {
+ "Name": "LLVM",
+ "Version": "9.0.0",
+ "DownloadUrl": "https://releases.llvm.org/9.0.0/llvm-9.0.0.src.tar.xz"
+ }
+ }
+ },
+ {
+ "component": {
+ "Type": "other",
+ "Other": {
+ "Name": "FreeBSD GetOpt",
+ "Version": "12.0.0",
+ "DownloadUrl": "https://svnweb.freebsd.org/base/release/12.0.0/lib/libc/stdlib/getopt.c?revision=341707&view=co"
+ }
+ }
+ },
+ {
+ "component": {
+ "Type": "other",
+ "Other": {
+ "Name": "Boost",
+ "Version": "1.69.0",
+ "DownloadUrl": "https://boostorg.jfrog.io/artifactory/main/release/1.69.0/source/boost_1_69_0.tar.bz2"
+ }
+ }
+ },
+ {
+ "component": {
+ "git": {
+ "commitHash": "02a2a458ac15912d7d87cc1171e811b0c5219ece",
+ "repositoryUrl": "https://github.com/grpc/grpc"
+ },
+ "type": "git"
+ }
+ },
+ {
+ "component": {
+ "git": {
+ "commitHash": "b29b21a81b32ec273f118f589f46d56ad3332420",
+ "repositoryUrl": "https://github.com/google/boringssl.git"
+ },
+ "type": "git"
+ }
+ },
+ {
+ "component": {
+ "git": {
+ "commitHash": "3be1924221e1326df520f8498d704a5c4c8d0cce",
+ "repositoryUrl": "https://github.com/c-ares/c-ares.git"
+ },
+ "type": "git"
+ }
+ },
+ {
+ "component": {
+ "git": {
+ "commitHash": "6599cac0965be8e5a835ab7a5684bbef033d5ad0",
+ "repositoryUrl": "https://github.com/llvm-mirror/libcxx.git"
+ },
+ "type": "git"
+ }
+ },
+ {
+ "component": {
+ "git": {
+ "commitHash": "9245d481eb3e890f708ff2d7dadf2a10c04748ba",
+ "repositoryUrl": "https://github.com/llvm-mirror/libcxxabi.git"
+ },
+ "type": "git"
+ }
+ },
+ {
+ "component": {
+ "git": {
+ "commitHash": "9ce4a77f61c134bbed28bfd5be5cd7dc0e80f5e3",
+ "repositoryUrl": "https://github.com/google/upb.git"
+ },
+ "type": "git"
+ }
+ },
+ {
+ "component": {
+ "type": "other",
+ "Other": {
+ "Name": "Go",
+ "Version": "1.12.6",
+ "DownloadUrl": "https://dl.google.com/go/go1.12.6.linux-amd64.tar.gz"
+ }
+ }
+ },
+ {
+ "component": {
+ "Type": "other",
+ "Other": {
+ "Name": "OpenMPI",
+ "Version": "4.0.0",
+ "DownloadUrl": "https://download.open-mpi.org/release/open-mpi/v4.0/openmpi-4.0.0.tar.gz"
+ }
+ }
+ },
+ {
+ "component": {
+ "Type": "other",
+ "Other": {
+ "Name": "OpenMPI",
+ "Version": "4.0.4",
+ "DownloadUrl": "https://download.open-mpi.org/release/open-mpi/v4.0/openmpi-4.0.4.tar.gz"
+ },
+ "comments": "openmpi 4.0.4 used by onnxruntime training image"
+ }
+ },
+ {
+ "component": {
+ "Type": "git",
+ "git": {
+ "commitHash": "7db3f9c741d3dfd8dda14ffb537ed251280d2025",
+ "repositoryUrl": "https://github.com/mpi4py/mpi4py"
+ },
+ "comments": "mpi4py 3.0.3 used by onnxruntime training image"
+ }
+ },
+ {
+ "component": {
+ "Type": "other",
+ "Other": {
+ "Name": "NCCL",
+ "Version": "2.4.8",
+ "DownloadUrl": "https://docs.nvidia.com/deeplearning/sdk/nccl-install-guide/index.html"
+ }
+ }
+ },
+ {
+ "component": {
+ "type": "git",
+ "git": {
+ "commitHash": "67afac65ce64fd4dce1494f43e565e8fe34bdffb",
+ "repositoryUrl": "https://android.googlesource.com/platform/frameworks/ml"
+ },
+ "comments": "used by onnxruntime"
+ }
+ },
+ {
+ "component": {
+ "type": "git",
+ "git": {
+ "commitHash": "c30b7da2301202da5f9f0529966944f110e5d6e7",
+ "repositoryUrl": "https://github.com/openucx/ucx"
+ },
+ "comments": "middleware between IB verbs and OpenMPI used by onnxruntime training image"
+ }
+ },
+ {
+ "component": {
+ "type": "git",
+ "git": {
+ "commitHash": "63d1e08e64e7e09408eb63cd8dd7c65ad766f277",
+ "repositoryUrl": "https://github.com/nodejs/node"
+ },
+ "comments": "For Nodejs binding"
+ }
+ },
+ {
+ "component": {
+ "type": "git",
+ "git": {
+ "commitHash": "aead4d751c2101e23336aa73f2380df83e7a13f3",
+ "repositoryUrl": "https://github.com/pypa/manylinux"
+ },
+ "comments": "For building our CI build docker image"
+ }
+ },
+ {
+ "component": {
+ "type": "git",
+ "git": {
+ "commitHash": "c974557598645360fbabac71352b083117e3cc17",
+ "repositoryUrl": "https://gitlab.kitware.com/cmake/cmake"
+ },
+ "comments": "CMake 3.24.3. For building our CI build docker image"
+ }
+ },
+ {
+ "component": {
+ "type": "git",
+ "git": {
+ "commitHash": "1e5d33e9b9b8631b36f061103a30208b206fd03a",
+ "repositoryUrl": "https://github.com/python/cpython"
+ },
+ "comments": "Python 3.9.1"
+ }
+ },
+ {
+ "component": {
+ "type": "git",
+ "git": {
+ "commitHash": "6503f05dd59e26a9986bdea097b3da9b3546f45b",
+ "repositoryUrl": "https://github.com/python/cpython"
+ },
+ "comments": "Python 3.8.7"
+ }
+ },
+ {
+ "component": {
+ "type": "git",
+ "git": {
+ "commitHash": "13c94747c74437e594b7fc242ff7da668e81887c",
+ "repositoryUrl": "https://github.com/python/cpython"
+ },
+ "comments": "Python 3.7.9"
+ }
+ },
+ {
+ "component": {
+ "type": "git",
+ "git": {
+ "commitHash": "c0a9afe2ac1820409e6173bd1893ebee2cf50270",
+ "repositoryUrl": "https://github.com/python/cpython"
+ },
+ "comments": "Python 3.6.12"
+ }
+ },
+ {
+ "component": {
+ "type": "git",
+ "git": {
+ "commitHash": "426b022776672fdf3d71ddd98d89af341c88080f",
+ "repositoryUrl": "https://github.com/python/cpython"
+ },
+ "comments": "Python 3.5.10"
+ }
+ },
+ {
+ "component": {
+ "type": "pip",
+ "pip": {
+ "Name": "transformers",
+ "Version": "4.38.0"
+ },
+ "comments": "Installed in the training docker image"
+ }
+ },
+ {
+ "component": {
+ "type": "pip",
+ "pip": {
+ "Name": "msgpack",
+ "Version": "1.0.0"
+ },
+ "comments": "Installed in the training docker image"
+ }
+ },
+ {
+ "component": {
+ "type": "pip",
+ "pip": {
+ "Name": "tensorboardX",
+ "Version": "1.8"
+ },
+ "comments": "Installed in the training docker image"
+ }
+ },
+ {
+ "component": {
+ "type": "pip",
+ "pip": {
+ "Name": "tensorboard",
+ "Version": "2.3.0"
+ },
+ "comments": "Installed in the training docker image"
+ }
+ },
+ {
+ "component": {
+ "type": "git",
+ "git": {
+ "commitHash": "92cf3702fcfaadc84eb7bef59825a23e0cd84f56",
+ "repositoryUrl": "https://github.com/aappleby/smhasher"
+ },
+ "comments": "MurmurHash3"
+ }
+ },
+ {
+ "component": {
+ "type": "git",
+ "git": {
+ "commitHash": "b89da3c5a0aa18fb2c6163ad9984f81ab65b22e3",
+ "repositoryUrl": "https://github.com/mestevens/gtest-ios-framework"
+ },
+ "comments": "gtest-ios-framework"
+ }
+ },
+ {
+ "component": {
+ "type": "git",
+ "git": {
+ "commitHash": "277508879878e0a5b5b43599b1bea11f66eb3c6c",
+ "repositoryUrl": "https://github.com/dmlc/dlpack.git"
+ },
+ "comments": "dlpack"
+ }
+ },
+ {
+ "component": {
+ "Type": "other",
+ "Other": {
+ "Name": "SQLite3",
+ "Version": "3.22.0",
+ "DownloadUrl": "http://security.ubuntu.com/ubuntu/pool/main/s/sqlite3/libsqlite3-dev_3.22.0-1ubuntu0.4_amd64.deb"
+ }
+ }
+ },
+ {
+ "component": {
+ "type": "git",
+ "git": {
+ "commitHash": "9d0ef119d9fcb9139f831adc224857b791c81140",
+ "repositoryUrl": "https://github.com/dlfcn-win32/dlfcn-win32.git"
+ },
+ "comments": "dlfcn-win32"
+ }
+ },
+ {
+ "component": {
+ "type": "git",
+ "git": {
+ "commitHash": "6812205f18ca4ef54372e87e1a13ce4a859434df",
+ "repositoryUrl": "https://github.com/python-pillow/Pillow.git"
+ },
+ "comments": "python-pillow. Implementation logic for anti-aliasing copied by Resize CPU kernel."
+ }
+ },
+ {
+ "component": {
+ "type": "git",
+ "git": {
+ "commitHash": "e7248b26a1ed53fa030c5c459f7ea095dfd276ac",
+ "repositoryUrl": "https://gitlab.com/libeigen/eigen.git"
+ }
+ }
+ }
+ ],
+ "Version": 1
}
diff --git a/cgmanifests/generated/cgmanifest.json b/cgmanifests/generated/cgmanifest.json
index 55d5fae4dedcd..475f75b5bf19b 100644
--- a/cgmanifests/generated/cgmanifest.json
+++ b/cgmanifests/generated/cgmanifest.json
@@ -36,7 +36,7 @@
"component": {
"type": "git",
"git": {
- "commitHash": "f46495ea96f68fc3f6c394f099b2992743f6ff7f",
+ "commitHash": "4447c7562e3bc702ade25105912dce503f0c4010",
"repositoryUrl": "https://github.com/abseil/abseil-cpp.git"
},
"comments": "abseil_cpp"
@@ -122,16 +122,6 @@
"comments": "google_benchmark"
}
},
- {
- "component": {
- "type": "git",
- "git": {
- "commitHash": "13de152c2a1cd73ff4df97bd2c406b6d15d34af3",
- "repositoryUrl": "https://github.com/google/nsync.git"
- },
- "comments": "google_nsync"
- }
- },
{
"component": {
"type": "git",
@@ -206,7 +196,7 @@
"component": {
"type": "git",
"git": {
- "commitHash": "9f98e2ebe7507fe0774d06a44bbf4b0e82cc9ce7",
+ "commitHash": "bc0d2e35909b8456abe32f3b30a49bb0c125e8b7",
"repositoryUrl": "https://github.com/onnx/onnx-tensorrt.git"
},
"comments": "onnx_tensorrt"
@@ -356,7 +346,7 @@
"component": {
"type": "git",
"git": {
- "commitHash": "511eb80847afe6bded34ec491a38d5d78ba2d604",
+ "commitHash": "12a3b24c456cebd9fd11f23ac0164f78129b00c6",
"repositoryUrl": "https://github.com/google/dawn.git"
},
"comments": "dawn"
diff --git a/cmake/CMakeLists.txt b/cmake/CMakeLists.txt
index c0772c33f6e5d..d2fe7e7457983 100644
--- a/cmake/CMakeLists.txt
+++ b/cmake/CMakeLists.txt
@@ -86,7 +86,7 @@ option(onnxruntime_USE_CUDA "Build with CUDA support" OFF)
# use. If you hit any problem with that, please do not report it to GTest. Turn OFF the following build option instead.
cmake_dependent_option(onnxruntime_ENABLE_CUDA_EP_INTERNAL_TESTS "Build with CUDA unit tests" OFF "onnxruntime_USE_CUDA;onnxruntime_BUILD_UNIT_TESTS" OFF)
-option(onnxruntime_USE_CUDA_NHWC_OPS "Build CUDA with NHWC op support" OFF)
+cmake_dependent_option(onnxruntime_USE_CUDA_NHWC_OPS "Build CUDA with NHWC op support" ON "onnxruntime_USE_CUDA" OFF)
option(onnxruntime_CUDA_MINIMAL "Build CUDA without any operations apart from memcpy ops. Useful for a very minimal TRT build" OFF)
option(onnxruntime_ENABLE_CUDA_LINE_NUMBER_INFO "When building with CUDA support, generate device code line number information." OFF)
option(onnxruntime_USE_OPENVINO "Build with OpenVINO support" OFF)
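Several hunks in this file trade a plain `option()` for `cmake_dependent_option()`, as with the NHWC option above. For reference, the documented signature is `cmake_dependent_option(<option> "<help>" <default> "<conditions>" <fallback>)`: the option takes `<default>` only while every `;`-separated condition holds, and is otherwise forced to `<fallback>` and hidden from the cache. A generic sketch with hypothetical names, not code from this PR:

include(CMakeDependentOption)
# MY_FEATURE defaults to ON while MY_BACKEND is enabled; otherwise it is
# forced OFF and dropped from the cache, like onnxruntime_USE_CUDA_NHWC_OPS above.
cmake_dependent_option(MY_FEATURE "Example dependent option" ON "MY_BACKEND" OFF)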
@@ -102,10 +102,10 @@ option(onnxruntime_BUILD_CSHARP "Build C# library" OFF)
option(onnxruntime_BUILD_OBJC "Build Objective-C library" OFF)
option(onnxruntime_USE_PREINSTALLED_EIGEN "Use pre-installed EIGEN. Need to provide eigen_SOURCE_PATH if turn this on." OFF)
option(onnxruntime_BUILD_BENCHMARKS "Build ONNXRuntime micro-benchmarks" OFF)
-option(onnxruntime_USE_LLVM "Build TVM with LLVM" OFF)
option(onnxruntime_USE_VSINPU "Build with VSINPU support" OFF)
cmake_dependent_option(onnxruntime_USE_FLASH_ATTENTION "Build flash attention kernel for scaled dot product attention" ON "onnxruntime_USE_CUDA" OFF)
+cmake_dependent_option(onnxruntime_USE_LEAN_ATTENTION "Build lean attention kernel for scaled dot product attention" ON "onnxruntime_USE_CUDA; NOT WIN32" OFF)
option(onnxruntime_USE_MEMORY_EFFICIENT_ATTENTION "Build memory efficient attention kernel for scaled dot product attention" ON)
option(onnxruntime_BUILD_FOR_NATIVE_MACHINE "Enable this option for turning on optimization specific to this machine" OFF)
@@ -128,6 +128,10 @@ option(onnxruntime_DONT_VECTORIZE "Do not vectorize operations in Eigen" OFF)
option(onnxruntime_USE_FULL_PROTOBUF "Link to libprotobuf instead of libprotobuf-lite when this option is ON" OFF)
option(onnxruntime_DEBUG_NODE_INPUTS_OUTPUTS "Dump debug information about node inputs and outputs when executing the model." OFF)
cmake_dependent_option(onnxruntime_DEBUG_NODE_INPUTS_OUTPUTS_ENABLE_DUMP_TO_SQLDB "Build dump debug information about node inputs and outputs with support for sql database." OFF "onnxruntime_DEBUG_NODE_INPUTS_OUTPUTS" OFF)
+
+# When loading a delay-loaded DLL, Windows searches the main EXE's folder first.
+# In a Python process, it searches where python.exe lives, but it doesn't search the Python package's installation folder. Therefore we cannot enable this flag when Python is enabled.
+cmake_dependent_option(onnxruntime_ENABLE_DELAY_LOADING_WIN_DLLS "Delay-load some of the dependent DLLs that are part of the OS" ON "WIN32;NOT GDK_PLATFORM;NOT onnxruntime_ENABLE_PYTHON" OFF)
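The option above only gates the behavior; the delay-load requests themselves are made at link time (this PR routes `${onnxruntime_DELAYLOAD_FLAGS}` through `target_link_options` on Windows in the onnxruntime.cmake hunk further down). A minimal sketch of the underlying MSVC mechanism, with hypothetical target and DLL names:

if(onnxruntime_ENABLE_DELAY_LOADING_WIN_DLLS)
  # each delay-loaded DLL is named individually; delayimp.lib provides the
  # delay-load helper that resolves the import on first call
  target_link_options(example_target PRIVATE "/DELAYLOAD:dxgi.dll")
  target_link_libraries(example_target PRIVATE delayimp)
endif()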
option(onnxruntime_USE_DML "Build with DirectML support" OFF)
option(onnxruntime_USE_MIGRAPHX "Build with AMDMIGraphX support" OFF)
option(onnxruntime_USE_WINML "Build with WinML support" OFF)
@@ -140,13 +144,15 @@ option(onnxruntime_USE_TELEMETRY "Build with Telemetry" OFF)
cmake_dependent_option(onnxruntime_USE_MIMALLOC "Override new/delete and arena allocator with mimalloc" OFF "WIN32;NOT onnxruntime_USE_CUDA;NOT onnxruntime_USE_OPENVINO" OFF)
option(onnxruntime_USE_CANN "Build with CANN support" OFF)
option(onnxruntime_USE_ROCM "Build with AMD GPU support" OFF)
-option(onnxruntime_USE_TVM "Build with TVM support" OFF)
-option(onnxruntime_TVM_CUDA_RUNTIME "Build TVM with CUDA support" OFF)
-option(onnxruntime_TVM_USE_LLVM "Build TVM with LLVM. Set customized path to llvm-config.exe here if need" OFF)
-option(onnxruntime_TVM_USE_HASH "Build ipp-crypto library for support hash algorithm. It is defined for TVM only")
option(onnxruntime_USE_XNNPACK "Build with XNNPACK support. Provides an alternative math library on ARM, WebAssembly and x86." OFF)
option(onnxruntime_USE_WEBNN "Build with WebNN support. Enable hardware acceleration in web browsers." OFF)
option(onnxruntime_USE_WEBGPU "Build with WebGPU support. Enable WebGPU via C/C++ interface." OFF)
+option(onnxruntime_USE_EXTERNAL_DAWN "Build with treating Dawn as external dependency. Will not link Dawn at build time." OFF)
+option(onnxruntime_CUSTOM_DAWN_SRC_PATH "Path to custom Dawn src dir.")
+option(onnxruntime_BUILD_DAWN_MONOLITHIC_LIBRARY "Build Dawn as a monolithic library" OFF)
+# The following 2 options are only for Windows
+option(onnxruntime_ENABLE_DAWN_BACKEND_VULKAN "Enable Vulkan backend for Dawn (on Windows)" OFF)
+option(onnxruntime_ENABLE_DAWN_BACKEND_D3D12 "Enable D3D12 backend for Dawn (on Windows)" ON)
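Per the onnxruntime_external_deps.cmake hunk later in this diff, a Windows WebGPU build must leave at least one of these two Dawn backends enabled. A configure-time sketch, mirroring the `build.py --cmake_extra_defines` usage shown later for the custom Dawn source path (the remainder of the command line is assumed):

build.py --use_webgpu --cmake_extra_defines "onnxruntime_ENABLE_DAWN_BACKEND_VULKAN=ON"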
# Options related to reducing the binary size produced by the build
# XNNPACK EP requires the internal NHWC contrib ops to be available, so this option must be OFF when onnxruntime_USE_XNNPACK is ON
@@ -199,6 +205,7 @@ option(onnxruntime_WEBASSEMBLY_RUN_TESTS_IN_BROWSER "Enable this option to run t
option(onnxruntime_ENABLE_WEBASSEMBLY_DEBUG_INFO "Enable this option to turn on DWARF format debug info" OFF)
option(onnxruntime_ENABLE_WEBASSEMBLY_PROFILING "Enable this option to turn on WebAssembly profiling and preserve function names" OFF)
option(onnxruntime_ENABLE_WEBASSEMBLY_OUTPUT_OPTIMIZED_MODEL "Enable this option to allow WebAssembly to output optimized model" OFF)
+option(onnxruntime_ENABLE_WEBASSEMBLY_MEMORY64 "Enable this option to allow WebAssembly to use 64-bit memory" OFF)
# Enable bitcode for iOS
option(onnxruntime_ENABLE_BITCODE "Enable bitcode for iOS only" OFF)
@@ -250,6 +257,7 @@ cmake_dependent_option(MSVC_Z7_OVERRIDE "replacing /Zi and /ZI with /Z7 when usi
option(onnxruntime_USE_AZURE "Build with azure inferencing support" OFF)
option(onnxruntime_USE_LOCK_FREE_QUEUE "Build with lock-free task queue for threadpool." OFF)
+option(onnxruntime_FORCE_GENERIC_ALGORITHMS "Disable optimized arch-specific algorithms. Use only for testing and debugging generic algorithms." OFF)
# ENABLE_TRAINING includes all training functionality
# The following 2 entry points
@@ -289,12 +297,50 @@ if (onnxruntime_USE_ROCM)
message(FATAL_ERROR "ROCM does not support build with CUDA!")
endif()
+ # replicate strategy used by pytorch to get ROCM_VERSION
+ # https://github.com/pytorch/pytorch/blob/5c5b71b6eebae76d744261715231093e62f0d090/cmake/public/LoadHIP.cmake
+ # with modification
+ if (EXISTS "${onnxruntime_ROCM_HOME}/.info/version")
+ message("\n***** ROCm version from ${onnxruntime_ROCM_HOME}/.info/version ****\n")
+ file(READ "${onnxruntime_ROCM_HOME}/.info/version" ROCM_VERSION_DEV_RAW)
+ string(REGEX MATCH "^([0-9]+)\.([0-9]+)\.([0-9]+)-.*$" ROCM_VERSION_MATCH ${ROCM_VERSION_DEV_RAW})
+ elseif (EXISTS "${onnxruntime_ROCM_HOME}/include/rocm_version.h")
+ message("\n***** ROCm version from ${onnxruntime_ROCM_HOME}/include/rocm_version.h ****\n")
+ file(READ "${onnxruntime_ROCM_HOME}/include/rocm_version.h" ROCM_VERSION_H_RAW)
+ string(REGEX MATCH "\"([0-9]+)\.([0-9]+)\.([0-9]+).*\"" ROCM_VERSION_MATCH ${ROCM_VERSION_H_RAW})
+ elseif (EXISTS "${onnxruntime_ROCM_HOME}/include/rocm-core/rocm_version.h")
+ message("\n***** ROCm version from ${onnxruntime_ROCM_HOME}/include/rocm-core/rocm_version.h ****\n")
+ file(READ "${onnxruntime_ROCM_HOME}/include/rocm-core/rocm_version.h" ROCM_VERSION_H_RAW)
+ string(REGEX MATCH "\"([0-9]+)\.([0-9]+)\.([0-9]+).*\"" ROCM_VERSION_MATCH ${ROCM_VERSION_H_RAW})
+ endif()
+
+ if (ROCM_VERSION_MATCH)
+ set(ROCM_VERSION_DEV_MAJOR ${CMAKE_MATCH_1})
+ set(ROCM_VERSION_DEV_MINOR ${CMAKE_MATCH_2})
+ set(ROCM_VERSION_DEV_PATCH ${CMAKE_MATCH_3})
+ set(ROCM_VERSION_DEV "${ROCM_VERSION_DEV_MAJOR}.${ROCM_VERSION_DEV_MINOR}.${ROCM_VERSION_DEV_PATCH}")
+ math(EXPR ROCM_VERSION_DEV_INT "(${ROCM_VERSION_DEV_MAJOR}*10000) + (${ROCM_VERSION_DEV_MINOR}*100) + ${ROCM_VERSION_DEV_PATCH}")
+
+ message("ROCM_VERSION_DEV: ${ROCM_VERSION_DEV}")
+ message("ROCM_VERSION_DEV_MAJOR: ${ROCM_VERSION_DEV_MAJOR}")
+ message("ROCM_VERSION_DEV_MINOR: ${ROCM_VERSION_DEV_MINOR}")
+ message("ROCM_VERSION_DEV_PATCH: ${ROCM_VERSION_DEV_PATCH}")
+ message("ROCM_VERSION_DEV_INT: ${ROCM_VERSION_DEV_INT}")
+ else()
+ message(FATAL_ERROR "Cannot determine ROCm version string")
+ endif()
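As a worked example of the arithmetic above (the input is assumed for illustration, not taken from this PR): a `.info/version` file containing `6.2.0-66` matches major/minor/patch 6/2/0, so:

# ROCM_VERSION_DEV     = "6.2.0"
# ROCM_VERSION_DEV_INT = 6*10000 + 2*100 + 0 = 60200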
+
+
if (NOT CMAKE_HIP_COMPILER)
set(CMAKE_HIP_COMPILER "${onnxruntime_ROCM_HOME}/llvm/bin/clang++")
endif()
if (NOT CMAKE_HIP_ARCHITECTURES)
- set(CMAKE_HIP_ARCHITECTURES "gfx906;gfx908;gfx90a;gfx1030;gfx1100;gfx1101")
+ if (ROCM_VERSION_DEV VERSION_LESS "6.2")
+ message(FATAL_ERROR "CMAKE_HIP_ARCHITECTURES is not set when ROCm version < 6.2")
+ else()
+ set(CMAKE_HIP_ARCHITECTURES "gfx908;gfx90a;gfx1030;gfx1100;gfx1101;gfx940;gfx941;gfx942;gfx1200;gfx1201")
+ endif()
endif()
file(GLOB rocm_cmake_components ${onnxruntime_ROCM_HOME}/lib/cmake/*)
@@ -326,35 +372,6 @@ if (onnxruntime_USE_ROCM)
set(onnxruntime_HIPIFY_PERL ${HIPIFY_PERL_PATH}/hipify-perl)
endif()
- # replicate strategy used by pytorch to get ROCM_VERSION
- # https://github.com/pytorch/pytorch/blob/5c5b71b6eebae76d744261715231093e62f0d090/cmake/public/LoadHIP.cmake
- # with modification
- if (EXISTS "${onnxruntime_ROCM_HOME}/.info/version")
- file(READ "${onnxruntime_ROCM_HOME}/.info/version" ROCM_VERSION_DEV_RAW)
- string(REGEX MATCH "^([0-9]+)\.([0-9]+)\.([0-9]+)-.*$" ROCM_VERSION_MATCH ${ROCM_VERSION_DEV_RAW})
- elseif (EXISTS "${onnxruntime_ROCM_HOME}/include/rocm_version.h")
- file(READ "${onnxruntime_ROCM_HOME}/include/rocm_version.h" ROCM_VERSION_H_RAW)
- string(REGEX MATCH "\"([0-9]+)\.([0-9]+)\.([0-9]+).*\"" ROCM_VERSION_MATCH ${ROCM_VERSION_H_RAW})
- elseif (EXISTS "${onnxruntime_ROCM_HOME}/include/rocm-core/rocm_version.h")
- file(READ "${onnxruntime_ROCM_HOME}/include/rocm-core/rocm_version.h" ROCM_VERSION_H_RAW)
- string(REGEX MATCH "\"([0-9]+)\.([0-9]+)\.([0-9]+).*\"" ROCM_VERSION_MATCH ${ROCM_VERSION_H_RAW})
- endif()
-
- if (ROCM_VERSION_MATCH)
- set(ROCM_VERSION_DEV_MAJOR ${CMAKE_MATCH_1})
- set(ROCM_VERSION_DEV_MINOR ${CMAKE_MATCH_2})
- set(ROCM_VERSION_DEV_PATCH ${CMAKE_MATCH_3})
- set(ROCM_VERSION_DEV "${ROCM_VERSION_DEV_MAJOR}.${ROCM_VERSION_DEV_MINOR}.${ROCM_VERSION_DEV_PATCH}")
- math(EXPR ROCM_VERSION_DEV_INT "(${ROCM_VERSION_DEV_MAJOR}*10000) + (${ROCM_VERSION_DEV_MINOR}*100) + ${ROCM_VERSION_DEV_PATCH}")
- else()
- message(FATAL_ERROR "Cannot determine ROCm version string")
- endif()
- message("\n***** ROCm version from ${onnxruntime_ROCM_HOME}/.info/version ****\n")
- message("ROCM_VERSION_DEV: ${ROCM_VERSION_DEV}")
- message("ROCM_VERSION_DEV_MAJOR: ${ROCM_VERSION_DEV_MAJOR}")
- message("ROCM_VERSION_DEV_MINOR: ${ROCM_VERSION_DEV_MINOR}")
- message("ROCM_VERSION_DEV_PATCH: ${ROCM_VERSION_DEV_PATCH}")
- message("ROCM_VERSION_DEV_INT: ${ROCM_VERSION_DEV_INT}")
message("\n***** HIP LANGUAGE CONFIG INFO ****\n")
message("CMAKE_HIP_COMPILER: ${CMAKE_HIP_COMPILER}")
message("CMAKE_HIP_ARCHITECTURES: ${CMAKE_HIP_ARCHITECTURES}")
@@ -751,21 +768,30 @@ if (onnxruntime_USE_CUDA)
if (onnxruntime_DISABLE_CONTRIB_OPS)
set(onnxruntime_USE_FLASH_ATTENTION OFF)
+ set(onnxruntime_USE_LEAN_ATTENTION OFF)
set(onnxruntime_USE_MEMORY_EFFICIENT_ATTENTION OFF)
endif()
+
if (CMAKE_CUDA_COMPILER_VERSION VERSION_LESS 11.6)
message( STATUS "Turn off flash attention since CUDA compiler version < 11.6")
set(onnxruntime_USE_FLASH_ATTENTION OFF)
+ set(onnxruntime_USE_LEAN_ATTENTION OFF)
set(onnxruntime_USE_MEMORY_EFFICIENT_ATTENTION OFF)
elseif(WIN32 AND CMAKE_CUDA_COMPILER_VERSION VERSION_LESS 12)
message( STATUS "Flash-Attention unsupported in Windows with CUDA compiler version < 12.0")
set(onnxruntime_USE_FLASH_ATTENTION OFF)
endif()
+
if (CMAKE_CUDA_COMPILER_VERSION VERSION_LESS 11.4)
message( FATAL_ERROR "Failed build due to CUDA compiler version < 11.4")
endif()
+ if (WIN32)
+ message( STATUS "Lean Attention unsupported in Windows")
+ set(onnxruntime_USE_LEAN_ATTENTION OFF)
+ endif()
else()
set(onnxruntime_USE_FLASH_ATTENTION OFF)
+ set(onnxruntime_USE_LEAN_ATTENTION OFF)
set(onnxruntime_USE_MEMORY_EFFICIENT_ATTENTION OFF)
endif()
@@ -779,6 +805,13 @@ if (onnxruntime_USE_CUDA)
list(APPEND ORT_PROVIDER_FLAGS -DUSE_FLASH_ATTENTION=1)
list(APPEND ORT_PROVIDER_CMAKE_FLAGS -Donnxruntime_USE_FLASH_ATTENTION=1)
endif()
+
+ if (onnxruntime_USE_LEAN_ATTENTION)
+ message( STATUS "Enable lean attention for CUDA EP")
+ list(APPEND ORT_PROVIDER_FLAGS -DUSE_LEAN_ATTENTION=1)
+ list(APPEND ORT_PROVIDER_CMAKE_FLAGS -Donnxruntime_USE_LEAN_ATTENTION=1)
+ endif()
+
if (onnxruntime_USE_MEMORY_EFFICIENT_ATTENTION)
message( STATUS "Enable memory efficient attention for CUDA EP")
list(APPEND ORT_PROVIDER_FLAGS -DUSE_MEMORY_EFFICIENT_ATTENTION=1)
@@ -874,11 +907,6 @@ if (onnxruntime_USE_SNPE)
list(APPEND ONNXRUNTIME_PROVIDER_NAMES snpe)
list(APPEND ORT_PROVIDER_CMAKE_FLAGS -Donnxruntime_USE_SNPE=1)
endif()
-if (onnxruntime_USE_TVM)
- list(APPEND ORT_PROVIDER_FLAGS -DUSE_TVM=1)
- list(APPEND ORT_PROVIDER_CMAKE_FLAGS -Donnxruntime_USE_TVM=1)
- list(APPEND ONNXRUNTIME_PROVIDER_NAMES tvm)
-endif()
if (onnxruntime_USE_WINML)
list(APPEND ORT_PROVIDER_FLAGS -DUSE_WINML=1)
list(APPEND ORT_PROVIDER_CMAKE_FLAGS -Donnxruntime_USE_WINML=1)
@@ -931,6 +959,18 @@ if (onnxruntime_USE_WEBGPU)
list(APPEND ORT_PROVIDER_FLAGS -DUSE_WEBGPU=1)
list(APPEND ORT_PROVIDER_CMAKE_FLAGS -Donnxruntime_USE_WEBGPU=1)
list(APPEND ONNXRUNTIME_PROVIDER_NAMES webgpu)
+ if (onnxruntime_BUILD_DAWN_MONOLITHIC_LIBRARY)
+ list(APPEND ORT_PROVIDER_FLAGS -DBUILD_DAWN_MONOLITHIC_LIBRARY=1)
+ endif()
+ if (onnxruntime_USE_EXTERNAL_DAWN)
+ list(APPEND ORT_PROVIDER_FLAGS -DUSE_EXTERNAL_DAWN=1)
+ endif()
+ if (onnxruntime_ENABLE_DAWN_BACKEND_VULKAN)
+ list(APPEND ORT_PROVIDER_FLAGS -DDAWN_ENABLE_VULKAN=1)
+ endif()
+ if (onnxruntime_ENABLE_DAWN_BACKEND_D3D12)
+ list(APPEND ORT_PROVIDER_FLAGS -DDAWN_ENABLE_D3D12=1)
+ endif()
endif()
if (onnxruntime_USE_CANN)
list(APPEND ORT_PROVIDER_FLAGS -DUSE_CANN=1)
@@ -946,6 +986,10 @@ if (onnxruntime_USE_LOCK_FREE_QUEUE)
add_compile_definitions(USE_LOCK_FREE_QUEUE)
endif()
+if (onnxruntime_FORCE_GENERIC_ALGORITHMS)
+ add_compile_definitions(FORCE_GENERIC_ALGORITHMS)
+endif()
+
if (onnxruntime_ENABLE_LAZY_TENSOR)
# To support LazyTensor, ORT needs to call Python function from C/C++.
# so onnxruntime_ENABLE_PYTHON is required.
@@ -1065,8 +1109,6 @@ function(onnxruntime_set_compile_flags target_name)
if (CMAKE_CXX_COMPILER_ID STREQUAL "IBMClang")
target_compile_options(${target_name} PRIVATE "-Wno-unused-function")
endif()
- target_compile_definitions(${target_name} PUBLIC -DNSYNC_ATOMIC_CPP11)
- onnxruntime_add_include_to_target(${target_name} nsync::nsync_cpp)
endif()
foreach(ORT_FLAG ${ORT_PROVIDER_FLAGS})
target_compile_definitions(${target_name} PRIVATE ${ORT_FLAG})
@@ -1280,50 +1322,6 @@ if (onnxruntime_USE_DNNL)
add_compile_definitions(DNNL_OPENMP)
endif()
-# TVM EP
-if (onnxruntime_USE_TVM)
- if (NOT TARGET tvm)
- message(STATUS "Include TVM(*).")
- include(tvm)
- endif()
-
- # ipp-crypto
- if (onnxruntime_TVM_USE_HASH)
- message(STATUS "Include ipp-crypto(*).")
- include(ipp-crypto)
- endif()
-
- # TVM
- if (onnxruntime_TVM_USE_LLVM)
- set(USE_LLVM "${onnxruntime_TVM_USE_LLVM}" CACHE STRING "Path to LLVM for correct TVM build")
- elseif(onnxruntime_USE_LLVM)
- set(USE_LLVM ON CACHE BOOL "Only defined for TVM")
- endif()
-
- if (onnxruntime_TVM_CUDA_RUNTIME)
- set(USE_CUDA ON CACHE BOOL "Only defined for TVM" FORCE)
- endif()
-
- # TODO(vvchernov): customized tvm logger is hidden due to the issue on TVM side (https://github.com/apache/tvm/issues/10139)
- # add_compile_definitions(TVM_LOG_CUSTOMIZE=1)
- # add_library(tvm_custom_logger STATIC ${ONNXRUNTIME_ROOT}/core/providers/tvm/custom_logging.cc)
-
- set(USE_OPENMP gnu CACHE STRING "Only defined for TVM")
- add_subdirectory(${tvm_SOURCE_DIR} ${tvm_BINARY_DIR} EXCLUDE_FROM_ALL)
-
- set_target_properties(tvm PROPERTIES FOLDER ${tvm_SOURCE_DIR})
- # target_link_libraries(tvm PUBLIC tvm_custom_logger)
-
- set(TVM_INCLUDES ${tvm_SOURCE_DIR}/include
- ${tvm_SOURCE_DIR}/3rdparty/dmlc-core/include
- ${tvm_SOURCE_DIR}/3rdparty/dlpack/include
-                   $<TARGET_PROPERTY:tvm,INTERFACE_INCLUDE_DIRECTORIES>)
-
- set(onnxruntime_tvm_libs onnxruntime_providers_tvm)
- list(APPEND onnxruntime_EXTERNAL_LIBRARIES tvm)
- list(APPEND onnxruntime_EXTERNAL_DEPENDENCIES tvm)
-endif()
-
# onnxruntime-extensions
if (onnxruntime_USE_EXTENSIONS)
include(extensions)
@@ -1334,7 +1332,7 @@ endif()
#Adjust warning flags
set_msvc_c_cpp_compiler_warning_level(4)
-set(onnxruntime_DELAYLOAD_FLAGS "")
+set(onnxruntime_DELAYLOAD_FLAGS )
include_directories(
${ONNXRUNTIME_INCLUDE_DIR}
@@ -1352,6 +1350,7 @@ if (onnxruntime_USE_OPENVINO)
add_definitions(-DUSE_OPENVINO=1)
if(onnxruntime_NPU_NO_FALLBACK)
+ add_definitions(-DOPENVINO_CONFIG_NPU=1)
add_definitions(-DOPENVINO_DISABLE_NPU_FALLBACK=1)
endif()
@@ -1654,7 +1653,6 @@ if (WIN32)
list(APPEND onnxruntime_EXTERNAL_LIBRARIES advapi32)
endif()
else()
- list(APPEND onnxruntime_EXTERNAL_LIBRARIES nsync::nsync_cpp)
list(APPEND onnxruntime_EXTERNAL_LIBRARIES ${ICONV_LIB} ${CMAKE_DL_LIBS} Threads::Threads)
endif()
diff --git a/cmake/adjust_global_compile_flags.cmake b/cmake/adjust_global_compile_flags.cmake
index c04d67ea4ce3f..dbbf685346532 100644
--- a/cmake/adjust_global_compile_flags.cmake
+++ b/cmake/adjust_global_compile_flags.cmake
@@ -60,6 +60,11 @@ if (CMAKE_SYSTEM_NAME STREQUAL "Emscripten")
string(APPEND CMAKE_CXX_FLAGS " -s DISABLE_EXCEPTION_CATCHING=0")
endif()
+ if (onnxruntime_ENABLE_WEBASSEMBLY_MEMORY64)
+ string(APPEND CMAKE_C_FLAGS " -DORT_WASM64")
+ string(APPEND CMAKE_CXX_FLAGS " -DORT_WASM64")
+ endif()
+
# Build WebAssembly with multi-threads support.
if (onnxruntime_ENABLE_WEBASSEMBLY_THREADS)
string(APPEND CMAKE_C_FLAGS " -pthread -Wno-pthreads-mem-growth")
diff --git a/cmake/deps.txt b/cmake/deps.txt
index c1bb7ffe98a06..ed41ad5b0ceb1 100644
--- a/cmake/deps.txt
+++ b/cmake/deps.txt
@@ -12,7 +12,7 @@
# NOTE: You must run deps_update_and_upload.py and generate_cgmanifest.py when ready to test your changes in a CI.
# See https://microsoft.sharepoint.com/teams/ONNX2/_layouts/OneNote.aspx?id=%2Fteams%2FONNX2%2FShared%20Documents%2FNotebooks%2FONNX%20Ecosystem%20Team%20Notebook&wd=target%28Development.one%7C63D3AB47-51D1-4A62-9965-66882234BD44%2FAdd%20or%20update%20a%20dependency%20in%20deps.txt%7C0E9ED71D-89D5-40FA-B05F-C0123289C591%2F%29
#
-abseil_cpp;https://github.com/abseil/abseil-cpp/archive/f46495ea96f68fc3f6c394f099b2992743f6ff7f.zip;0e2b6d1dc7f0a808d1e23f7dd985f7bc18d52cbc
+abseil_cpp;https://github.com/abseil/abseil-cpp/archive/refs/tags/20240722.0.zip;36ee53eb1466fb6e593fc5c286680de31f8a494a
coremltools;https://github.com/apple/coremltools/archive/refs/tags/7.1.zip;f1bab0f30966f2e217d8e01207d518f230a1641a
cxxopts;https://github.com/jarro2783/cxxopts/archive/3c73d91c0b04e2b59462f0a741be8c07024c1bc0.zip;6c6ca7f8480b26c8d00476e0e24b7184717fe4f0
date;https://github.com/HowardHinnant/date/archive/refs/tags/v3.0.1.zip;2dac0c81dc54ebdd8f8d073a75c053b04b56e159
@@ -27,7 +27,6 @@ flatbuffers;https://github.com/google/flatbuffers/archive/refs/tags/v23.5.26.zip
fp16;https://github.com/Maratyszcza/FP16/archive/0a92994d729ff76a58f692d3028ca1b64b145d91.zip;b985f6985a05a1c03ff1bb71190f66d8f98a1494
fxdiv;https://github.com/Maratyszcza/FXdiv/archive/63058eff77e11aa15bf531df5dd34395ec3017c8.zip;a5658f4036402dbca7cebee32be57fb8149811e1
google_benchmark;https://github.com/google/benchmark/archive/refs/tags/v1.8.5.zip;cd47d3d272faf353600c8cc2fdec2b52d6f69177
-google_nsync;https://github.com/google/nsync/archive/refs/tags/1.26.0.zip;5e7c00ef6bf5b787386fc040067903ec774e2752
googletest;https://github.com/google/googletest/archive/refs/tags/v1.15.0.zip;9d2d0af8d77ac726ea55d44a8fa727ec98311349
#xnnpack 2024.09.04
googlexnnpack;https://github.com/google/XNNPACK/archive/309b75c9e56e0a674bf78d59872ce131f814dfb6.zip;39FA5259EAEACE0547284B63D5CEDC4F05553F5A
@@ -37,8 +36,8 @@ microsoft_wil;https://github.com/microsoft/wil/archive/refs/tags/v1.0.230629.1.z
mimalloc;https://github.com/microsoft/mimalloc/archive/refs/tags/v2.1.1.zip;d5ee7d34223d0567892db5179849939c8769dc41
mp11;https://github.com/boostorg/mp11/archive/refs/tags/boost-1.82.0.zip;9bc9e01dffb64d9e0773b2e44d2f22c51aace063
onnx;https://github.com/onnx/onnx/archive/refs/tags/v1.17.0.zip;13a60ac5217c104139ce0fd024f48628e7bcf5bc
-# Use the latest commit of 10.4-GA-ORT-DDS
-onnx_tensorrt;https://github.com/onnx/onnx-tensorrt/archive/9f98e2ebe7507fe0774d06a44bbf4b0e82cc9ce7.zip;1d92137f424513bce20033ab4fb31cc0be8d1185
+# Use the latest commit of 10.6-GA-ORT-DDS
+onnx_tensorrt;https://github.com/onnx/onnx-tensorrt/archive/bc0d2e35909b8456abe32f3b30a49bb0c125e8b7.zip;f233ae871ad82c023da62e5dd620639f00bc2d15
protobuf;https://github.com/protocolbuffers/protobuf/archive/refs/tags/v21.12.zip;7cf2733949036c7d52fda017badcab093fe73bfa
protoc_win64;https://github.com/protocolbuffers/protobuf/releases/download/v21.12/protoc-21.12-win64.zip;b4521f7ada5b260380f94c4bd7f1b7684c76969a
protoc_win32;https://github.com/protocolbuffers/protobuf/releases/download/v21.12/protoc-21.12-win32.zip;3688010318192c46ce73213cdfb6b3e5656da874
@@ -59,5 +58,5 @@ extensions;https://github.com/microsoft/onnxruntime-extensions/archive/94142d839
composable_kernel;https://github.com/ROCmSoftwarePlatform/composable_kernel/archive/204da9c522cebec5220bba52cd3542ebcaf99e7a.zip;1827348efd47831c13074245274d41b7cae8a557
directx_headers;https://github.com/microsoft/DirectX-Headers/archive/refs/tags/v1.613.1.zip;47653509a3371eabb156360f42faf582f314bf2e
cudnn_frontend;https://github.com/NVIDIA/cudnn-frontend/archive/refs/tags/v1.7.0.zip;d0753d8d5b39947ca0729d7773cb84653a129eb1
-dawn;https://github.com/google/dawn/archive/511eb80847afe6bded34ec491a38d5d78ba2d604.zip;c493f5aca5586f6634e25d0121c85df71189fb99
+dawn;https://github.com/google/dawn/archive/12a3b24c456cebd9fd11f23ac0164f78129b00c6.zip;ad428f6dc16f1336d584f7bad5714e1097dafc43
kleidiai;https://gitlab.arm.com/kleidi/kleidiai/-/archive/v0.2.0/kleidiai-v0.2.0.zip;B1E3173992FD91F20DB904AB77D6E901778C2681
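Each deps.txt entry follows the `name;download URL;SHA1-of-archive` layout visible above, which is why every dependency bump in this hunk (abseil_cpp, onnx_tensorrt, dawn) changes the URL and the hash together. A hypothetical new entry would look like:

mydep;https://github.com/example/mydep/archive/abc123.zip;0123456789abcdef0123456789abcdef01234567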
diff --git a/cmake/external/abseil-cpp.cmake b/cmake/external/abseil-cpp.cmake
index dda7c5ff19ba4..7b6e2141eeb1b 100644
--- a/cmake/external/abseil-cpp.cmake
+++ b/cmake/external/abseil-cpp.cmake
@@ -27,7 +27,7 @@ FetchContent_Declare(
URL ${DEP_URL_abseil_cpp}
URL_HASH SHA1=${DEP_SHA1_abseil_cpp}
PATCH_COMMAND ${ABSL_PATCH_COMMAND}
- FIND_PACKAGE_ARGS NAMES absl
+ FIND_PACKAGE_ARGS 20240722 NAMES absl
)
onnxruntime_fetchcontent_makeavailable(abseil_cpp)
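Adding `20240722` to `FIND_PACKAGE_ARGS` gives the `find_package` probe that `FetchContent_MakeAvailable` tries before downloading a minimum version, so a preinstalled absl older than the 20240722.0 tag now pinned in deps.txt is rejected in favor of the fetched copy. Roughly, the probe becomes (a sketch of CMake's documented redirection, not literal code from this file):

find_package(abseil_cpp 20240722 NAMES absl)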
diff --git a/cmake/external/abseil-cpp.natvis b/cmake/external/abseil-cpp.natvis
index a4fb63b6a8377..e995e215432a2 100644
--- a/cmake/external/abseil-cpp.natvis
+++ b/cmake/external/abseil-cpp.natvis
@@ -1,6 +1,6 @@
-
+
diff --git a/cmake/external/composable_kernel.cmake b/cmake/external/composable_kernel.cmake
index 4230eb8f4259b..b388a01209f4e 100644
--- a/cmake/external/composable_kernel.cmake
+++ b/cmake/external/composable_kernel.cmake
@@ -1,10 +1,12 @@
-set(PATCH ${PROJECT_SOURCE_DIR}/patches/composable_kernel/Fix_Clang_Build.patch)
+set(PATCH_CLANG ${PROJECT_SOURCE_DIR}/patches/composable_kernel/Fix_Clang_Build.patch)
+set(PATCH_GFX12X ${PROJECT_SOURCE_DIR}/patches/composable_kernel/Add_gfx12x_support.patch)
include(FetchContent)
FetchContent_Declare(composable_kernel
URL ${DEP_URL_composable_kernel}
URL_HASH SHA1=${DEP_SHA1_composable_kernel}
- PATCH_COMMAND ${Patch_EXECUTABLE} --binary --ignore-whitespace -p1 < ${PATCH}
+ PATCH_COMMAND ${Patch_EXECUTABLE} --binary --ignore-whitespace -p1 < ${PATCH_CLANG} &&
+ ${Patch_EXECUTABLE} --binary --ignore-whitespace -p1 < ${PATCH_GFX12X}
)
FetchContent_GetProperties(composable_kernel)
diff --git a/cmake/external/dml.cmake b/cmake/external/dml.cmake
index e03506de12728..3cfcdd4b04c62 100644
--- a/cmake/external/dml.cmake
+++ b/cmake/external/dml.cmake
@@ -41,7 +41,7 @@ if (NOT onnxruntime_USE_CUSTOM_DIRECTML)
set(NUGET_CONFIG ${PROJECT_SOURCE_DIR}/../NuGet.config)
set(PACKAGES_CONFIG ${PROJECT_SOURCE_DIR}/../packages.config)
get_filename_component(PACKAGES_DIR ${CMAKE_CURRENT_BINARY_DIR}/../packages ABSOLUTE)
- set(DML_PACKAGE_DIR ${PACKAGES_DIR}/Microsoft.AI.DirectML.1.15.2)
+ set(DML_PACKAGE_DIR ${PACKAGES_DIR}/Microsoft.AI.DirectML.1.15.4)
# Restore nuget packages, which will pull down the DirectML redist package.
add_custom_command(
diff --git a/cmake/external/eigen.cmake b/cmake/external/eigen.cmake
index 339cded091b29..95dd438702a18 100644
--- a/cmake/external/eigen.cmake
+++ b/cmake/external/eigen.cmake
@@ -15,6 +15,7 @@ else ()
eigen
URL ${DEP_URL_eigen}
URL_HASH SHA1=${DEP_SHA1_eigen}
+ PATCH_COMMAND ${Patch_EXECUTABLE} --binary --ignore-whitespace -p1 < ${PROJECT_SOURCE_DIR}/patches/eigen/eigen-edge.patch
)
endif()
diff --git a/cmake/external/onnxruntime_external_deps.cmake b/cmake/external/onnxruntime_external_deps.cmake
index 85746027d4e8c..aeaaa7b51d595 100644
--- a/cmake/external/onnxruntime_external_deps.cmake
+++ b/cmake/external/onnxruntime_external_deps.cmake
@@ -86,27 +86,6 @@ if (onnxruntime_BUILD_BENCHMARKS)
onnxruntime_fetchcontent_makeavailable(google_benchmark)
endif()
-if (NOT WIN32)
- FetchContent_Declare(
- google_nsync
- URL ${DEP_URL_google_nsync}
- URL_HASH SHA1=${DEP_SHA1_google_nsync}
- PATCH_COMMAND ${Patch_EXECUTABLE} --binary --ignore-whitespace -p1 < ${PROJECT_SOURCE_DIR}/patches/nsync/nsync_1.26.0.patch
- FIND_PACKAGE_ARGS NAMES nsync unofficial-nsync
- )
- #nsync tests failed on Mac Build
- set(NSYNC_ENABLE_TESTS OFF CACHE BOOL "" FORCE)
- onnxruntime_fetchcontent_makeavailable(google_nsync)
-
- if (google_nsync_SOURCE_DIR)
- add_library(nsync::nsync_cpp ALIAS nsync_cpp)
- target_include_directories(nsync_cpp PUBLIC ${google_nsync_SOURCE_DIR}/public)
- endif()
- if(TARGET unofficial::nsync::nsync_cpp AND NOT TARGET nsync::nsync_cpp)
- message(STATUS "Aliasing unofficial::nsync::nsync_cpp to nsync::nsync_cpp")
- add_library(nsync::nsync_cpp ALIAS unofficial::nsync::nsync_cpp)
- endif()
-endif()
if(onnxruntime_USE_MIMALLOC)
FetchContent_Declare(
@@ -636,17 +615,39 @@ if (onnxruntime_USE_COREML)
endif()
if (onnxruntime_USE_WEBGPU)
- FetchContent_Declare(
- dawn
- URL ${DEP_URL_dawn}
- URL_HASH SHA1=${DEP_SHA1_dawn}
- PATCH_COMMAND ${Patch_EXECUTABLE} --binary --ignore-whitespace -p1 < ${PROJECT_SOURCE_DIR}/patches/dawn/dawn.patch
- )
+ if (onnxruntime_CUSTOM_DAWN_SRC_PATH)
+ # use the custom dawn source path if provided
+ #
+ # specified as:
+ # build.py --use_webgpu --cmake_extra_defines "onnxruntime_CUSTOM_DAWN_SRC_PATH=<dawn source dir>"
+ FetchContent_Declare(
+ dawn
+ SOURCE_DIR ${onnxruntime_CUSTOM_DAWN_SRC_PATH}
+ )
+ else()
+ FetchContent_Declare(
+ dawn
+ URL ${DEP_URL_dawn}
+ URL_HASH SHA1=${DEP_SHA1_dawn}
+ # All previous patches have been merged into the upstream Dawn project, so none need to be applied right now.
+ # If we need to apply patches in the future, we can uncomment the following line.
+ # PATCH_COMMAND ${Patch_EXECUTABLE} --binary --ignore-whitespace -p1 < ${PROJECT_SOURCE_DIR}/patches/dawn/dawn.patch
+ )
+ endif()
+
+ if (onnxruntime_BUILD_DAWN_MONOLITHIC_LIBRARY)
+ set(DAWN_BUILD_MONOLITHIC_LIBRARY ON CACHE BOOL "" FORCE)
+ set(DAWN_ENABLE_INSTALL ON CACHE BOOL "" FORCE)
- # use dawn::dawn_native and dawn::dawn_proc instead of the monolithic dawn::webgpu_dawn to minimize binary size
- set(DAWN_BUILD_MONOLITHIC_LIBRARY OFF CACHE BOOL "" FORCE)
+ if (onnxruntime_USE_EXTERNAL_DAWN)
+ message(FATAL_ERROR "onnxruntime_USE_EXTERNAL_DAWN and onnxruntime_BUILD_DAWN_MONOLITHIC_LIBRARY cannot be enabled at the same time.")
+ endif()
+ else()
+ # use dawn::dawn_native and dawn::dawn_proc instead of the monolithic dawn::webgpu_dawn to minimize binary size
+ set(DAWN_BUILD_MONOLITHIC_LIBRARY OFF CACHE BOOL "" FORCE)
+ set(DAWN_ENABLE_INSTALL OFF CACHE BOOL "" FORCE)
+ endif()
set(DAWN_BUILD_SAMPLES OFF CACHE BOOL "" FORCE)
- set(DAWN_ENABLE_INSTALL OFF CACHE BOOL "" FORCE)
set(DAWN_ENABLE_NULL OFF CACHE BOOL "" FORCE)
set(DAWN_FETCH_DEPENDENCIES ON CACHE BOOL "" FORCE)
@@ -675,13 +676,34 @@ if (onnxruntime_USE_WEBGPU)
set(DAWN_USE_BUILT_DXC ON CACHE BOOL "" FORCE)
set(TINT_BUILD_HLSL_WRITER ON CACHE BOOL "" FORCE)
- # Vulkan may optionally be included in a Windows build. Exclude until we have an explicit use case that requires it.
- set(DAWN_ENABLE_VULKAN OFF CACHE BOOL "" FORCE)
+ if ((NOT onnxruntime_ENABLE_DAWN_BACKEND_VULKAN) AND (NOT onnxruntime_ENABLE_DAWN_BACKEND_D3D12))
+ message(FATAL_ERROR "At least one of onnxruntime_ENABLE_DAWN_BACKEND_VULKAN or onnxruntime_ENABLE_DAWN_BACKEND_D3D12 must be enabled when using Dawn on Windows.")
+ endif()
+ if (onnxruntime_ENABLE_DAWN_BACKEND_VULKAN)
+ set(DAWN_ENABLE_VULKAN ON CACHE BOOL "" FORCE)
+ set(TINT_BUILD_SPV_WRITER ON CACHE BOOL "" FORCE)
+ else()
+ set(DAWN_ENABLE_VULKAN OFF CACHE BOOL "" FORCE)
+ endif()
+ if (onnxruntime_ENABLE_DAWN_BACKEND_D3D12)
+ set(DAWN_ENABLE_D3D12 ON CACHE BOOL "" FORCE)
+ else()
+ set(DAWN_ENABLE_D3D12 OFF CACHE BOOL "" FORCE)
+ endif()
+ # We are currently always using the D3D12 backend.
+ set(DAWN_ENABLE_D3D11 OFF CACHE BOOL "" FORCE)
endif()
onnxruntime_fetchcontent_makeavailable(dawn)
- list(APPEND onnxruntime_EXTERNAL_LIBRARIES dawn::dawn_native dawn::dawn_proc)
+ if (onnxruntime_BUILD_DAWN_MONOLITHIC_LIBRARY)
+ list(APPEND onnxruntime_EXTERNAL_LIBRARIES dawn::webgpu_dawn)
+ else()
+ if (NOT onnxruntime_USE_EXTERNAL_DAWN)
+ list(APPEND onnxruntime_EXTERNAL_LIBRARIES dawn::dawn_native)
+ endif()
+ list(APPEND onnxruntime_EXTERNAL_LIBRARIES dawn::dawn_proc)
+ endif()
endif()
set(onnxruntime_LINK_DIRS)
diff --git a/cmake/external/tvm.cmake b/cmake/external/tvm.cmake
deleted file mode 100644
index 93049c8b85853..0000000000000
--- a/cmake/external/tvm.cmake
+++ /dev/null
@@ -1,24 +0,0 @@
-if (onnxruntime_USE_TVM)
- message(STATUS "onnxruntime_USE_TVM: Fetch tvm for TVM EP")
-
- FetchContent_Declare(
- tvm
- GIT_REPOSITORY https://github.com/apache/tvm.git
- GIT_TAG 2379917985919ed3918dc12cad47f469f245be7a
- )
-
- FetchContent_GetProperties(tvm)
- if(NOT tvm_POPULATED)
- FetchContent_Populate(tvm)
- if (WIN32)
- execute_process(
- COMMAND ${CMAKE_COMMAND} -E create_symlink ${tvm_BINARY_DIR}/${CMAKE_BUILD_TYPE} ${tvm_SOURCE_DIR}/build
- )
- else()
- file(CREATE_LINK ${tvm_BINARY_DIR} ${tvm_SOURCE_DIR}/build SYMBOLIC)
- endif()
- endif()
-
- set(tvm_INCLUDE_DIRS ${tvm_SOURCE_DIR}/include)
-
-endif()
diff --git a/cmake/hip_fatbin_insert b/cmake/hip_fatbin_insert
new file mode 100644
index 0000000000000..7d834cbf569f0
--- /dev/null
+++ b/cmake/hip_fatbin_insert
@@ -0,0 +1,7 @@
+SECTIONS {
+ .hipFatBinSegment : { *(.hipFatBinSegment) }
+} INSERT AFTER .bss
+
+SECTIONS {
+ .hip_fatbin : { *(.hip_fatbin) }
+} INSERT AFTER .hipFatBinSegment
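This fragment uses GNU ld's `INSERT AFTER`, which augments the default linker script rather than replacing it, pinning the HIP fatbin sections' placement after `.bss`. The wiring that puts it on the link line is outside this diff; a plausible CMake sketch with a hypothetical target name:

# a script containing INSERT statements supplements (not replaces) the
# default GNU ld script when supplied via -T
target_link_options(example_rocm_target PRIVATE
  "LINKER:-T,${PROJECT_SOURCE_DIR}/hip_fatbin_insert")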
diff --git a/cmake/onnxruntime.cmake b/cmake/onnxruntime.cmake
index 08b8ca0cb66de..732c0511d400f 100644
--- a/cmake/onnxruntime.cmake
+++ b/cmake/onnxruntime.cmake
@@ -122,8 +122,12 @@ else()
else()
onnxruntime_add_shared_library(onnxruntime ${CMAKE_CURRENT_BINARY_DIR}/generated_source.c )
endif()
- if (onnxruntime_USE_CUDA)
- set_property(TARGET onnxruntime APPEND_STRING PROPERTY LINK_FLAGS " -Xlinker -rpath=\\$ORIGIN")
+ if(NOT APPLE)
+ include(CheckLinkerFlag)
+ check_linker_flag(CXX "LINKER:-rpath=\$ORIGIN" LINKER_SUPPORT_RPATH)
+ if(LINKER_SUPPORT_RPATH)
+ target_link_options(onnxruntime PRIVATE "LINKER:-rpath=\$ORIGIN")
+ endif()
endif()
endif()
@@ -139,17 +143,17 @@ target_compile_definitions(onnxruntime PRIVATE FILE_NAME=\"onnxruntime.dll\")
if(UNIX)
if (APPLE)
- set(ONNXRUNTIME_SO_LINK_FLAG " -Xlinker -dead_strip")
+ target_link_options(onnxruntime PRIVATE "LINKER:-dead_strip")
elseif(NOT ${CMAKE_SYSTEM_NAME} MATCHES "AIX")
- set(ONNXRUNTIME_SO_LINK_FLAG " -Xlinker --version-script=${SYMBOL_FILE} -Xlinker --no-undefined -Xlinker --gc-sections -z noexecstack")
+ target_link_options(onnxruntime PRIVATE "LINKER:--version-script=${SYMBOL_FILE}" "LINKER:--no-undefined" "LINKER:--gc-sections")
endif()
else()
- set(ONNXRUNTIME_SO_LINK_FLAG " -DEF:${SYMBOL_FILE}")
+ target_link_options(onnxruntime PRIVATE "-DEF:${SYMBOL_FILE}")
endif()
-if (NOT WIN32)
- if (APPLE OR ${CMAKE_SYSTEM_NAME} MATCHES "^iOS")
- set(ONNXRUNTIME_SO_LINK_FLAG " -Wl,-exported_symbols_list,${SYMBOL_FILE}")
+
+if (APPLE OR ${CMAKE_SYSTEM_NAME} MATCHES "^iOS")
+ target_link_options(onnxruntime PRIVATE "LINKER:-exported_symbols_list,${SYMBOL_FILE}")
if (${CMAKE_SYSTEM_NAME} STREQUAL "iOS")
set_target_properties(onnxruntime PROPERTIES
MACOSX_RPATH TRUE
@@ -159,12 +163,10 @@ if (NOT WIN32)
else()
set_target_properties(onnxruntime PROPERTIES INSTALL_RPATH "@loader_path")
endif()
- elseif (NOT CMAKE_SYSTEM_NAME STREQUAL "Emscripten" AND NOT ${CMAKE_SYSTEM_NAME} MATCHES "AIX")
- set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} -Wl,-rpath='$ORIGIN'")
- endif()
endif()
+
if(CMAKE_SYSTEM_NAME STREQUAL "Android" AND onnxruntime_MINIMAL_BUILD)
# target onnxruntime is a shared library, the dummy __cxa_demangle is only attached to it to avoid
# affecting downstream ort library users with the behavior of dummy __cxa_demangle. So the dummy
@@ -208,7 +210,6 @@ set(onnxruntime_INTERNAL_LIBRARIES
${PROVIDERS_NNAPI}
${PROVIDERS_QNN}
${PROVIDERS_SNPE}
- ${PROVIDERS_TVM}
${PROVIDERS_RKNPU}
${PROVIDERS_VSINPU}
${PROVIDERS_XNNPACK}
@@ -219,7 +220,6 @@ set(onnxruntime_INTERNAL_LIBRARIES
${onnxruntime_winml}
onnxruntime_optimizer
onnxruntime_providers
- ${onnxruntime_tvm_libs}
onnxruntime_lora
onnxruntime_framework
onnxruntime_graph
@@ -248,7 +248,9 @@ target_link_libraries(onnxruntime PRIVATE
${onnxruntime_EXTERNAL_LIBRARIES}
)
-set_property(TARGET onnxruntime APPEND_STRING PROPERTY LINK_FLAGS ${ONNXRUNTIME_SO_LINK_FLAG} ${onnxruntime_DELAYLOAD_FLAGS})
+if(WIN32)
+ target_link_options(onnxruntime PRIVATE ${onnxruntime_DELAYLOAD_FLAGS})
+endif()
#See: https://cmake.org/cmake/help/latest/prop_tgt/SOVERSION.html
if(NOT APPLE AND NOT WIN32)
if(${CMAKE_SYSTEM_NAME} MATCHES "AIX")
@@ -393,8 +395,23 @@ if(onnxruntime_BUILD_APPLE_FRAMEWORK)
list(APPEND lib_and_dependencies ${cur_target})
- get_target_property(link_libraries ${cur_target} LINK_LIBRARIES)
- foreach(dependency ${link_libraries})
+ set(all_link_libraries)
+
+ get_property(link_libraries_set TARGET ${cur_target} PROPERTY LINK_LIBRARIES SET)
+ if(link_libraries_set)
+ get_target_property(link_libraries ${cur_target} LINK_LIBRARIES)
+ list(APPEND all_link_libraries ${link_libraries})
+ endif()
+
+ get_property(interface_link_libraries_set TARGET ${cur_target} PROPERTY INTERFACE_LINK_LIBRARIES SET)
+ if(interface_link_libraries_set)
+ get_target_property(interface_link_libraries ${cur_target} INTERFACE_LINK_LIBRARIES)
+ list(APPEND all_link_libraries ${interface_link_libraries})
+ endif()
+
+ list(REMOVE_DUPLICATES all_link_libraries)
+
+ foreach(dependency ${all_link_libraries})
if(TARGET ${dependency})
process(${dependency})
endif()
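
`get_target_property()` returns `<var>-NOTFOUND` when a property was never set, which would otherwise leak a literal `link_libraries-NOTFOUND` entry into the dependency walk; the rewrite therefore asks `get_property(... SET)` first, and now also folds in `INTERFACE_LINK_LIBRARIES` so dependencies of interface and imported targets are visited. The core query pattern, sketched for a hypothetical target:

```cmake
add_library(mylib SHARED mylib.cpp)  # hypothetical target

set(all_link_libraries "")

# Only read the property if it was ever set, so NOTFOUND never enters the list.
get_property(link_libraries_set TARGET mylib PROPERTY LINK_LIBRARIES SET)
if(link_libraries_set)
  get_target_property(link_libraries mylib LINK_LIBRARIES)
  list(APPEND all_link_libraries ${link_libraries})
endif()

get_property(iface_set TARGET mylib PROPERTY INTERFACE_LINK_LIBRARIES SET)
if(iface_set)
  get_target_property(iface_libraries mylib INTERFACE_LINK_LIBRARIES)
  list(APPEND all_link_libraries ${iface_libraries})
endif()

list(REMOVE_DUPLICATES all_link_libraries)
```
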
diff --git a/cmake/onnxruntime_codegen_tvm.cmake b/cmake/onnxruntime_codegen_tvm.cmake
deleted file mode 100644
index 7b50d8f8603ae..0000000000000
--- a/cmake/onnxruntime_codegen_tvm.cmake
+++ /dev/null
@@ -1,25 +0,0 @@
-# Copyright (c) Microsoft Corporation. All rights reserved.
-# Licensed under the MIT License.
-
-file(GLOB_RECURSE onnxruntime_codegen_common_srcs
- "${ONNXRUNTIME_ROOT}/core/codegen/common/*.h"
- "${ONNXRUNTIME_ROOT}/core/codegen/common/*.cc"
-)
-
-file(GLOB_RECURSE onnxruntime_codegen_tvm_srcs CONFIGURE_DEPENDS
- "${ONNXRUNTIME_ROOT}/core/codegen/mti/*.h"
- "${ONNXRUNTIME_ROOT}/core/codegen/mti/*.cc"
- "${ONNXRUNTIME_ROOT}/core/codegen/passes/*.h"
- "${ONNXRUNTIME_ROOT}/core/codegen/passes/*.cc"
-)
-
-source_group(TREE ${ONNXRUNTIME_ROOT}/core FILES ${onnxruntime_codegen_common_srcs} ${onnxruntime_codegen_tvm_srcs})
-
-#onnxruntime_codegen_tvm depends on onnxruntime framework
-onnxruntime_add_static_library(onnxruntime_codegen_tvm ${onnxruntime_codegen_common_srcs} ${onnxruntime_codegen_tvm_srcs})
-set_target_properties(onnxruntime_codegen_tvm PROPERTIES FOLDER "ONNXRuntime")
-target_include_directories(onnxruntime_codegen_tvm PRIVATE ${ONNXRUNTIME_ROOT} ${TVM_INCLUDES} ${MKLML_INCLUDE_DIR} ${eigen_INCLUDE_DIRS})
-onnxruntime_add_include_to_target(onnxruntime_codegen_tvm onnxruntime_common onnxruntime_framework onnx onnx_proto ${PROTOBUF_LIB} flatbuffers::flatbuffers safeint_interface Boost::mp11)
-target_compile_options(onnxruntime_codegen_tvm PRIVATE ${DISABLED_WARNINGS_FOR_TVM})
-# need onnx to build to create headers that this project includes
-add_dependencies(onnxruntime_codegen_tvm ${onnxruntime_EXTERNAL_DEPENDENCIES})
diff --git a/cmake/onnxruntime_csharp.cmake b/cmake/onnxruntime_csharp.cmake
index 22c993d07f7f9..39533429e181c 100644
--- a/cmake/onnxruntime_csharp.cmake
+++ b/cmake/onnxruntime_csharp.cmake
@@ -30,10 +30,6 @@ if (onnxruntime_USE_NNAPI_BUILTIN)
STRING(APPEND CSHARP_PREPROCESSOR_DEFINES "USE_NNAPI;")
endif()
-if (onnxruntime_USE_TVM)
- STRING(APPEND CSHARP_PREPROCESSOR_DEFINES "USE_TVM,")
-endif()
-
if (onnxruntime_USE_OPENVINO)
STRING(APPEND CSHARP_PREPROCESSOR_DEFINES "USE_OPENVINO;")
endif()
diff --git a/cmake/onnxruntime_java.cmake b/cmake/onnxruntime_java.cmake
index 765ebab111ac7..b15b9632e9e24 100644
--- a/cmake/onnxruntime_java.cmake
+++ b/cmake/onnxruntime_java.cmake
@@ -7,7 +7,7 @@
include(FindJava)
find_package(Java REQUIRED)
include(UseJava)
-if (NOT CMAKE_SYSTEM_NAME STREQUAL "Android")
+if (NOT ANDROID)
find_package(JNI REQUIRED)
endif()
@@ -21,23 +21,28 @@ endif()
set(GRADLE_EXECUTABLE "${JAVA_ROOT}/gradlew")
+set(COMMON_GRADLE_ARGS --console=plain)
+if(WIN32)
+ list(APPEND COMMON_GRADLE_ARGS -Dorg.gradle.daemon=false)
+elseif (ANDROID)
+ # For Android builds we may run gradle multiple times in the same build, and
+ # the gradle JVM can run out of memory if the daemon is kept alive, so it is
+ # better not to keep a daemon running
+ list(APPEND COMMON_GRADLE_ARGS --no-daemon)
+endif()
+
# Specify the Java source files
file(GLOB_RECURSE onnxruntime4j_gradle_files "${JAVA_ROOT}/*.gradle")
file(GLOB_RECURSE onnxruntime4j_src "${JAVA_ROOT}/src/main/java/ai/onnxruntime/*.java")
set(JAVA_OUTPUT_JAR ${JAVA_ROOT}/build/libs/onnxruntime.jar)
 # this jar is solely used as a signaling mechanism for dependency management in CMake
# if any of the Java sources change, the jar (and generated headers) will be regenerated and the onnxruntime4j_jni target will be rebuilt
-set(GRADLE_ARGS --console=plain clean jar -x test)
-if(WIN32)
- set(GRADLE_ARGS ${GRADLE_ARGS} -Dorg.gradle.daemon=false)
-elseif (CMAKE_SYSTEM_NAME STREQUAL "Android")
- # For Android build, we may run gradle multiple times in same build,
- # sometimes gradle JVM will run out of memory if we keep the daemon running
- # it is better to not keep a daemon running
- set(GRADLE_ARGS ${GRADLE_ARGS} --no-daemon)
-endif()
+set(GRADLE_ARGS clean jar -x test)
-add_custom_command(OUTPUT ${JAVA_OUTPUT_JAR} COMMAND ${GRADLE_EXECUTABLE} ${GRADLE_ARGS} WORKING_DIRECTORY ${JAVA_ROOT} DEPENDS ${onnxruntime4j_gradle_files} ${onnxruntime4j_src})
+add_custom_command(OUTPUT ${JAVA_OUTPUT_JAR}
+ COMMAND ${GRADLE_EXECUTABLE} ${COMMON_GRADLE_ARGS} ${GRADLE_ARGS}
+ WORKING_DIRECTORY ${JAVA_ROOT}
+ DEPENDS ${onnxruntime4j_gradle_files} ${onnxruntime4j_src})
add_custom_target(onnxruntime4j DEPENDS ${JAVA_OUTPUT_JAR})
set_source_files_properties(${JAVA_OUTPUT_JAR} PROPERTIES GENERATED TRUE)
set_property(TARGET onnxruntime4j APPEND PROPERTY ADDITIONAL_CLEAN_FILES "${JAVA_OUTPUT_DIR}")
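
The gradle arguments shared by every invocation in this file are now computed once in `COMMON_GRADLE_ARGS`, so the per-platform daemon handling is no longer repeated before each call; each command appends only its task-specific arguments. A condensed sketch of the reuse (two of the invocations, paths as in the file):

```cmake
set(COMMON_GRADLE_ARGS --console=plain)
if(WIN32)
  list(APPEND COMMON_GRADLE_ARGS -Dorg.gradle.daemon=false)
elseif(ANDROID)
  # Repeated gradle runs in one build can exhaust the daemon JVM's memory.
  list(APPEND COMMON_GRADLE_ARGS --no-daemon)
endif()

# Every gradle call prepends the shared list and adds only its own tasks.
add_custom_command(OUTPUT ${JAVA_OUTPUT_JAR}
  COMMAND ${GRADLE_EXECUTABLE} ${COMMON_GRADLE_ARGS} clean jar -x test
  WORKING_DIRECTORY ${JAVA_ROOT})

add_custom_command(TARGET onnxruntime4j_jni POST_BUILD
  COMMAND ${GRADLE_EXECUTABLE} ${COMMON_GRADLE_ARGS} cmakeBuild -DcmakeBuildDir=${CMAKE_CURRENT_BINARY_DIR}
  WORKING_DIRECTORY ${JAVA_ROOT})
```
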
@@ -62,7 +67,7 @@ target_link_libraries(onnxruntime4j_jni PUBLIC onnxruntime)
set(JAVA_PACKAGE_OUTPUT_DIR ${JAVA_OUTPUT_DIR}/build)
file(MAKE_DIRECTORY ${JAVA_PACKAGE_OUTPUT_DIR})
-if (CMAKE_SYSTEM_NAME STREQUAL "Android")
+if (ANDROID)
set(ANDROID_PACKAGE_OUTPUT_DIR ${JAVA_PACKAGE_OUTPUT_DIR}/android)
file(MAKE_DIRECTORY ${ANDROID_PACKAGE_OUTPUT_DIR})
endif()
@@ -88,7 +93,7 @@ if(APPLE)
elseif(JNI_ARCH STREQUAL "arm64")
set(JNI_ARCH aarch64)
endif()
-elseif (CMAKE_SYSTEM_NAME STREQUAL "Android")
+elseif (ANDROID)
set(JNI_ARCH ${ANDROID_ABI})
elseif (ARM64)
set(JNI_ARCH aarch64)
@@ -180,15 +185,7 @@ else()
endif()
# run the build process (this copies the results back into CMAKE_CURRENT_BINARY_DIR)
-set(GRADLE_ARGS --console=plain cmakeBuild -DcmakeBuildDir=${CMAKE_CURRENT_BINARY_DIR})
-if(WIN32)
- set(GRADLE_ARGS ${GRADLE_ARGS} -Dorg.gradle.daemon=false)
-elseif (CMAKE_SYSTEM_NAME STREQUAL "Android")
- # For Android build, we may run gradle multiple times in same build,
- # sometimes gradle JVM will run out of memory if we keep the daemon running
- # it is better to not keep a daemon running
- set(GRADLE_ARGS ${GRADLE_ARGS} --no-daemon)
-endif()
+set(GRADLE_ARGS cmakeBuild -DcmakeBuildDir=${CMAKE_CURRENT_BINARY_DIR})
# Append relevant native build flags to gradle command
set(GRADLE_ARGS ${GRADLE_ARGS} ${ORT_PROVIDER_FLAGS})
@@ -197,9 +194,11 @@ if (onnxruntime_ENABLE_TRAINING_APIS)
endif()
message(STATUS "GRADLE_ARGS: ${GRADLE_ARGS}")
-add_custom_command(TARGET onnxruntime4j_jni POST_BUILD COMMAND ${GRADLE_EXECUTABLE} ${GRADLE_ARGS} WORKING_DIRECTORY ${JAVA_ROOT})
+add_custom_command(TARGET onnxruntime4j_jni POST_BUILD
+ COMMAND ${GRADLE_EXECUTABLE} ${COMMON_GRADLE_ARGS} ${GRADLE_ARGS}
+ WORKING_DIRECTORY ${JAVA_ROOT})
-if (CMAKE_SYSTEM_NAME STREQUAL "Android")
+if (ANDROID)
set(ANDROID_PACKAGE_JNILIBS_DIR ${JAVA_OUTPUT_DIR}/android)
set(ANDROID_PACKAGE_ABI_DIR ${ANDROID_PACKAGE_JNILIBS_DIR}/${ANDROID_ABI})
file(MAKE_DIRECTORY ${ANDROID_PACKAGE_JNILIBS_DIR})
@@ -214,6 +213,7 @@ if (CMAKE_SYSTEM_NAME STREQUAL "Android")
POST_BUILD
COMMAND ${CMAKE_COMMAND} -E echo "Generating Android AAR package..."
COMMAND ${GRADLE_EXECUTABLE}
+ ${COMMON_GRADLE_ARGS}
build
-b build-android.gradle -c settings-android.gradle
-DjniLibsDir=${ANDROID_PACKAGE_JNILIBS_DIR} -DbuildDir=${ANDROID_PACKAGE_OUTPUT_DIR}
@@ -237,6 +237,7 @@ if (CMAKE_SYSTEM_NAME STREQUAL "Android")
POST_BUILD
COMMAND ${CMAKE_COMMAND} -E echo "Building and running Android test for Android AAR package..."
COMMAND ${GRADLE_EXECUTABLE}
+ ${COMMON_GRADLE_ARGS}
clean assembleDebug assembleDebugAndroidTest
-DminSdkVer=${ANDROID_MIN_SDK}
--stacktrace
diff --git a/cmake/onnxruntime_kernel_explorer.cmake b/cmake/onnxruntime_kernel_explorer.cmake
index 7de4f7b3f926b..62a6d45088052 100644
--- a/cmake/onnxruntime_kernel_explorer.cmake
+++ b/cmake/onnxruntime_kernel_explorer.cmake
@@ -64,7 +64,7 @@ elseif (onnxruntime_USE_ROCM)
)
auto_set_source_files_hip_language(${kernel_explorer_kernel_srcs} ${kernel_explorer_rocm_kernel_srcs})
target_sources(kernel_explorer PRIVATE ${kernel_explorer_rocm_kernel_srcs})
- target_compile_definitions(kernel_explorer PRIVATE __HIP_PLATFORM_AMD__=1 __HIP_PLATFORM_HCC__=1)
+ target_compile_definitions(kernel_explorer PRIVATE __HIP_PLATFORM_AMD__=1 __HIP_PLATFORM_HCC__=1 HIPBLAS_V2)
if (onnxruntime_USE_COMPOSABLE_KERNEL)
target_compile_definitions(kernel_explorer PRIVATE USE_COMPOSABLE_KERNEL)
if (onnxruntime_USE_COMPOSABLE_KERNEL_CK_TILE)
diff --git a/cmake/onnxruntime_mlas.cmake b/cmake/onnxruntime_mlas.cmake
index 0ba4694c329e3..5124262ec0004 100644
--- a/cmake/onnxruntime_mlas.cmake
+++ b/cmake/onnxruntime_mlas.cmake
@@ -36,11 +36,13 @@ onnxruntime_add_static_library(onnxruntime_mlas
${MLAS_SRC_DIR}/qpostprocessor.cpp
${MLAS_SRC_DIR}/qlgavgpool.cpp
${MLAS_SRC_DIR}/qdwconv_kernelsize.cpp
- ${MLAS_SRC_DIR}/sqnbitgemm.h
- ${MLAS_SRC_DIR}/sqnbitgemm.cpp
+ ${MLAS_SRC_DIR}/qnbitgemm.h
+ ${MLAS_SRC_DIR}/qnbitgemm.cpp
${MLAS_SRC_DIR}/sqnbitgemm_q8_block.h
${MLAS_SRC_DIR}/flashattn.cpp
${MLAS_SRC_DIR}/cast.cpp
+ ${MLAS_SRC_DIR}/rotary_embedding.h
+ ${MLAS_SRC_DIR}/rotary_embedding.cpp
)
target_sources(onnxruntime_mlas PRIVATE
@@ -84,11 +86,15 @@ function(setup_mlas_source_for_windows)
${MLAS_SRC_DIR}/qgemm_kernel_neon.cpp
${MLAS_SRC_DIR}/qgemm_kernel_udot.cpp
${MLAS_SRC_DIR}/qgemm_kernel_sdot.cpp
- ${MLAS_SRC_DIR}/sqnbitgemm_kernel_neon.h
- ${MLAS_SRC_DIR}/sqnbitgemm_kernel_neon.cpp
+ ${MLAS_SRC_DIR}/qnbitgemm_kernel_neon.h
+ ${MLAS_SRC_DIR}/qnbitgemm_kernel_neon.cpp
${MLAS_SRC_DIR}/sqnbitgemm_kernel_neon_fp32.cpp
${MLAS_SRC_DIR}/sqnbitgemm_kernel_neon_int8.cpp
- ${MLAS_SRC_DIR}/fp16_neon_common.cpp
+ ${MLAS_SRC_DIR}/cast_kernel_neon.cpp
+ ${MLAS_SRC_DIR}/hqnbitgemm_kernel_neon_fp16.cpp
+ ${MLAS_SRC_DIR}/rotary_embedding_kernel_neon.h
+ ${MLAS_SRC_DIR}/rotary_embedding_kernel_neon.cpp
+ ${MLAS_SRC_DIR}/rotary_embedding_kernel_neon_fp16.cpp
)
set(mlas_platform_preprocess_srcs
@@ -362,10 +368,12 @@ else()
${MLAS_SRC_DIR}/qgemm_kernel_neon.cpp
${MLAS_SRC_DIR}/qgemm_kernel_udot.cpp
${MLAS_SRC_DIR}/qgemm_kernel_sdot.cpp
- ${MLAS_SRC_DIR}/sqnbitgemm_kernel_neon.h
- ${MLAS_SRC_DIR}/sqnbitgemm_kernel_neon.cpp
+ ${MLAS_SRC_DIR}/qnbitgemm_kernel_neon.h
+ ${MLAS_SRC_DIR}/qnbitgemm_kernel_neon.cpp
${MLAS_SRC_DIR}/sqnbitgemm_kernel_neon_fp32.cpp
${MLAS_SRC_DIR}/sqnbitgemm_kernel_neon_int8.cpp
+ ${MLAS_SRC_DIR}/rotary_embedding_kernel_neon.h
+ ${MLAS_SRC_DIR}/rotary_embedding_kernel_neon.cpp
)
set_source_files_properties(${MLAS_SRC_DIR}/sqnbitgemm_kernel_neon_int8.cpp
PROPERTIES COMPILE_FLAGS " -march=armv8.2-a+dotprod")
@@ -383,7 +391,9 @@ else()
${MLAS_SRC_DIR}/qgemm_kernel_smmla.cpp
${MLAS_SRC_DIR}/qgemm_kernel_ummla.cpp
${MLAS_SRC_DIR}/sbgemm_kernel_neon.cpp
- ${MLAS_SRC_DIR}/fp16_neon_common.cpp
+ ${MLAS_SRC_DIR}/cast_kernel_neon.cpp
+ ${MLAS_SRC_DIR}/hqnbitgemm_kernel_neon_fp16.cpp
+ ${MLAS_SRC_DIR}/rotary_embedding_kernel_neon_fp16.cpp
)
set_source_files_properties(${MLAS_SRC_DIR}/aarch64/HalfGemmKernelNeon.S PROPERTIES COMPILE_FLAGS " -march=armv8.2-a+fp16 ")
set_source_files_properties(${MLAS_SRC_DIR}/aarch64/QgemmS8S8KernelSmmla.S PROPERTIES COMPILE_FLAGS " -march=armv8.2-a+i8mm ")
@@ -393,7 +403,9 @@ else()
set_source_files_properties(${MLAS_SRC_DIR}/dwconv.cpp PROPERTIES COMPILE_FLAGS " -march=armv8.2-a+fp16 ")
set_source_files_properties(${MLAS_SRC_DIR}/pooling_fp16.cpp PROPERTIES COMPILE_FLAGS " -march=armv8.2-a+fp16 ")
set_source_files_properties(${MLAS_SRC_DIR}/sbgemm_kernel_neon.cpp PROPERTIES COMPILE_FLAGS " -march=armv8.2-a+bf16 ")
- set_source_files_properties(${MLAS_SRC_DIR}/fp16_neon_common.cpp PROPERTIES COMPILE_FLAGS " -march=armv8.2-a+fp16 ")
+ set_source_files_properties(${MLAS_SRC_DIR}/cast_kernel_neon.cpp PROPERTIES COMPILE_FLAGS " -march=armv8.2-a+fp16 ")
+ set_source_files_properties(${MLAS_SRC_DIR}/hqnbitgemm_kernel_neon_fp16.cpp PROPERTIES COMPILE_FLAGS " -march=armv8.2-a+fp16 ")
+ set_source_files_properties(${MLAS_SRC_DIR}/rotary_embedding_kernel_neon_fp16.cpp PROPERTIES COMPILE_FLAGS " -march=armv8.2-a+fp16 ")
endif()
if(ONNXRUNTIME_MLAS_MULTI_ARCH)
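
The NEON fp16 kernels above opt into newer ISA extensions per translation unit rather than globally: `set_source_files_properties(... COMPILE_FLAGS ...)` keeps `-march=armv8.2-a+fp16` scoped to files whose entry points are only reached after a runtime capability check. A minimal sketch of the idea (file names hypothetical):

```cmake
add_library(kernels STATIC
  dispatch.cpp      # baseline code, compiled with the default -march
  kernel_fp16.cpp   # fp16 intrinsics; callers guard it with a runtime CPU check
)

# Only the fp16 translation unit gets the extended architecture flag.
set_source_files_properties(kernel_fp16.cpp
  PROPERTIES COMPILE_FLAGS " -march=armv8.2-a+fp16 ")
```
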
@@ -453,7 +465,6 @@ else()
bool HasP10 = ((hwcap2 & PPC_FEATURE2_MMA) && (hwcap2 & PPC_FEATURE2_ARCH_3_1));
return 0;
}
- }
#endif"
HAS_P10_RUNTIME
)
@@ -677,6 +688,13 @@ endif()
if(NOT ONNXRUNTIME_MLAS_MULTI_ARCH AND MLAS_SOURCE_IS_NOT_SET)
file(GLOB_RECURSE mlas_platform_srcs
"${MLAS_SRC_DIR}/scalar/*.cpp")
+ elseif (onnxruntime_FORCE_GENERIC_ALGORITHMS)
+ file(GLOB_RECURSE mlas_platform_srcs_generic
+ "${MLAS_SRC_DIR}/scalar/*.cpp")
+ set(mlas_platform_srcs
+ ${mlas_platform_srcs}
+ ${mlas_platform_srcs_generic}
+ )
endif()
target_sources(onnxruntime_mlas PRIVATE ${mlas_platform_srcs})
endif()
@@ -743,7 +761,7 @@ if (NOT onnxruntime_ORT_MINIMAL_BUILD)
target_link_libraries(onnxruntime_mlas_q4dq PRIVATE cpuinfo)
endif()
if(NOT WIN32)
- target_link_libraries(onnxruntime_mlas_q4dq PRIVATE nsync::nsync_cpp ${CMAKE_DL_LIBS})
+ target_link_libraries(onnxruntime_mlas_q4dq PRIVATE ${CMAKE_DL_LIBS})
endif()
if (CMAKE_SYSTEM_NAME STREQUAL "Android")
target_link_libraries(onnxruntime_mlas_q4dq PRIVATE ${android_shared_libs})
diff --git a/cmake/onnxruntime_providers.cmake b/cmake/onnxruntime_providers.cmake
index 9666877cdc206..582491de9503d 100644
--- a/cmake/onnxruntime_providers.cmake
+++ b/cmake/onnxruntime_providers.cmake
@@ -101,9 +101,6 @@ endif()
if(onnxruntime_USE_ROCM)
set(PROVIDERS_ROCM onnxruntime_providers_rocm)
endif()
-if (onnxruntime_USE_TVM)
- set(PROVIDERS_TVM onnxruntime_providers_tvm)
-endif()
if (onnxruntime_USE_XNNPACK)
set(PROVIDERS_XNNPACK onnxruntime_providers_xnnpack)
endif()
@@ -194,10 +191,6 @@ if (onnxruntime_USE_ROCM)
include(onnxruntime_providers_rocm.cmake)
endif()
-if (onnxruntime_USE_TVM)
- include(onnxruntime_providers_tvm.cmake)
-endif()
-
if (onnxruntime_USE_VSINPU)
include(onnxruntime_providers_vsinpu.cmake)
endif()
diff --git a/cmake/onnxruntime_providers_cann.cmake b/cmake/onnxruntime_providers_cann.cmake
index 0e26f7ee3a57b..2b82379ed66a9 100644
--- a/cmake/onnxruntime_providers_cann.cmake
+++ b/cmake/onnxruntime_providers_cann.cmake
@@ -21,7 +21,7 @@
onnxruntime_add_include_to_target(onnxruntime_providers_cann onnxruntime_common onnxruntime_framework onnx onnx_proto ${PROTOBUF_LIB} flatbuffers::flatbuffers Boost::mp11 safeint_interface)
add_dependencies(onnxruntime_providers_cann onnxruntime_providers_shared ${onnxruntime_EXTERNAL_DEPENDENCIES})
- target_link_libraries(onnxruntime_providers_cann PRIVATE ascendcl acl_op_compiler fmk_onnx_parser nsync::nsync_cpp ${ABSEIL_LIBS} ${ONNXRUNTIME_PROVIDERS_SHARED})
+ target_link_libraries(onnxruntime_providers_cann PRIVATE ascendcl acl_op_compiler fmk_onnx_parser ${ABSEIL_LIBS} ${ONNXRUNTIME_PROVIDERS_SHARED})
target_link_directories(onnxruntime_providers_cann PRIVATE ${onnxruntime_CANN_HOME}/lib64)
target_include_directories(onnxruntime_providers_cann PRIVATE ${ONNXRUNTIME_ROOT} ${CMAKE_CURRENT_BINARY_DIR} ${eigen_INCLUDE_DIRS} ${onnxruntime_CANN_HOME} ${onnxruntime_CANN_HOME}/include)
diff --git a/cmake/onnxruntime_providers_cuda.cmake b/cmake/onnxruntime_providers_cuda.cmake
index 774b7a4f6bd77..4f86717026118 100644
--- a/cmake/onnxruntime_providers_cuda.cmake
+++ b/cmake/onnxruntime_providers_cuda.cmake
@@ -224,8 +224,7 @@
include(cutlass)
target_include_directories(${target} PRIVATE ${cutlass_SOURCE_DIR}/include ${cutlass_SOURCE_DIR}/examples ${cutlass_SOURCE_DIR}/tools/util/include)
- target_include_directories(${target} PRIVATE ${ONNXRUNTIME_ROOT} ${CMAKE_CURRENT_BINARY_DIR} ${eigen_INCLUDE_DIRS} ${TVM_INCLUDES}
- PUBLIC ${CUDAToolkit_INCLUDE_DIRS})
+ target_include_directories(${target} PRIVATE ${ONNXRUNTIME_ROOT} ${CMAKE_CURRENT_BINARY_DIR} ${eigen_INCLUDE_DIRS} PUBLIC ${CUDAToolkit_INCLUDE_DIRS})
# ${CMAKE_CURRENT_BINARY_DIR} is so that #include "onnxruntime_config.h" inside tensor_shape.h is found
set_target_properties(${target} PROPERTIES LINKER_LANGUAGE CUDA)
set_target_properties(${target} PROPERTIES FOLDER "ONNXRuntime")
@@ -275,10 +274,8 @@
if(APPLE)
set_property(TARGET ${target} APPEND_STRING PROPERTY LINK_FLAGS "-Xlinker -exported_symbols_list ${ONNXRUNTIME_ROOT}/core/providers/cuda/exported_symbols.lst")
- target_link_libraries(${target} PRIVATE nsync::nsync_cpp)
elseif(UNIX)
set_property(TARGET ${target} APPEND_STRING PROPERTY LINK_FLAGS "-Xlinker --version-script=${ONNXRUNTIME_ROOT}/core/providers/cuda/version_script.lds -Xlinker --gc-sections")
- target_link_libraries(${target} PRIVATE nsync::nsync_cpp)
elseif(WIN32)
set_property(TARGET ${target} APPEND_STRING PROPERTY LINK_FLAGS "-DEF:${ONNXRUNTIME_ROOT}/core/providers/cuda/symbols.def")
else()
diff --git a/cmake/onnxruntime_providers_dml.cmake b/cmake/onnxruntime_providers_dml.cmake
index 439be882dcc5e..3141aa85a1163 100644
--- a/cmake/onnxruntime_providers_dml.cmake
+++ b/cmake/onnxruntime_providers_dml.cmake
@@ -61,8 +61,9 @@
target_link_libraries(onnxruntime_providers_dml PRIVATE delayimp.lib)
- if (NOT GDK_PLATFORM)
- set(onnxruntime_DELAYLOAD_FLAGS "${onnxruntime_DELAYLOAD_FLAGS} /DELAYLOAD:DirectML.dll /DELAYLOAD:d3d12.dll /DELAYLOAD:dxgi.dll /DELAYLOAD:dxcore.dll /DELAYLOAD:api-ms-win-core-com-l1-1-0.dll /DELAYLOAD:shlwapi.dll /DELAYLOAD:oleaut32.dll /DELAYLOAD:ext-ms-win-dxcore-l1-*.dll /ignore:4199")
+ if (onnxruntime_ENABLE_DELAY_LOADING_WIN_DLLS AND NOT GDK_PLATFORM)
+ #NOTE: the flags are only applied to onnxruntime.dll and the PYD file in our python package. Our C/C++ unit tests do not use these flags.
+ list(APPEND onnxruntime_DELAYLOAD_FLAGS "/DELAYLOAD:DirectML.dll" "/DELAYLOAD:d3d12.dll" "/DELAYLOAD:dxgi.dll" "/DELAYLOAD:dxcore.dll" "/DELAYLOAD:api-ms-win-core-com-l1-1-0.dll" "/DELAYLOAD:shlwapi.dll" "/DELAYLOAD:oleaut32.dll" "/DELAYLOAD:ext-ms-win-dxcore-l1-*.dll" "/ignore:4199")
endif()
target_compile_definitions(onnxruntime_providers_dml
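
Delay-loading defers resolving a DLL's imports until the first call into it, so onnxruntime.dll can still load on machines where DirectML.dll or dxcore.dll is absent as long as those code paths are never hit; the linker needs `delayimp.lib` plus one `/DELAYLOAD:` switch per DLL, and the hunk now builds those switches as a list gated by `onnxruntime_ENABLE_DELAY_LOADING_WIN_DLLS`. A minimal MSVC-only sketch with a hypothetical target:

```cmake
add_library(mylib SHARED mylib.cpp)  # hypothetical target

if(MSVC)
  # delayimp.lib supplies the delay-load helper (__delayLoadHelper2).
  target_link_libraries(mylib PRIVATE delayimp.lib)
  # Each DLL to defer gets its own /DELAYLOAD switch; /ignore:4199
  # silences the warning when a listed DLL ends up unused.
  target_link_options(mylib PRIVATE
    "/DELAYLOAD:d3d12.dll" "/DELAYLOAD:dxgi.dll" "/ignore:4199")
endif()
```
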
diff --git a/cmake/onnxruntime_providers_dnnl.cmake b/cmake/onnxruntime_providers_dnnl.cmake
index f2965728524b7..9e5a7eed44fff 100644
--- a/cmake/onnxruntime_providers_dnnl.cmake
+++ b/cmake/onnxruntime_providers_dnnl.cmake
@@ -41,10 +41,8 @@
INSTALL_RPATH "@loader_path"
BUILD_WITH_INSTALL_RPATH TRUE
INSTALL_RPATH_USE_LINK_PATH FALSE)
- target_link_libraries(onnxruntime_providers_dnnl PRIVATE nsync::nsync_cpp)
elseif(UNIX)
set_property(TARGET onnxruntime_providers_dnnl APPEND_STRING PROPERTY LINK_FLAGS "-Xlinker --version-script=${ONNXRUNTIME_ROOT}/core/providers/dnnl/version_script.lds -Xlinker --gc-sections -Xlinker -rpath=\$ORIGIN")
- target_link_libraries(onnxruntime_providers_dnnl PRIVATE nsync::nsync_cpp)
elseif(WIN32)
set_property(TARGET onnxruntime_providers_dnnl APPEND_STRING PROPERTY LINK_FLAGS "-DEF:${ONNXRUNTIME_ROOT}/core/providers/dnnl/symbols.def")
else()
diff --git a/cmake/onnxruntime_providers_migraphx.cmake b/cmake/onnxruntime_providers_migraphx.cmake
index d7d83b0ce8d64..685e77bc483bd 100644
--- a/cmake/onnxruntime_providers_migraphx.cmake
+++ b/cmake/onnxruntime_providers_migraphx.cmake
@@ -57,7 +57,7 @@
endif()
if(UNIX)
set_property(TARGET onnxruntime_providers_migraphx APPEND_STRING PROPERTY LINK_FLAGS "-Xlinker --version-script=${ONNXRUNTIME_ROOT}/core/providers/migraphx/version_script.lds -Xlinker --gc-sections")
- target_link_libraries(onnxruntime_providers_migraphx PRIVATE nsync::nsync_cpp stdc++fs)
+ target_link_libraries(onnxruntime_providers_migraphx PRIVATE stdc++fs)
endif()
if (onnxruntime_ENABLE_TRAINING_OPS)
diff --git a/cmake/onnxruntime_providers_openvino.cmake b/cmake/onnxruntime_providers_openvino.cmake
index 2eb3611bae902..f5fae8d169ccc 100644
--- a/cmake/onnxruntime_providers_openvino.cmake
+++ b/cmake/onnxruntime_providers_openvino.cmake
@@ -11,22 +11,22 @@
"${ONNXRUNTIME_ROOT}/core/providers/shared_library/*.cc"
)
- if (WIN32)
- set(CMAKE_MAP_IMPORTED_CONFIG_RELWITHDEBINFO Release)
- endif()
-
# Header paths
find_package(OpenVINO REQUIRED COMPONENTS Runtime ONNX)
- if(OpenVINO_VERSION VERSION_LESS 2024.0)
- message(FATAL_ERROR "OpenVINO 2024.0 and newer are supported. Please, use latest OpenVINO release")
+ if(OpenVINO_VERSION VERSION_LESS 2024.4)
+ message(FATAL_ERROR "OpenVINO 2024.4 and newer are supported. Please, use latest OpenVINO release")
endif()
if(OpenVINO_VERSION VERSION_GREATER_EQUAL 2024.4)
add_definitions(-DUSE_OVEP_NPU_MEMORY=1)
endif()
- if (WIN32)
- unset(CMAKE_MAP_IMPORTED_CONFIG_RELWITHDEBINFO)
+ # If building RelWithDebInfo and the OV package does not provide that configuration, map it to Release
+ get_target_property(ov_rt_implib_rwdi openvino::runtime IMPORTED_IMPLIB_RELWITHDEBINFO)
+ if ((CMAKE_BUILD_TYPE STREQUAL RelWithDebInfo) AND NOT ov_rt_implib_rwdi)
+ set_target_properties(openvino::runtime PROPERTIES
+ MAP_IMPORTED_CONFIG_RELWITHDEBINFO Release
+ )
endif()
list(APPEND OPENVINO_LIB_LIST openvino::frontend::onnx openvino::runtime ${PYTHON_LIBRARIES})
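
Imported targets only carry the configurations their package was built with; the old code set the global `CMAKE_MAP_IMPORTED_CONFIG_RELWITHDEBINFO`, redirecting every imported target. The new code checks whether `openvino::runtime` actually ships a RelWithDebInfo import library and, only if not, maps that configuration to Release on that one target. The per-target mapping pattern, sketched for a hypothetical imported target `somepkg::runtime`:

```cmake
find_package(somepkg REQUIRED)  # hypothetical package providing somepkg::runtime

# Does the imported target ship a RelWithDebInfo binary?
get_target_property(has_rwdi somepkg::runtime IMPORTED_IMPLIB_RELWITHDEBINFO)
if((CMAKE_BUILD_TYPE STREQUAL "RelWithDebInfo") AND NOT has_rwdi)
  # Fall back to the Release binary when building RelWithDebInfo.
  set_target_properties(somepkg::runtime PROPERTIES
    MAP_IMPORTED_CONFIG_RELWITHDEBINFO Release)
endif()
```
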
@@ -37,7 +37,7 @@
source_group(TREE ${ONNXRUNTIME_ROOT}/core FILES ${onnxruntime_providers_openvino_cc_srcs})
onnxruntime_add_shared_library_module(onnxruntime_providers_openvino ${onnxruntime_providers_openvino_cc_srcs} "${ONNXRUNTIME_ROOT}/core/dll/onnxruntime.rc")
- onnxruntime_add_include_to_target(onnxruntime_providers_openvino onnxruntime_common onnx)
+ onnxruntime_add_include_to_target(onnxruntime_providers_openvino onnxruntime_common onnx nlohmann_json::nlohmann_json)
install(FILES ${PROJECT_SOURCE_DIR}/../include/onnxruntime/core/providers/openvino/openvino_provider_factory.h
DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/onnxruntime/)
set_target_properties(onnxruntime_providers_openvino PROPERTIES CXX_STANDARD 20)
@@ -82,3 +82,8 @@
LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR}
RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR})
endif()
+
+set_target_properties(onnxruntime_providers_openvino PROPERTIES
+ MAP_IMPORTED_CONFIG_RELEASE RelWithDebInfo
+ MAP_IMPORTED_CONFIG_DEBUG RelWithDebInfo
+ )
\ No newline at end of file
diff --git a/cmake/onnxruntime_providers_rocm.cmake b/cmake/onnxruntime_providers_rocm.cmake
index 559204bd0df88..68f5319c0ae8d 100644
--- a/cmake/onnxruntime_providers_rocm.cmake
+++ b/cmake/onnxruntime_providers_rocm.cmake
@@ -8,7 +8,7 @@
find_package(HIP)
find_package(hiprand REQUIRED)
- find_package(rocblas REQUIRED)
+ find_package(hipblas REQUIRED)
find_package(MIOpen REQUIRED)
find_package(hipfft REQUIRED)
@@ -50,7 +50,7 @@
find_library(RCCL_LIB rccl REQUIRED)
find_library(ROCTRACER_LIB roctracer64 REQUIRED)
find_package(rocm_smi REQUIRED)
- set(ONNXRUNTIME_ROCM_LIBS roc::rocblas MIOpen hip::hipfft ${ROCM_SMI_LIBRARY} ${RCCL_LIB} ${ROCTRACER_LIB})
+ set(ONNXRUNTIME_ROCM_LIBS roc::hipblas MIOpen hip::hipfft ${ROCM_SMI_LIBRARY} ${RCCL_LIB} ${ROCTRACER_LIB})
include_directories(${ROCM_SMI_INCLUDE_DIR})
link_directories(${ROCM_SMI_LIB_DIR})
@@ -116,6 +116,7 @@
auto_set_source_files_hip_language(${onnxruntime_providers_rocm_src})
onnxruntime_add_shared_library_module(onnxruntime_providers_rocm ${onnxruntime_providers_rocm_src})
target_compile_options(onnxruntime_providers_rocm PRIVATE -D__HIP_PLATFORM_AMD__=1 -D__HIP_PLATFORM_HCC__=1)
+ target_link_options(onnxruntime_providers_rocm PRIVATE -T ${REPO_ROOT}/cmake/hip_fatbin_insert)
if(NOT MSVC)
target_compile_options(onnxruntime_providers_rocm PRIVATE -Wno-sign-compare)
@@ -154,6 +155,7 @@
set_target_properties(onnxruntime_providers_rocm PROPERTIES LINKER_LANGUAGE CXX)
set_target_properties(onnxruntime_providers_rocm PROPERTIES FOLDER "ONNXRuntime")
+ target_compile_definitions(onnxruntime_providers_rocm PRIVATE HIPBLAS_V2)
if (onnxruntime_ENABLE_TRAINING)
target_include_directories(onnxruntime_providers_rocm PRIVATE ${ORTTRAINING_ROOT} ${CMAKE_CURRENT_BINARY_DIR}/amdgpu/orttraining ${MPI_CXX_INCLUDE_DIRS})
@@ -215,7 +217,6 @@
if(UNIX)
set_property(TARGET onnxruntime_providers_rocm APPEND_STRING PROPERTY LINK_FLAGS "-Xlinker --version-script=${ONNXRUNTIME_ROOT}/core/providers/rocm/version_script.lds -Xlinker --gc-sections")
- target_link_libraries(onnxruntime_providers_rocm PRIVATE nsync::nsync_cpp)
else()
message(FATAL_ERROR "onnxruntime_providers_rocm unknown platform, need to specify shared library exports for it")
endif()
diff --git a/cmake/onnxruntime_providers_tensorrt.cmake b/cmake/onnxruntime_providers_tensorrt.cmake
index 468aaa44ec4ee..7b18222f334f9 100644
--- a/cmake/onnxruntime_providers_tensorrt.cmake
+++ b/cmake/onnxruntime_providers_tensorrt.cmake
@@ -206,11 +206,9 @@
if(APPLE)
set_property(TARGET onnxruntime_providers_tensorrt APPEND_STRING PROPERTY LINK_FLAGS "-Xlinker -exported_symbols_list ${ONNXRUNTIME_ROOT}/core/providers/tensorrt/exported_symbols.lst")
- target_link_libraries(onnxruntime_providers_tensorrt PRIVATE nsync::nsync_cpp)
elseif(UNIX)
set_property(TARGET onnxruntime_providers_tensorrt APPEND_STRING PROPERTY COMPILE_FLAGS "-Wno-deprecated-declarations")
set_property(TARGET onnxruntime_providers_tensorrt APPEND_STRING PROPERTY LINK_FLAGS "-Xlinker --version-script=${ONNXRUNTIME_ROOT}/core/providers/tensorrt/version_script.lds -Xlinker --gc-sections")
- target_link_libraries(onnxruntime_providers_tensorrt PRIVATE nsync::nsync_cpp)
elseif(WIN32)
set_property(TARGET onnxruntime_providers_tensorrt APPEND_STRING PROPERTY LINK_FLAGS "-DEF:${ONNXRUNTIME_ROOT}/core/providers/tensorrt/symbols.def")
else()
diff --git a/cmake/onnxruntime_providers_tvm.cmake b/cmake/onnxruntime_providers_tvm.cmake
deleted file mode 100644
index 8fd50c70dd5d7..0000000000000
--- a/cmake/onnxruntime_providers_tvm.cmake
+++ /dev/null
@@ -1,64 +0,0 @@
-# Copyright (c) Microsoft Corporation. All rights reserved.
-# Licensed under the MIT License.
-
- add_definitions(-DUSE_TVM=1)
- if (onnxruntime_TVM_USE_HASH)
- add_definitions(-DUSE_TVM_HASH=1)
- endif()
-
- if (onnxruntime_TVM_USE_HASH)
- file (GLOB_RECURSE onnxruntime_providers_tvm_cc_srcs CONFIGURE_DEPENDS
- "${ONNXRUNTIME_ROOT}/core/providers/tvm/*.h"
- "${ONNXRUNTIME_ROOT}/core/providers/tvm/*.cc"
- )
- else()
- file (GLOB onnxruntime_providers_tvm_cc_srcs CONFIGURE_DEPENDS
- "${ONNXRUNTIME_ROOT}/core/providers/tvm/*.h"
- "${ONNXRUNTIME_ROOT}/core/providers/tvm/*.cc"
- )
- endif()
-
- source_group(TREE ${ONNXRUNTIME_ROOT}/core FILES ${onnxruntime_providers_tvm_cc_srcs})
- onnxruntime_add_static_library(onnxruntime_providers_tvm ${onnxruntime_providers_tvm_cc_srcs})
-
- if ( CMAKE_COMPILER_IS_GNUCC )
- target_compile_options(onnxruntime_providers_tvm PRIVATE -Wno-unused-parameter -Wno-missing-field-initializers)
- endif()
-
- target_include_directories(onnxruntime_providers_tvm PRIVATE
- ${TVM_INCLUDES}
- ${PYTHON_INCLUDE_DIRS})
- onnxruntime_add_include_to_target(onnxruntime_providers_tvm onnxruntime_common onnxruntime_framework onnx onnx_proto ${PROTOBUF_LIB} flatbuffers::flatbuffers Boost::mp11 safeint_interface)
-
- add_dependencies(onnxruntime_providers_tvm ${onnxruntime_EXTERNAL_DEPENDENCIES})
-
- if (onnxruntime_TVM_USE_HASH)
- add_dependencies(onnxruntime_providers_tvm ippcp_s)
- target_include_directories(onnxruntime_providers_tvm PRIVATE ${IPP_CRYPTO_INCLUDE_DIR})
- target_link_libraries(onnxruntime_providers_tvm PRIVATE ippcp_s)
- endif()
-
- set_target_properties(onnxruntime_providers_tvm PROPERTIES FOLDER "ONNXRuntime")
- set_target_properties(onnxruntime_providers_tvm PROPERTIES LINKER_LANGUAGE CXX)
-
- if (WIN32 AND MSVC)
- # wd4100: identifier' : unreferenced formal parameter
- # wd4127: conditional expression is constant
- # wd4244: conversion from 'int' to 'char', possible loss of data
- # TODO: 4244 should not be disabled
- target_compile_options(onnxruntime_providers_tvm PRIVATE "/wd4100" "/wd4127" "/wd4244")
- else()
- target_compile_options(onnxruntime_providers_tvm PRIVATE "-Wno-error=type-limits")
- endif()
- target_compile_definitions(onnxruntime_providers_tvm PUBLIC DMLC_USE_LOGGING_LIBRARY=)
-
- install(FILES ${PROJECT_SOURCE_DIR}/../include/onnxruntime/core/providers/tvm/tvm_provider_factory.h
- DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/onnxruntime/)
-
- if (NOT onnxruntime_BUILD_SHARED_LIB)
- install(TARGETS onnxruntime_providers_tvm
- ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR}
- LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR}
- RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR}
- FRAMEWORK DESTINATION ${CMAKE_INSTALL_BINDIR})
- endif()
\ No newline at end of file
diff --git a/cmake/onnxruntime_providers_vitisai.cmake b/cmake/onnxruntime_providers_vitisai.cmake
index 764cde9491da8..561a323533f48 100644
--- a/cmake/onnxruntime_providers_vitisai.cmake
+++ b/cmake/onnxruntime_providers_vitisai.cmake
@@ -12,6 +12,7 @@
file(GLOB onnxruntime_providers_vitisai_cc_srcs CONFIGURE_DEPENDS
"${ONNXRUNTIME_ROOT}/core/providers/vitisai/*.cc"
"${ONNXRUNTIME_ROOT}/core/providers/vitisai/*.h"
+ "${ONNXRUNTIME_ROOT}/core/providers/vitisai/include/vaip/*.h"
"${ONNXRUNTIME_ROOT}/core/providers/vitisai/imp/*.cc"
"${ONNXRUNTIME_ROOT}/core/providers/vitisai/imp/*.h"
"${ONNXRUNTIME_ROOT}/core/providers/shared_library/*.h"
diff --git a/cmake/onnxruntime_providers_vsinpu.cmake b/cmake/onnxruntime_providers_vsinpu.cmake
index 4b987fd1e424b..e3b6c3c302c82 100644
--- a/cmake/onnxruntime_providers_vsinpu.cmake
+++ b/cmake/onnxruntime_providers_vsinpu.cmake
@@ -11,7 +11,7 @@
add_library(onnxruntime_providers_vsinpu ${onnxruntime_providers_vsinpu_srcs})
onnxruntime_add_include_to_target(onnxruntime_providers_vsinpu
onnxruntime_common onnxruntime_framework onnx onnx_proto protobuf::libprotobuf-lite flatbuffers Boost::mp11
- safeint_interface nsync::nsync_cpp)
+ safeint_interface )
add_dependencies(onnxruntime_providers_vsinpu ${onnxruntime_EXTERNAL_DEPENDENCIES})
set_target_properties(onnxruntime_providers_vsinpu PROPERTIES FOLDER "ONNXRuntime" LINKER_LANGUAGE CXX)
target_include_directories(onnxruntime_providers_vsinpu PRIVATE ${ONNXRUNTIME_ROOT} $ENV{TIM_VX_INSTALL}/include)
diff --git a/cmake/onnxruntime_providers_webgpu.cmake b/cmake/onnxruntime_providers_webgpu.cmake
index eb25c55ab23e0..fea5964f0dda9 100644
--- a/cmake/onnxruntime_providers_webgpu.cmake
+++ b/cmake/onnxruntime_providers_webgpu.cmake
@@ -22,6 +22,25 @@
onnxruntime_add_static_library(onnxruntime_providers_webgpu ${onnxruntime_providers_webgpu_cc_srcs})
onnxruntime_add_include_to_target(onnxruntime_providers_webgpu
onnxruntime_common dawn::dawncpp_headers dawn::dawn_headers onnx onnx_proto flatbuffers::flatbuffers Boost::mp11 safeint_interface)
- target_link_libraries(onnxruntime_providers_webgpu dawn::dawn_native dawn::dawn_proc)
+
+ if (onnxruntime_BUILD_DAWN_MONOLITHIC_LIBRARY)
+ target_link_libraries(onnxruntime_providers_webgpu dawn::webgpu_dawn)
+
+ if (onnxruntime_ENABLE_DELAY_LOADING_WIN_DLLS)
+ list(APPEND onnxruntime_DELAYLOAD_FLAGS "/DELAYLOAD:webgpu_dawn.dll")
+ endif()
+
+ # Copy webgpu_dawn.dll to the output directory
+ add_custom_command(
+ TARGET onnxruntime_providers_webgpu
+ POST_BUILD
+ COMMAND ${CMAKE_COMMAND} -E copy_if_different "$<TARGET_FILE:dawn::webgpu_dawn>" "$<TARGET_FILE_DIR:onnxruntime_providers_webgpu>"
+ VERBATIM )
+ else()
+ if (NOT onnxruntime_USE_EXTERNAL_DAWN)
+ target_link_libraries(onnxruntime_providers_webgpu dawn::dawn_native)
+ endif()
+ target_link_libraries(onnxruntime_providers_webgpu dawn::dawn_proc)
+ endif()
set_target_properties(onnxruntime_providers_webgpu PROPERTIES FOLDER "ONNXRuntime")
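
With the monolithic Dawn option, `webgpu_dawn.dll` becomes a runtime dependency, so the hunk registers a POST_BUILD step that copies it next to the consuming target; the two generator expressions resolve to the dependency's output file and the consumer's output directory at build time, keeping the command configuration-agnostic. A minimal sketch with hypothetical targets:

```cmake
add_library(dep SHARED dep.cpp)         # hypothetical runtime dependency
add_library(consumer SHARED consumer.cpp)  # hypothetical consumer
target_link_libraries(consumer PRIVATE dep)

# Copy the dependency's DLL beside the consumer after each build.
add_custom_command(TARGET consumer POST_BUILD
  COMMAND ${CMAKE_COMMAND} -E copy_if_different
          "$<TARGET_FILE:dep>" "$<TARGET_FILE_DIR:consumer>"
  VERBATIM)
```
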
diff --git a/cmake/onnxruntime_python.cmake b/cmake/onnxruntime_python.cmake
index 0d038d210ea2b..5a87252b08573 100644
--- a/cmake/onnxruntime_python.cmake
+++ b/cmake/onnxruntime_python.cmake
@@ -110,17 +110,17 @@ if (onnxruntime_USE_NCCL)
endif()
if(APPLE)
- set(ONNXRUNTIME_SO_LINK_FLAG "-Xlinker -exported_symbols_list -Xlinker ${ONNXRUNTIME_ROOT}/python/exported_symbols.lst")
+ target_link_options(onnxruntime_pybind11_state PRIVATE "LINKER:-exported_symbols_list,${ONNXRUNTIME_ROOT}/python/exported_symbols.lst")
elseif(UNIX)
if (onnxruntime_ENABLE_EXTERNAL_CUSTOM_OP_SCHEMAS)
- set(ONNXRUNTIME_SO_LINK_FLAG "-Xlinker --version-script=${ONNXRUNTIME_ROOT}/python/version_script_expose_onnx_protobuf.lds -Xlinker --gc-sections")
+ target_link_options(onnxruntime_pybind11_state PRIVATE "LINKER:--version-script=${ONNXRUNTIME_ROOT}/python/version_script_expose_onnx_protobuf.lds" "LINKER:--gc-sections")
else()
if (NOT CMAKE_SYSTEM_NAME MATCHES "AIX")
- set(ONNXRUNTIME_SO_LINK_FLAG "-Xlinker --version-script=${ONNXRUNTIME_ROOT}/python/version_script.lds -Xlinker --gc-sections")
+ target_link_options(onnxruntime_pybind11_state PRIVATE "LINKER:--version-script=${ONNXRUNTIME_ROOT}/python/version_script.lds" "LINKER:--gc-sections")
endif()
endif()
else()
- set(ONNXRUNTIME_SO_LINK_FLAG "-DEF:${ONNXRUNTIME_ROOT}/python/pybind.def")
+ target_link_options(onnxruntime_pybind11_state PRIVATE "-DEF:${ONNXRUNTIME_ROOT}/python/pybind.def")
endif()
if (onnxruntime_ENABLE_ATEN)
@@ -169,8 +169,8 @@ endif()
target_link_libraries(onnxruntime_pybind11_state PRIVATE
onnxruntime_session
${onnxruntime_libs}
- ${PROVIDERS_TVM}
${PROVIDERS_NNAPI}
+ ${PROVIDERS_VSINPU}
${PROVIDERS_XNNPACK}
${PROVIDERS_COREML}
${PROVIDERS_RKNPU}
@@ -184,7 +184,6 @@ target_link_libraries(onnxruntime_pybind11_state PRIVATE
onnxruntime_optimizer
onnxruntime_providers
onnxruntime_util
- ${onnxruntime_tvm_libs}
onnxruntime_lora
onnxruntime_framework
onnxruntime_util
@@ -199,11 +198,11 @@ set(onnxruntime_pybind11_state_dependencies
${onnxruntime_EXTERNAL_DEPENDENCIES}
${pybind11_dep}
)
-set_property(TARGET onnxruntime_pybind11_state APPEND_STRING PROPERTY LINK_FLAGS ${ONNXRUNTIME_SO_LINK_FLAG} ${onnxruntime_DELAYLOAD_FLAGS})
+
add_dependencies(onnxruntime_pybind11_state ${onnxruntime_pybind11_state_dependencies})
if (MSVC)
- set_target_properties(onnxruntime_pybind11_state PROPERTIES LINK_FLAGS "${ONNXRUNTIME_SO_LINK_FLAG}")
+ target_link_options(onnxruntime_pybind11_state PRIVATE ${onnxruntime_DELAYLOAD_FLAGS})
# if MSVC, pybind11 undefines _DEBUG in pybind11/detail/common.h, which causes the pragma in pyconfig.h
# from the python installation to require the release version of the lib
# e.g. from a python 3.10 install:
@@ -220,14 +219,15 @@ if (MSVC)
# Explicitly use the release version of the python library to make the project file consistent with this.
target_link_libraries(onnxruntime_pybind11_state PRIVATE ${Python_LIBRARY_RELEASE})
elseif (APPLE)
- set_target_properties(onnxruntime_pybind11_state PROPERTIES LINK_FLAGS "${ONNXRUNTIME_SO_LINK_FLAG} -Xlinker -undefined -Xlinker dynamic_lookup")
+ # The following flag no longer works
+ #target_link_options(onnxruntime_pybind11_state PRIVATE "LINKER:-undefined,dynamic_lookup")
set_target_properties(onnxruntime_pybind11_state PROPERTIES
INSTALL_RPATH "@loader_path"
BUILD_WITH_INSTALL_RPATH TRUE
INSTALL_RPATH_USE_LINK_PATH FALSE)
else()
if (NOT CMAKE_SYSTEM_NAME MATCHES "AIX")
- set_property(TARGET onnxruntime_pybind11_state APPEND_STRING PROPERTY LINK_FLAGS " -Xlinker -rpath=\\$ORIGIN")
+ target_link_options(onnxruntime_pybind11_state PRIVATE "LINKER:-rpath=\$ORIGIN")
endif()
endif()
@@ -238,8 +238,8 @@ if (onnxruntime_ENABLE_EXTERNAL_CUSTOM_OP_SCHEMAS)
MATH(EXPR PROTOBUF_INDEX_NEXT "${PROTOBUF_INDEX} + 1")
if (ONNX_INDEX GREATER_EQUAL 0 AND PROTOBUF_INDEX GREATER_EQUAL 0)
# Expect protobuf to follow onnx due to dependence
- list(INSERT onnxruntime_CUSTOM_EXTERNAL_LIBRARIES ${ONNX_INDEX} "-Wl,--no-as-needed")
- list(INSERT onnxruntime_CUSTOM_EXTERNAL_LIBRARIES ${PROTOBUF_INDEX_NEXT} "-Wl,--as-needed")
+ list(INSERT onnxruntime_CUSTOM_EXTERNAL_LIBRARIES ${ONNX_INDEX} "LINKER:--no-as-needed")
+ list(INSERT onnxruntime_CUSTOM_EXTERNAL_LIBRARIES ${PROTOBUF_INDEX_NEXT} "LINKER:--as-needed")
else()
message(FATAL_ERROR "Required external libraries onnx and protobuf are not found in onnxruntime_EXTERNAL_LIBRARIES")
endif()
@@ -964,37 +964,6 @@ if (onnxruntime_USE_ROCM)
)
endif()
-if (onnxruntime_USE_TVM)
- file(GLOB onnxruntime_python_providers_tvm_srcs CONFIGURE_DEPENDS
- "${ONNXRUNTIME_ROOT}/python/providers/tvm/*.py"
- )
- add_custom_command(
- TARGET onnxruntime_pybind11_state POST_BUILD
- COMMAND ${CMAKE_COMMAND} -E make_directory $<TARGET_FILE_DIR:${build_output_target}>/onnxruntime/providers
- COMMAND ${CMAKE_COMMAND} -E make_directory $<TARGET_FILE_DIR:${build_output_target}>/onnxruntime/providers/tvm
- COMMAND ${CMAKE_COMMAND} -E copy
- ${onnxruntime_python_providers_tvm_srcs}
- $<TARGET_FILE_DIR:${build_output_target}>/onnxruntime/providers/tvm
- COMMAND ${CMAKE_COMMAND} -E copy
- $<TARGET_FILE:onnxruntime_providers_tvm>
- $<TARGET_FILE_DIR:${build_output_target}>/onnxruntime/capi/
- )
-
- add_custom_command(
- TARGET onnxruntime_pybind11_state POST_BUILD
- WORKING_DIRECTORY ${tvm_SOURCE_DIR}/python
- COMMAND ${Python_EXECUTABLE} setup.py bdist_wheel
- )
-
- add_custom_command(
- TARGET onnxruntime_pybind11_state POST_BUILD
- COMMAND ${Python_EXECUTABLE}
- $<TARGET_FILE_DIR:${build_output_target}>/onnxruntime/providers/tvm/extend_python_file.py
- --target_file $<TARGET_FILE_DIR:${build_output_target}>/onnxruntime/capi/_ld_preload.py
- )
-
-endif()
-
if (onnxruntime_USE_DML)
if (NOT onnxruntime_USE_CUSTOM_DIRECTML)
set(dml_shared_lib_path ${DML_PACKAGE_DIR}/bin/${onnxruntime_target_platform}-win/${DML_SHARED_LIB})
@@ -1050,4 +1019,13 @@ if (onnxruntime_USE_QNN)
endif()
endif()
+if (onnxruntime_USE_VSINPU)
+ add_custom_command(
+ TARGET onnxruntime_pybind11_state POST_BUILD
+ COMMAND ${CMAKE_COMMAND} -E copy
+ $<TARGET_FILE:onnxruntime_providers_vsinpu>
+ $<TARGET_FILE_DIR:${build_output_target}>/onnxruntime/capi/
+ )
+endif()
+
endif()
diff --git a/cmake/onnxruntime_rocm_hipify.cmake b/cmake/onnxruntime_rocm_hipify.cmake
index fcddd2a51e0d1..111033c780712 100644
--- a/cmake/onnxruntime_rocm_hipify.cmake
+++ b/cmake/onnxruntime_rocm_hipify.cmake
@@ -157,10 +157,6 @@ set(provider_excluded_files
"cuda_execution_provider_info.h"
"cuda_execution_provider.cc"
"cuda_execution_provider.h"
- "cuda_memory_check.cc"
- "cuda_memory_check.h"
- "cuda_fence.cc"
- "cuda_fence.h"
"cuda_kernel.h"
"cuda_pch.cc"
"cuda_pch.h"
diff --git a/cmake/onnxruntime_unittests.cmake b/cmake/onnxruntime_unittests.cmake
index 619c3a784d5f9..e822f0a3655fc 100644
--- a/cmake/onnxruntime_unittests.cmake
+++ b/cmake/onnxruntime_unittests.cmake
@@ -9,9 +9,6 @@ set(TEST_INC_DIR ${ONNXRUNTIME_ROOT})
if (onnxruntime_ENABLE_TRAINING)
list(APPEND TEST_INC_DIR ${ORTTRAINING_ROOT})
endif()
-if (onnxruntime_USE_TVM)
- list(APPEND TEST_INC_DIR ${TVM_INCLUDES})
-endif()
set(disabled_warnings)
function(AddTest)
@@ -67,7 +64,10 @@ function(AddTest)
if(onnxruntime_USE_CUDA)
#XXX: we should not need to do this. onnxruntime_test_all.exe should not have direct dependency on CUDA DLLs,
# otherwise it will impact when CUDA DLLs can be unloaded.
- target_link_libraries(${_UT_TARGET} PRIVATE CUDA::cudart cudnn_frontend)
+ target_link_libraries(${_UT_TARGET} PRIVATE CUDA::cudart)
+ if(NOT onnxruntime_CUDA_MINIMAL)
+ target_link_libraries(${_UT_TARGET} PRIVATE cudnn_frontend)
+ endif()
endif()
target_link_libraries(${_UT_TARGET} PRIVATE ${_UT_LIBS} GTest::gtest GTest::gmock ${onnxruntime_EXTERNAL_LIBRARIES})
endif()
@@ -111,7 +111,6 @@ function(AddTest)
endif()
target_compile_options(${_UT_TARGET} PRIVATE ${disabled_warnings})
else()
- target_compile_options(${_UT_TARGET} PRIVATE ${DISABLED_WARNINGS_FOR_TVM})
 target_compile_options(${_UT_TARGET} PRIVATE "$<$<COMPILE_LANGUAGE:CUDA>:SHELL:--compiler-options -Wno-error=sign-compare>"
 "$<$<NOT:$<COMPILE_LANGUAGE:CUDA>>:-Wno-error=sign-compare>")
if (${HAS_NOERROR})
@@ -134,9 +133,14 @@ function(AddTest)
if (IOS)
# target_sources(${_UT_TARGET} PRIVATE ${TEST_SRC_DIR}/xctest/orttestmain.m)
+
+ set(_UT_IOS_BUNDLE_GUI_IDENTIFIER com.onnxruntime.utest.${_UT_TARGET})
+ # replace any characters that are not valid in a bundle identifier with '-'
+ string(REGEX REPLACE "[^a-zA-Z0-9\\.-]" "-" _UT_IOS_BUNDLE_GUI_IDENTIFIER ${_UT_IOS_BUNDLE_GUI_IDENTIFIER})
+
set_target_properties(${_UT_TARGET} PROPERTIES FOLDER "ONNXRuntimeTest"
MACOSX_BUNDLE_BUNDLE_NAME ${_UT_TARGET}
- MACOSX_BUNDLE_GUI_IDENTIFIER com.onnxruntime.utest.${_UT_TARGET}
+ MACOSX_BUNDLE_GUI_IDENTIFIER ${_UT_IOS_BUNDLE_GUI_IDENTIFIER}
MACOSX_BUNDLE_LONG_VERSION_STRING ${ORT_VERSION}
MACOSX_BUNDLE_BUNDLE_VERSION ${ORT_VERSION}
MACOSX_BUNDLE_SHORT_VERSION_STRING ${ORT_VERSION}
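
Apple bundle identifiers only allow alphanumerics, '.' and '-', while test target names routinely contain '_'; the regex replace above therefore rewrites every disallowed character to '-' before the identifier is used for both the app bundle and the XCTest bundle below. A small usage sketch:

```cmake
set(target_name "onnxruntime_test_all")  # example input
set(bundle_id "com.onnxruntime.utest.${target_name}")
# Any character outside [a-zA-Z0-9.-] becomes '-'.
string(REGEX REPLACE "[^a-zA-Z0-9\\.-]" "-" bundle_id "${bundle_id}")
message(STATUS "${bundle_id}")  # -> com.onnxruntime.utest.onnxruntime-test-all
```
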
@@ -163,13 +167,31 @@ function(AddTest)
set_target_properties(${_UT_TARGET}_xc PROPERTIES FOLDER "ONNXRuntimeXCTest"
MACOSX_BUNDLE_BUNDLE_NAME ${_UT_TARGET}_xc
- MACOSX_BUNDLE_GUI_IDENTIFIER com.onnxruntime.utest.${_UT_TARGET}
+ MACOSX_BUNDLE_GUI_IDENTIFIER ${_UT_IOS_BUNDLE_GUI_IDENTIFIER}
MACOSX_BUNDLE_LONG_VERSION_STRING ${ORT_VERSION}
MACOSX_BUNDLE_BUNDLE_VERSION ${ORT_VERSION}
MACOSX_BUNDLE_SHORT_VERSION_STRING ${ORT_VERSION}
XCODE_ATTRIBUTE_ENABLE_BITCODE "NO")
- xctest_add_test(xctest.${_UT_TARGET} ${_UT_TARGET}_xc)
+ # This is a workaround for an Xcode 16 / CMake issue:
+ # error: Multiple commands produce '/Debug/Debug-iphonesimulator/onnxruntime_test_all.app/PlugIns'
+ # note: CreateBuildDirectory /Debug/Debug-iphonesimulator/onnxruntime_test_all.app/PlugIns
+ # note: Target 'onnxruntime_test_all' (project 'onnxruntime') has create directory command with output
+ # '/Debug/Debug-iphonesimulator/onnxruntime_test_all.app/PlugIns'
+ #
+ # It seems related to the test target (e.g., onnxruntime_test_all_xc) LIBRARY_OUTPUT_DIRECTORY property getting set
+ # to "$/PlugIns" in xctest_add_bundle():
+ # https://github.com/Kitware/CMake/blob/9c4a0a9ff09735b847bbbc38caf6da7f6c7238f2/Modules/FindXCTest.cmake#L159-L168
+ #
+ # This is the related CMake issue: https://gitlab.kitware.com/cmake/cmake/-/issues/26301
+ #
+ # Unsetting LIBRARY_OUTPUT_DIRECTORY avoids the build error.
+ set_property(TARGET ${_UT_TARGET}_xc PROPERTY LIBRARY_OUTPUT_DIRECTORY)
+
+ # Don't bother calling xctest_add_test() because we don't use CTest to run tests on iOS.
+ # Instead, we can call 'xcodebuild test-without-building' and specify a '-destination' referring to an iOS
+ # simulator or device.
+ # xctest_add_test(xctest.${_UT_TARGET} ${_UT_TARGET}_xc)
else()
if (CMAKE_SYSTEM_NAME STREQUAL "Emscripten")
# We might have already executed the following "find_program" code when we build ORT nodejs binding.
@@ -500,6 +522,9 @@ set (onnxruntime_global_thread_pools_test_SRC
${ONNXRUNTIME_GLOBAL_THREAD_POOLS_TEST_SRC_DIR}/test_main.cc
${ONNXRUNTIME_GLOBAL_THREAD_POOLS_TEST_SRC_DIR}/test_inference.cc)
+set (onnxruntime_webgpu_external_dawn_test_SRC
+ ${TEST_SRC_DIR}/webgpu/external_dawn/main.cc)
+
# tests from lowest level library up.
# the order of libraries should be maintained, with higher libraries being added first in the list
@@ -615,13 +640,11 @@ set(ONNXRUNTIME_TEST_LIBS
${PROVIDERS_ACL}
${PROVIDERS_ARMNN}
${PROVIDERS_COREML}
- # ${PROVIDERS_TVM}
${PROVIDERS_XNNPACK}
${PROVIDERS_AZURE}
onnxruntime_optimizer
onnxruntime_providers
onnxruntime_util
- ${onnxruntime_tvm_libs}
onnxruntime_lora
onnxruntime_framework
onnxruntime_util
@@ -723,12 +746,6 @@ if(onnxruntime_USE_AZURE)
list(APPEND onnxruntime_test_providers_libs onnxruntime_providers_azure)
endif()
-if(WIN32)
- if (onnxruntime_USE_TVM)
- list(APPEND disabled_warnings ${DISABLED_WARNINGS_FOR_TVM})
- endif()
-endif()
-
file(GLOB onnxruntime_test_framework_src CONFIGURE_DEPENDS
${onnxruntime_test_framework_src_patterns}
)
@@ -743,9 +760,7 @@ if(MSVC)
 target_compile_options(onnxruntime_test_utils PRIVATE "$<$<COMPILE_LANGUAGE:CUDA>:SHELL:--compiler-options /wd6326>"
 "$<$<NOT:$<COMPILE_LANGUAGE:CUDA>>:/wd6326>")
else()
- target_compile_definitions(onnxruntime_test_utils PUBLIC -DNSYNC_ATOMIC_CPP11)
target_include_directories(onnxruntime_test_utils PRIVATE ${CMAKE_CURRENT_BINARY_DIR} ${ONNXRUNTIME_ROOT})
- onnxruntime_add_include_to_target(onnxruntime_test_utils nsync::nsync_cpp)
endif()
if (onnxruntime_USE_NCCL)
target_include_directories(onnxruntime_test_utils PRIVATE ${NCCL_INCLUDE_DIRS})
@@ -779,9 +794,7 @@ if(NOT IOS)
 target_compile_options(onnx_test_runner_common PRIVATE "$<$<COMPILE_LANGUAGE:CUDA>:SHELL:--compiler-options /utf-8>"
 "$<$<NOT:$<COMPILE_LANGUAGE:CUDA>>:/utf-8>")
else()
- target_compile_definitions(onnx_test_runner_common PUBLIC -DNSYNC_ATOMIC_CPP11)
target_include_directories(onnx_test_runner_common PRIVATE ${CMAKE_CURRENT_BINARY_DIR} ${ONNXRUNTIME_ROOT})
- onnxruntime_add_include_to_target(onnx_test_runner_common nsync::nsync_cpp)
endif()
if (MSVC AND NOT CMAKE_SIZEOF_VOID_P EQUAL 8)
#TODO: fix the warnings, they are dangerous
@@ -833,9 +846,6 @@ if (onnxruntime_ENABLE_TRAINING_APIS)
list(APPEND all_tests ${onnxruntime_test_training_api_src})
endif()
-if (onnxruntime_USE_TVM)
- list(APPEND all_tests ${onnxruntime_test_tvm_src})
-endif()
if (onnxruntime_USE_OPENVINO)
list(APPEND all_tests ${onnxruntime_test_openvino_src})
@@ -1067,15 +1077,6 @@ if (NOT onnxruntime_ENABLE_TRAINING_TORCH_INTEROP)
 COMMAND ${CMAKE_COMMAND} -E copy ${DNNL_DLL_PATH} $<TARGET_FILE_DIR:${test_data_target}>
)
endif()
- if(WIN32)
- if (onnxruntime_USE_TVM)
- add_custom_command(
- TARGET ${test_data_target} POST_BUILD
- COMMAND ${CMAKE_COMMAND} -E copy $<TARGET_FILE:tvm> $<TARGET_FILE_DIR:${test_data_target}>
- )
- endif()
- endif()
-
if(WIN32)
set(wide_get_opt_src_dir ${TEST_SRC_DIR}/win_getopt/wide)
onnxruntime_add_static_library(win_getopt_wide ${wide_get_opt_src_dir}/getopt.cc ${wide_get_opt_src_dir}/include/getopt.h)
@@ -1117,12 +1118,6 @@ if (NOT IOS)
endif()
set_target_properties(onnx_test_runner PROPERTIES FOLDER "ONNXRuntimeTest")
- if (onnxruntime_USE_TVM)
- if (WIN32)
- target_link_options(onnx_test_runner PRIVATE "/STACK:4000000")
- endif()
- endif()
-
install(TARGETS onnx_test_runner
ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR}
LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR}
@@ -1146,7 +1141,8 @@ if (NOT onnxruntime_ENABLE_TRAINING_TORCH_INTEROP)
${BENCHMARK_DIR}/gelu.cc
${BENCHMARK_DIR}/activation.cc
${BENCHMARK_DIR}/quantize.cc
- ${BENCHMARK_DIR}/reduceminmax.cc)
+ ${BENCHMARK_DIR}/reduceminmax.cc
+ ${BENCHMARK_DIR}/layer_normalization.cc)
target_include_directories(onnxruntime_benchmark PRIVATE ${ONNXRUNTIME_ROOT} ${onnxruntime_graph_header} ${ONNXRUNTIME_ROOT}/core/mlas/inc)
target_compile_definitions(onnxruntime_benchmark PRIVATE BENCHMARK_STATIC_DEFINE)
if(WIN32)
@@ -1183,7 +1179,7 @@ if (NOT onnxruntime_ENABLE_TRAINING_TORCH_INTEROP)
# "Global initializer calls a non-constexpr function." BENCHMARK_CAPTURE macro needs this.
target_compile_options(onnxruntime_mlas_benchmark PRIVATE /wd26426)
else()
- target_link_libraries(onnxruntime_mlas_benchmark PRIVATE nsync::nsync_cpp ${CMAKE_DL_LIBS})
+ target_link_libraries(onnxruntime_mlas_benchmark PRIVATE ${CMAKE_DL_LIBS})
endif()
if (CPUINFO_SUPPORTED AND NOT CMAKE_SYSTEM_NAME STREQUAL "Emscripten")
target_link_libraries(onnxruntime_mlas_benchmark PRIVATE cpuinfo)
@@ -1256,7 +1252,6 @@ if (NOT onnxruntime_ENABLE_TRAINING_TORCH_INTEROP)
${onnxruntime_EXTERNAL_LIBRARIES}
${GETOPT_LIB_WIDE} ${SYS_PATH_LIB} ${CMAKE_DL_LIBS})
if(NOT WIN32)
- list(APPEND onnxruntime_perf_test_libs nsync::nsync_cpp)
if(onnxruntime_USE_SNPE)
list(APPEND onnxruntime_perf_test_libs onnxruntime_providers_snpe)
endif()
@@ -1276,11 +1271,6 @@ if (NOT onnxruntime_ENABLE_TRAINING_TORCH_INTEROP)
endif()
set_target_properties(onnxruntime_perf_test PROPERTIES FOLDER "ONNXRuntimeTest")
- if (onnxruntime_USE_TVM)
- if (WIN32)
- target_link_options(onnxruntime_perf_test PRIVATE "/STACK:4000000")
- endif()
- endif()
endif()
@@ -1324,7 +1314,6 @@ if (NOT onnxruntime_ENABLE_TRAINING_TORCH_INTEROP)
# test inference using shared lib
set(onnxruntime_shared_lib_test_LIBS onnxruntime_mocked_allocator onnxruntime_test_utils onnxruntime_common onnx_proto)
if(NOT WIN32)
- list(APPEND onnxruntime_shared_lib_test_LIBS nsync::nsync_cpp)
if(onnxruntime_USE_SNPE)
list(APPEND onnxruntime_shared_lib_test_LIBS onnxruntime_providers_snpe)
endif()
@@ -1473,7 +1462,7 @@ if (NOT onnxruntime_ENABLE_TRAINING_TORCH_INTEROP)
target_link_libraries(onnxruntime_mlas_test PRIVATE cpuinfo)
endif()
if(NOT WIN32)
- target_link_libraries(onnxruntime_mlas_test PRIVATE nsync::nsync_cpp ${CMAKE_DL_LIBS})
+ target_link_libraries(onnxruntime_mlas_test PRIVATE ${CMAKE_DL_LIBS})
endif()
if (CMAKE_SYSTEM_NAME STREQUAL "Android")
target_link_libraries(onnxruntime_mlas_test PRIVATE ${android_shared_libs})
@@ -1659,9 +1648,7 @@ if (NOT CMAKE_SYSTEM_NAME STREQUAL "Emscripten")
${ONNXRUNTIME_CUSTOM_OP_REGISTRATION_TEST_SRC_DIR}/test_registercustomops.cc)
set(onnxruntime_customopregistration_test_LIBS custom_op_library onnxruntime_common onnxruntime_test_utils)
- if (NOT WIN32)
- list(APPEND onnxruntime_customopregistration_test_LIBS nsync::nsync_cpp)
- endif()
+
if (CPUINFO_SUPPORTED AND NOT CMAKE_SYSTEM_NAME STREQUAL "Emscripten")
list(APPEND onnxruntime_customopregistration_test_LIBS cpuinfo)
endif()
@@ -1669,7 +1656,7 @@ if (NOT CMAKE_SYSTEM_NAME STREQUAL "Emscripten")
list(APPEND onnxruntime_customopregistration_test_LIBS ${TENSORRT_LIBRARY_INFER})
endif()
if (${CMAKE_SYSTEM_NAME} MATCHES "AIX")
- list(APPEND onnxruntime_customopregistration_test_LIBS onnxruntime_graph onnxruntime_session onnxruntime_providers onnxruntime_lora onnxruntime_framework onnxruntime_util onnxruntime_mlas onnxruntime_optimizer onnxruntime_flatbuffers iconv re2 ${PROTOBUF_LIB} onnx onnx_proto nsync_cpp)
+ list(APPEND onnxruntime_customopregistration_test_LIBS onnxruntime_graph onnxruntime_session onnxruntime_providers onnxruntime_lora onnxruntime_framework onnxruntime_util onnxruntime_mlas onnxruntime_optimizer onnxruntime_flatbuffers iconv re2 ${PROTOBUF_LIB} onnx onnx_proto)
endif()
AddTest(DYN
TARGET onnxruntime_customopregistration_test
@@ -1788,11 +1775,11 @@ if (onnxruntime_BUILD_SHARED_LIB AND NOT CMAKE_SYSTEM_NAME STREQUAL "Emscripten"
set(onnxruntime_logging_apis_test_LIBS onnxruntime_common onnxruntime_test_utils)
if (${CMAKE_SYSTEM_NAME} MATCHES "AIX")
- list(APPEND onnxruntime_logging_apis_test_LIBS onnxruntime_session onnxruntime_util onnxruntime_lora onnxruntime_framework onnxruntime_common onnxruntime_graph onnxruntime_providers onnxruntime_mlas onnxruntime_optimizer onnxruntime_flatbuffers iconv re2 ${PROTOBUF_LIB} onnx onnx_proto nsync_cpp)
+ list(APPEND onnxruntime_logging_apis_test_LIBS onnxruntime_session onnxruntime_util onnxruntime_lora onnxruntime_framework onnxruntime_common onnxruntime_graph onnxruntime_providers onnxruntime_mlas onnxruntime_optimizer onnxruntime_flatbuffers iconv re2 ${PROTOBUF_LIB} onnx onnx_proto)
endif()
if(NOT WIN32)
- list(APPEND onnxruntime_logging_apis_test_LIBS nsync::nsync_cpp ${CMAKE_DL_LIBS})
+ list(APPEND onnxruntime_logging_apis_test_LIBS ${CMAKE_DL_LIBS})
endif()
AddTest(DYN
@@ -1868,4 +1855,13 @@ if (NOT onnxruntime_MINIMAL_BUILD AND NOT onnxruntime_EXTENDED_MINIMAL_BUILD
endif()
endif()
+if (onnxruntime_USE_WEBGPU AND onnxruntime_USE_EXTERNAL_DAWN)
+ AddTest(TARGET onnxruntime_webgpu_external_dawn_test
+ SOURCES ${onnxruntime_webgpu_external_dawn_test_SRC}
+ LIBS dawn::dawn_native ${onnxruntime_test_providers_libs}
+ DEPENDS ${all_dependencies}
+ )
+ onnxruntime_add_include_to_target(onnxruntime_webgpu_external_dawn_test dawn::dawncpp_headers dawn::dawn_headers)
+endif()
+
include(onnxruntime_fuzz_test.cmake)
diff --git a/cmake/onnxruntime_webassembly.cmake b/cmake/onnxruntime_webassembly.cmake
index 3a1576065205f..66268cefac9ef 100644
--- a/cmake/onnxruntime_webassembly.cmake
+++ b/cmake/onnxruntime_webassembly.cmake
@@ -97,7 +97,6 @@ target_compile_options(onnx PRIVATE -Wno-unused-parameter -Wno-unused-variable)
if (onnxruntime_BUILD_WEBASSEMBLY_STATIC_LIB)
bundle_static_library(onnxruntime_webassembly
- nsync::nsync_cpp
${PROTOBUF_LIB}
onnx
onnx_proto
@@ -175,7 +174,6 @@ else()
endif()
target_link_libraries(onnxruntime_webassembly PRIVATE
- nsync::nsync_cpp
${PROTOBUF_LIB}
onnx
onnx_proto
@@ -194,9 +192,7 @@ else()
onnxruntime_util
re2::re2
)
-
- set(EXPORTED_RUNTIME_METHODS "'stackAlloc','stackRestore','stackSave','UTF8ToString','stringToUTF8','lengthBytesUTF8'")
-
+ set(EXPORTED_RUNTIME_METHODS "'stackAlloc','stackRestore','stackSave','UTF8ToString','stringToUTF8','lengthBytesUTF8','getValue','setValue'")
if (onnxruntime_USE_XNNPACK)
target_link_libraries(onnxruntime_webassembly PRIVATE XNNPACK)
string(APPEND EXPORTED_RUNTIME_METHODS ",'addFunction'")
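
`EXPORTED_RUNTIME_METHODS` is a single Emscripten setting, so optional entries such as `'addFunction'` are spliced into the quoted list with `string(APPEND ...)` before the value is passed once via `-s`; the `SHELL:` prefix keeps the space-containing option together as one argument group instead of letting CMake de-duplicate or split it. A compact sketch with a hypothetical wasm target:

```cmake
add_executable(mywasm main.cpp)  # hypothetical target, built under emcmake

# Base list; each entry is quoted for Emscripten's JS-array syntax.
set(EXPORTED_RUNTIME_METHODS "'stackAlloc','stackSave','stackRestore'")
if(MY_NEED_ADDFUNCTION)  # hypothetical option
  string(APPEND EXPORTED_RUNTIME_METHODS ",'addFunction'")
endif()

target_link_options(mywasm PRIVATE
  "SHELL:-s EXPORTED_RUNTIME_METHODS=[${EXPORTED_RUNTIME_METHODS}]")
```
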
@@ -217,10 +213,114 @@ else()
set(EXPORTED_FUNCTIONS "_malloc,_free")
endif()
+ if (onnxruntime_ENABLE_WEBASSEMBLY_MEMORY64)
+ set(MAXIMUM_MEMORY "17179869184")
+ target_link_options(onnxruntime_webassembly PRIVATE
+ "SHELL:-s MEMORY64=1"
+ )
+ string(APPEND CMAKE_C_FLAGS " -sMEMORY64 -Wno-experimental")
+ string(APPEND CMAKE_CXX_FLAGS " -sMEMORY64 -Wno-experimental")
+ set(SMEMORY_FLAG "-sMEMORY64")
+
+ target_compile_options(onnx PRIVATE ${SMEMORY_FLAG} -Wno-experimental)
+ target_compile_options(onnxruntime_common PRIVATE ${SMEMORY_FLAG} -Wno-experimental)
+ target_compile_options(onnxruntime_session PRIVATE ${SMEMORY_FLAG} -Wno-experimental)
+ target_compile_options(onnxruntime_framework PRIVATE ${SMEMORY_FLAG} -Wno-experimental)
+ target_compile_options(nsync_cpp PRIVATE ${SMEMORY_FLAG} -Wno-experimental)
+ target_compile_options(onnx_proto PRIVATE ${SMEMORY_FLAG} -Wno-experimental)
+ # target_compile_options(protoc PRIVATE ${SMEMORY_FLAG} -Wno-experimental)
+ target_compile_options(libprotobuf-lite PRIVATE ${SMEMORY_FLAG} -Wno-experimental)
+ target_compile_options(onnxruntime_providers PRIVATE ${SMEMORY_FLAG} -Wno-experimental)
+ target_compile_options(onnxruntime_optimizer PRIVATE ${SMEMORY_FLAG} -Wno-experimental)
+ target_compile_options(onnxruntime_mlas PRIVATE ${SMEMORY_FLAG} -Wno-experimental)
+ target_compile_options(onnxruntime_optimizer PRIVATE ${SMEMORY_FLAG} -Wno-experimental)
+ target_compile_options(onnxruntime_graph PRIVATE ${SMEMORY_FLAG} -Wno-experimental)
+ target_compile_options(onnxruntime_flatbuffers PRIVATE ${SMEMORY_FLAG} -Wno-experimental)
+ target_compile_options(onnxruntime_util PRIVATE ${SMEMORY_FLAG} -Wno-experimental)
+ target_compile_options(re2 PRIVATE ${SMEMORY_FLAG} -Wno-experimental)
+ target_compile_options(absl_flags_private_handle_accessor PRIVATE ${SMEMORY_FLAG} -Wno-experimental)
+ target_compile_options(absl_flags_internal PRIVATE ${SMEMORY_FLAG} -Wno-experimental)
+ target_compile_options(absl_flags_commandlineflag PRIVATE ${SMEMORY_FLAG} -Wno-experimental)
+ target_compile_options(absl_flags_commandlineflag_internal PRIVATE ${SMEMORY_FLAG} -Wno-experimental)
+ target_compile_options(absl_flags_marshalling PRIVATE ${SMEMORY_FLAG} -Wno-experimental)
+ target_compile_options(absl_flags_reflection PRIVATE ${SMEMORY_FLAG} -Wno-experimental)
+ target_compile_options(absl_flags_config PRIVATE ${SMEMORY_FLAG} -Wno-experimental)
+ target_compile_options(absl_flags_program_name PRIVATE ${SMEMORY_FLAG} -Wno-experimental)
+ target_compile_options(absl_cord PRIVATE ${SMEMORY_FLAG} -Wno-experimental)
+ target_compile_options(absl_cordz_info PRIVATE ${SMEMORY_FLAG} -Wno-experimental)
+ target_compile_options(absl_cord_internal PRIVATE ${SMEMORY_FLAG} -Wno-experimental)
+ target_compile_options(absl_cordz_functions PRIVATE ${SMEMORY_FLAG} -Wno-experimental)
+ target_compile_options(absl_cordz_handle PRIVATE ${SMEMORY_FLAG} -Wno-experimental)
+ target_compile_options(absl_crc_cord_state PRIVATE ${SMEMORY_FLAG} -Wno-experimental)
+ target_compile_options(absl_crc32c PRIVATE ${SMEMORY_FLAG} -Wno-experimental)
+ target_compile_options(absl_crc_internal PRIVATE ${SMEMORY_FLAG} -Wno-experimental)
+ target_compile_options(absl_crc_cpu_detect PRIVATE ${SMEMORY_FLAG} -Wno-experimental)
+ target_compile_options(absl_raw_hash_set PRIVATE ${SMEMORY_FLAG} -Wno-experimental)
+ target_compile_options(absl_hashtablez_sampler PRIVATE ${SMEMORY_FLAG} -Wno-experimental)
+ target_compile_options(absl_exponential_biased PRIVATE ${SMEMORY_FLAG} -Wno-experimental)
+ target_compile_options(absl_log_internal_conditions PRIVATE ${SMEMORY_FLAG} -Wno-experimental)
+ target_compile_options(absl_log_internal_check_op PRIVATE ${SMEMORY_FLAG} -Wno-experimental)
+ target_compile_options(absl_log_internal_message PRIVATE ${SMEMORY_FLAG} -Wno-experimental)
+ target_compile_options(absl_log_internal_format PRIVATE ${SMEMORY_FLAG} -Wno-experimental)
+ target_compile_options(absl_str_format_internal PRIVATE ${SMEMORY_FLAG} -Wno-experimental)
+ target_compile_options(absl_log_internal_log_sink_set PRIVATE ${SMEMORY_FLAG} -Wno-experimental)
+ target_compile_options(absl_log_internal_globals PRIVATE ${SMEMORY_FLAG} -Wno-experimental)
+ target_compile_options(absl_log_sink PRIVATE ${SMEMORY_FLAG} -Wno-experimental)
+ target_compile_options(absl_log_entry PRIVATE ${SMEMORY_FLAG} -Wno-experimental)
+ target_compile_options(absl_log_globals PRIVATE ${SMEMORY_FLAG} -Wno-experimental)
+ target_compile_options(absl_hash PRIVATE ${SMEMORY_FLAG} -Wno-experimental)
+ target_compile_options(absl_city PRIVATE ${SMEMORY_FLAG} -Wno-experimental)
+ target_compile_options(absl_low_level_hash PRIVATE ${SMEMORY_FLAG} -Wno-experimental)
+ target_compile_options(absl_bad_variant_access PRIVATE ${SMEMORY_FLAG} -Wno-experimental)
+ target_compile_options(absl_vlog_config_internal PRIVATE ${SMEMORY_FLAG} -Wno-experimental)
+ target_compile_options(absl_synchronization PRIVATE ${SMEMORY_FLAG} -Wno-experimental)
+ target_compile_options(absl_kernel_timeout_internal PRIVATE ${SMEMORY_FLAG} -Wno-experimental)
+ target_compile_options(absl_time PRIVATE ${SMEMORY_FLAG} -Wno-experimental)
+ target_compile_options(absl_time_zone PRIVATE ${SMEMORY_FLAG} -Wno-experimental)
+ target_compile_options(absl_civil_time PRIVATE ${SMEMORY_FLAG} -Wno-experimental)
+ target_compile_options(absl_graphcycles_internal PRIVATE ${SMEMORY_FLAG} -Wno-experimental)
+ target_compile_options(absl_bad_optional_access PRIVATE ${SMEMORY_FLAG} -Wno-experimental)
+ target_compile_options(absl_log_internal_fnmatch PRIVATE ${SMEMORY_FLAG} -Wno-experimental)
+ target_compile_options(absl_examine_stack PRIVATE ${SMEMORY_FLAG} -Wno-experimental)
+ target_compile_options(absl_symbolize PRIVATE ${SMEMORY_FLAG} -Wno-experimental)
+ target_compile_options(absl_malloc_internal PRIVATE ${SMEMORY_FLAG} -Wno-experimental)
+ target_compile_options(absl_demangle_internal PRIVATE ${SMEMORY_FLAG} -Wno-experimental)
+ target_compile_options(absl_demangle_rust PRIVATE ${SMEMORY_FLAG} -Wno-experimental)
+ target_compile_options(absl_decode_rust_punycode PRIVATE ${SMEMORY_FLAG} -Wno-experimental)
+ target_compile_options(absl_utf8_for_code_point PRIVATE ${SMEMORY_FLAG} -Wno-experimental)
+ target_compile_options(absl_stacktrace PRIVATE ${SMEMORY_FLAG} -Wno-experimental)
+ target_compile_options(absl_debugging_internal PRIVATE ${SMEMORY_FLAG} -Wno-experimental)
+ target_compile_options(absl_log_internal_proto PRIVATE ${SMEMORY_FLAG} -Wno-experimental)
+ target_compile_options(absl_strerror PRIVATE ${SMEMORY_FLAG} -Wno-experimental)
+ target_compile_options(absl_log_internal_nullguard PRIVATE ${SMEMORY_FLAG} -Wno-experimental)
+ target_compile_options(absl_strings PRIVATE ${SMEMORY_FLAG} -Wno-experimental)
+ target_compile_options(absl_strings_internal PRIVATE ${SMEMORY_FLAG} -Wno-experimental)
+ target_compile_options(absl_int128 PRIVATE ${SMEMORY_FLAG} -Wno-experimental)
+ target_compile_options(absl_string_view PRIVATE ${SMEMORY_FLAG} -Wno-experimental)
+ target_compile_options(absl_base PRIVATE ${SMEMORY_FLAG} -Wno-experimental)
+ target_compile_options(absl_spinlock_wait PRIVATE ${SMEMORY_FLAG} -Wno-experimental)
+ target_compile_options(absl_throw_delegate PRIVATE ${SMEMORY_FLAG} -Wno-experimental)
+ target_compile_options(absl_raw_logging_internal PRIVATE ${SMEMORY_FLAG} -Wno-experimental)
+ target_compile_options(absl_log_severity PRIVATE ${SMEMORY_FLAG} -Wno-experimental)
+ if (onnxruntime_USE_EXTENSIONS)
+ target_compile_options(ortcustomops PRIVATE ${SMEMORY_FLAG} -Wno-experimental)
+ target_compile_options(ocos_operators PRIVATE ${SMEMORY_FLAG} -Wno-experimental)
+ target_compile_options(noexcep_operators PRIVATE ${SMEMORY_FLAG} -Wno-experimental)
+ endif()
+ target_link_options(onnxruntime_webassembly PRIVATE
+ --post-js "${ONNXRUNTIME_ROOT}/wasm/js_post_js_64.js"
+ )
+ else ()
+ set(MAXIMUM_MEMORY "4294967296")
+ target_link_options(onnxruntime_webassembly PRIVATE
+ --post-js "${ONNXRUNTIME_ROOT}/wasm/js_post_js.js"
+ )
+ endif ()
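A note on the two branches above: the `4294967296` cap in the wasm32 branch is simply 2^32, the largest byte count a 32-bit linear memory can address, while the memory64 branch omits the cap here (its larger value is presumably set in an earlier, elided part of this diff). A minimal sketch, illustrative only and not part of the build:

```cpp
// Why the wasm32 build pins MAXIMUM_MEMORY to 4294967296: a 32-bit linear
// memory can address at most 2^32 bytes (4 GiB); wasm64 lifts this limit.
#include <cstdint>
#include <iostream>

int main() {
    const std::uint64_t wasm32_limit = std::uint64_t{1} << 32;
    std::cout << wasm32_limit << "\n"; // prints 4294967296
}
```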
+
target_link_options(onnxruntime_webassembly PRIVATE
"SHELL:-s EXPORTED_RUNTIME_METHODS=[${EXPORTED_RUNTIME_METHODS}]"
"SHELL:-s EXPORTED_FUNCTIONS=${EXPORTED_FUNCTIONS}"
- "SHELL:-s MAXIMUM_MEMORY=4294967296"
+ "SHELL:-s MAXIMUM_MEMORY=${MAXIMUM_MEMORY}"
"SHELL:-s EXIT_RUNTIME=0"
"SHELL:-s ALLOW_MEMORY_GROWTH=1"
"SHELL:-s MODULARIZE=1"
@@ -233,6 +333,41 @@ else()
--no-entry
"SHELL:--pre-js \"${ONNXRUNTIME_ROOT}/wasm/pre.js\""
)
+ if (onnxruntime_ENABLE_WEBASSEMBLY_MEMORY64)
+ set(SIGNATURE_CONVERSIONS "OrtRun:_pppppppp,\
+OrtRunWithBinding:_ppppp,\
+OrtGetTensorData:_ppppp,\
+OrtCreateTensor:p_pppp_,\
+OrtCreateSession:pppp,\
+OrtReleaseSession:_p,\
+OrtGetInputOutputCount:_ppp,\
+OrtCreateSessionOptions:pp__p_ppppp,\
+OrtReleaseSessionOptions:_p,\
+OrtAppendExecutionProvider:_pp,\
+OrtAddSessionConfigEntry:_ppp,\
+OrtGetInputName:ppp,\
+OrtGetOutputName:ppp,\
+OrtCreateRunOptions:ppp_p,\
+OrtReleaseRunOptions:_p,\
+OrtReleaseTensor:_p,\
+OrtFree:_p,\
+OrtCreateBinding:_p,\
+OrtBindInput:_ppp,\
+OrtBindOutput:_ppp_,\
+OrtClearBoundOutputs:_p,\
+OrtReleaseBinding:_p,\
+OrtGetLastError:_pp,\
+JsepOutput:pp_p,\
+JsepGetNodeName:pp,\
+jsepCopy:_pp_,\
+jsepCopyAsync:_pp_,\
+jsepDownload:_pp_")
+ target_link_options(onnxruntime_webassembly PRIVATE
+ "SHELL:-s ERROR_ON_UNDEFINED_SYMBOLS=0"
+ "SHELL:-s SIGNATURE_CONVERSIONS='${SIGNATURE_CONVERSIONS}'"
+ )
+ endif ()
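On the `SIGNATURE_CONVERSIONS` syntax (a hedged reading of the Emscripten setting): each entry is `name:sig`, where the first character describes the return value and each following character one argument; `p` marks a pointer-sized value that must be marshalled between wasm64's 64-bit representation and a JavaScript number, and `_` leaves the slot untouched. So `OrtRun:_pppppppp` reads as "plain return, eight pointer arguments". An illustrative declaration follows — the parameter names are hypothetical, not ORT's actual header:

```cpp
// Hypothetical shape of an export matching "OrtRun:_pppppppp": a plain int
// return ('_') followed by eight pointer-sized parameters ('p' x 8) that
// Emscripten must convert for the memory64 build.
#include <cstddef>

extern "C" int OrtRun(void* session,
                      const char** input_names,
                      void* const* inputs,
                      std::size_t input_count,   // size_t is pointer-sized in wasm64
                      const char** output_names,
                      std::size_t output_count,
                      void** outputs,
                      void* run_options);
```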
set_target_properties(onnxruntime_webassembly PROPERTIES LINK_DEPENDS ${ONNXRUNTIME_ROOT}/wasm/pre.js)
if (onnxruntime_USE_JSEP)
@@ -245,6 +380,8 @@ else()
"SHELL:--pre-js \"${ONNXRUNTIME_ROOT}/wasm/pre-jsep.js\""
"SHELL:-s ASYNCIFY=1"
"SHELL:-s ASYNCIFY_STACK_SIZE=65536"
+ "SHELL:-s ASYNCIFY_EXPORTS=['OrtRun']"
+ "SHELL:-s ASYNCIFY_IMPORTS=['Module.jsepCopy','Module.jsepCopyAsync','jsepDownload']"
)
set_target_properties(onnxruntime_webassembly PROPERTIES LINK_DEPENDS ${ONNXRUNTIME_ROOT}/wasm/pre-jsep.js)
endif()
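The two new ASYNCIFY lists narrow Asyncify's instrumentation: only `OrtRun` is allowed to suspend, and only the listed JSEP copy/download imports may hand control back to JavaScript. A hedged sketch of the pattern — `jsepDownload`'s real signature lives in the JS glue; the one below is assumed for illustration:

```cpp
// Sketch of an ASYNCIFY_IMPORTS call site: the import may unwind the wasm
// stack and resume when the underlying JavaScript promise resolves, so any
// export reaching it (here, a stand-in for OrtRun) must be in ASYNCIFY_EXPORTS.
#include <emscripten.h>
#include <cstddef>

extern "C" {
// Provided by the JSEP JavaScript glue; signature assumed for illustration.
extern void jsepDownload(const void* gpu_src, void* cpu_dst, std::size_t bytes);

EMSCRIPTEN_KEEPALIVE void FetchOutput(const void* gpu_buf, void* cpu_buf, std::size_t n) {
    jsepDownload(gpu_buf, cpu_buf, n); // execution may suspend here
}
}
```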
@@ -281,7 +418,9 @@ else()
endif()
# Set link flag to enable exceptions support, this will override default disabling exception throwing behavior when disable exceptions.
- target_link_options(onnxruntime_webassembly PRIVATE "SHELL:-s DISABLE_EXCEPTION_THROWING=0")
+ if (NOT onnxruntime_ENABLE_WEBASSEMBLY_MEMORY64)
+ target_link_options(onnxruntime_webassembly PRIVATE "SHELL:-s DISABLE_EXCEPTION_THROWING=0")
+ endif()
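In other words, the wasm32 build keeps re-enabling C++ throw support at link time, while the memory64 build leaves the default (throwing disabled) in place. A small illustration of what the flag changes, assuming an Emscripten toolchain:

```cpp
// Illustrative only: with -s DISABLE_EXCEPTION_THROWING=0 the runtime keeps
// real throw support, so this throw can propagate to a catch site; with
// throwing disabled (the default here), the runtime stub aborts instead.
#include <stdexcept>

extern "C" int checked_div(int a, int b) {
    if (b == 0) throw std::invalid_argument("division by zero");
    return a / b;
}
```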
if (onnxruntime_ENABLE_WEBASSEMBLY_PROFILING)
target_link_options(onnxruntime_webassembly PRIVATE --profiling --profiling-funcs)
diff --git a/cmake/patches/composable_kernel/Add_gfx12x_support.patch b/cmake/patches/composable_kernel/Add_gfx12x_support.patch
new file mode 100644
index 0000000000000..ef529184d2ed8
--- /dev/null
+++ b/cmake/patches/composable_kernel/Add_gfx12x_support.patch
@@ -0,0 +1,2280 @@
+diff --git a/CMakeLists.txt b/CMakeLists.txt
+index bc326c8b5..db5ad5052 100644
+--- a/CMakeLists.txt
++++ b/CMakeLists.txt
+@@ -117,7 +117,7 @@ else()
+ add_definitions(-DPROFILER_ONLY)
+ set(GPU_TARGETS "" CACHE STRING "" FORCE)
+ if(GPU_TARGETS)
+- message(FATAL_ERROR "For PROFILE_ONLY build, please do not set GPU_TARGETS, use GPU_ARCH = gfx90, gfx94, gfx10, or gfx11")
++ message(FATAL_ERROR "For PROFILE_ONLY build, please do not set GPU_TARGETS, use GPU_ARCH = gfx90, gfx94, gfx10, gfx11, or gfx12")
+ endif()
+ if(GPU_ARCH MATCHES "gfx90")
+ rocm_check_target_ids(DEFAULT_GPU_TARGETS TARGETS "gfx908;gfx90a")
+@@ -127,8 +127,10 @@ else()
+ rocm_check_target_ids(DEFAULT_GPU_TARGETS TARGETS "gfx1030")
+ elseif(GPU_ARCH MATCHES "gfx11")
+ rocm_check_target_ids(DEFAULT_GPU_TARGETS TARGETS "gfx1100;gfx1101;gfx1102")
++ elseif(GPU_ARCH MATCHES "gfx12")
++ rocm_check_target_ids(DEFAULT_GPU_TARGETS TARGETS "gfx1200;gfx1201")
+ else()
+- message(FATAL_ERROR "For PROFILE_ONLY build, please specify GPU_ARCH as gfx90, gfx94, gfx10, or gfx11")
++ message(FATAL_ERROR "For PROFILE_ONLY build, please specify GPU_ARCH as gfx90, gfx94, gfx10, gfx11, or gfx12")
+ endif()
+ set(GPU_TARGETS "${DEFAULT_GPU_TARGETS}" CACHE STRING " " FORCE)
+ endif()
+diff --git a/Jenkinsfile b/Jenkinsfile
+index 75800bfc9..b72e2ca4e 100644
+--- a/Jenkinsfile
++++ b/Jenkinsfile
+@@ -493,6 +493,7 @@ def Build_CK(Map conf=[:]){
+
+ def variant = env.STAGE_NAME
+ def retimage
++
+ gitStatusWrapper(credentialsId: "${env.status_wrapper_creds}", gitHubContext: "Jenkins - ${variant}", account: 'ROCm', repo: 'composable_kernel') {
+ try {
+ (retimage, image) = getDockerImage(conf)
+@@ -660,9 +661,6 @@ CRON_SETTINGS = BRANCH_NAME == "develop" ? '''0 23 * * * % RUN_FULL_QA=true;ROCM
+
+ pipeline {
+ agent none
+- triggers {
+- parameterizedCron(CRON_SETTINGS)
+- }
+ options {
+ parallelsAlwaysFailFast()
+ }
+diff --git a/cmake/EnableCompilerWarnings.cmake b/cmake/EnableCompilerWarnings.cmake
+index 8654170b3..42070051b 100644
+--- a/cmake/EnableCompilerWarnings.cmake
++++ b/cmake/EnableCompilerWarnings.cmake
+@@ -66,7 +66,7 @@ else()
+ -Wunreachable-code
+ -Wunused
+ -Wno-reserved-identifier
+- -Werror
++ -Werror
+ -Wno-option-ignored
+ -Wsign-compare
+ -Wno-extra-semi-stmt
+diff --git a/example/01_gemm/gemm_wmma_fp16.cpp b/example/01_gemm/gemm_wmma_fp16.cpp
+index 8c52e4f7d..f8afe8d6d 100644
+--- a/example/01_gemm/gemm_wmma_fp16.cpp
++++ b/example/01_gemm/gemm_wmma_fp16.cpp
+@@ -23,45 +23,45 @@ static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecializa
+
+ // clang-format off
+ using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemmWmma_CShuffle
+- < ALayout,
+- BLayout,
+- CLayout,
+- ADataType,
++ < ALayout,
++ BLayout,
++ CLayout,
++ ADataType,
+ BDataType,
+- CDataType,
+- AccDataType,
+- CShuffleDataType,
+- AElementOp,
+- BElementOp,
+- CElementOp,
+- GemmDefault,
++ CDataType,
++ AccDataType,
++ CShuffleDataType,
++ AElementOp,
++ BElementOp,
++ CElementOp,
++ GemmDefault,
+ 1, // Prefetch stage
+ 128, // BlockSize
+ 64, // MPerBlock
+ 128, // NPerBlock
+ 64, // KPerBlock
+- 8, // K1
++ 2, // K1
+ 16, // MPerWmma
+ 16, // NPerWmma
+ 2, // M-Repeat // M-PerWmma / M-Repeat = M-Wave
+ 4, // N-Repeat // N-PerWmma / N-Repeat = N-Wave
+- S<4, 32, 1>,
+- S<1, 0, 2>,
+- S<1, 0, 2>,
+- 2,
+- 8,
+- 8,
+- true,
+- S<4, 32, 1>,
+- S<1, 0, 2>,
+- S<1, 0, 2>,
+- 2,
+- 8,
+- 8,
+- true,
++ S<4, 32, 1>,
++ S<1, 0, 2>,
++ S<1, 0, 2>,
++ 2,
++ 2,
++ 2,
++ true,
++ S<4, 32, 1>,
++ S<1, 0, 2>,
++ S<1, 0, 2>,
++ 2,
++ 2,
++ 2,
++ true,
+ 1, // C shuffle (M Repeat) Per store
+ 1, // C shuffle (N Repeat) Per store
+- S<1, 32, 1, 4>,
++ S<1, 32, 1, 4>,
+ 8>;
+ // clang-format on
+
+diff --git a/example/01_gemm/run_gemm_example.inc b/example/01_gemm/run_gemm_example.inc
+index b04e4e53a..cb15186c3 100644
+--- a/example/01_gemm/run_gemm_example.inc
++++ b/example/01_gemm/run_gemm_example.inc
+@@ -159,7 +159,7 @@ bool run_gemm(const ProblemType& problem_size, const ExecutionConfig& config)
+ ck::utils::FillUniformDistributionIntegerValue{-5.f, 5.f}(b_k_n);
+ break;
+ case 4:
+- ck::utils::FillUniformDistributionIntegerValue{1.f, 1.f}(a_m_k);
++ ck::utils::FillUniformDistributionIntegerValue{-5.f, 5.f}(a_m_k);
+ ck::utils::FillUniformDistributionIntegerValue{1.f, 1.f}(b_k_n);
+ break;
+ case 5:
+diff --git a/example/04_gemm_add_add_fastgelu/CMakeLists.txt b/example/04_gemm_add_add_fastgelu/CMakeLists.txt
+index ab19f819e..be47665a2 100644
+--- a/example/04_gemm_add_add_fastgelu/CMakeLists.txt
++++ b/example/04_gemm_add_add_fastgelu/CMakeLists.txt
+@@ -24,4 +24,4 @@ foreach(gpu IN LISTS GPU_TARGETS)
+ add_example_dependencies(example_gemm_add_add_fastgelu_xdl example_gemm_add_add_fastgelu_xdl_lds_direct_load_fp32)
+ set(target 1)
+ endif()
+-endforeach()
+\ No newline at end of file
++endforeach()
+diff --git a/example/29_batched_gemm_bias_e_permute/batched_gemm_bias_e_permute_wmma_fp16.cpp b/example/29_batched_gemm_bias_e_permute/batched_gemm_bias_e_permute_wmma_fp16.cpp
+index 2bbf430c4..f556be887 100644
+--- a/example/29_batched_gemm_bias_e_permute/batched_gemm_bias_e_permute_wmma_fp16.cpp
++++ b/example/29_batched_gemm_bias_e_permute/batched_gemm_bias_e_permute_wmma_fp16.cpp
+@@ -83,14 +83,14 @@ using DeviceOpInstanceKKNN =
+ 2,
+ 4,
+ 4,
+- true,
++ false,
+ S<4, 32, 1>,
+ S<1, 0, 2>,
+ S<1, 0, 2>,
+ 2,
+ 4,
+ 4,
+- true,
++ false,
+ 1,
+ 1,
+ S<1, 64, 1, 2>,
+diff --git a/example/32_batched_gemm_scale_softmax_gemm/cross_attention_forward_wmma_fp16.cpp b/example/32_batched_gemm_scale_softmax_gemm/cross_attention_forward_wmma_fp16.cpp
+index 4c92c5497..fac19f8b5 100644
+--- a/example/32_batched_gemm_scale_softmax_gemm/cross_attention_forward_wmma_fp16.cpp
++++ b/example/32_batched_gemm_scale_softmax_gemm/cross_attention_forward_wmma_fp16.cpp
+@@ -71,7 +71,7 @@ static constexpr auto TensorSpecC = ck::tensor_operation::device::TensorSpecial
+ #define CK_MHA_USE_WAVE_1
+ #define CK_MHA_USE_WAVE_2
+ #define CK_MHA_USE_WAVE_4
+-#define CK_MHA_USE_WAVE_8
++//#define CK_MHA_USE_WAVE_8
+ using DeviceMHAFactory =
+ std::tuple<
+ #ifdef CK_MHA_USE_WAVE_1
+@@ -277,10 +277,10 @@ using DeviceMHAFactory =
+ S<2, 8, 8>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 1, false,
+ // CShuffleBlockTransfer MN
+ 1, 1, S<1, 64, 1, 2>, 8,
+- MaskingSpec>,
++ MaskingSpec>
+ #endif
+ #ifdef CK_MHA_USE_WAVE_8
+- ck::tensor_operation::device::DeviceBatchedGemmSoftmaxGemmPermute_Wmma_CShuffle<
++ ,ck::tensor_operation::device::DeviceBatchedGemmSoftmaxGemmPermute_Wmma_CShuffle<
+ NumDimG, NumDimM, NumDimN, NumDimK, NumDimO,
+ ADataType, B0DataType, B1DataType, CDataType, Acc0BiasDataType, Acc0DataType, Acc1BiasDataType, Acc1DataType, CShuffleDataType,
+ AElementOp, B0ElementOp, Acc0ElementOp, B1ElementOp, CElementOp,
+diff --git a/example/32_batched_gemm_scale_softmax_gemm/self_attention_forward_wmma_fp16.cpp b/example/32_batched_gemm_scale_softmax_gemm/self_attention_forward_wmma_fp16.cpp
+index 8e037272b..d463cc871 100644
+--- a/example/32_batched_gemm_scale_softmax_gemm/self_attention_forward_wmma_fp16.cpp
++++ b/example/32_batched_gemm_scale_softmax_gemm/self_attention_forward_wmma_fp16.cpp
+@@ -71,7 +71,7 @@ static constexpr auto TensorSpecC = ck::tensor_operation::device::TensorSpecial
+ #define CK_MHA_USE_WAVE_1
+ #define CK_MHA_USE_WAVE_2
+ #define CK_MHA_USE_WAVE_4
+-#define CK_MHA_USE_WAVE_8
++//#define CK_MHA_USE_WAVE_8
+ using DeviceMHAFactory =
+ std::tuple<
+ #ifdef CK_MHA_USE_WAVE_1
+@@ -277,10 +277,10 @@ using DeviceMHAFactory =
+ S<2, 8, 8>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 1, false,
+ // CShuffleBlockTransfer MN
+ 1, 1, S<1, 64, 1, 2>, 8,
+- MaskingSpec>,
++ MaskingSpec>
+ #endif
+ #ifdef CK_MHA_USE_WAVE_8
+- ck::tensor_operation::device::DeviceBatchedGemmSoftmaxGemmPermute_Wmma_CShuffle<
++ ,ck::tensor_operation::device::DeviceBatchedGemmSoftmaxGemmPermute_Wmma_CShuffle<
+ NumDimG, NumDimM, NumDimN, NumDimK, NumDimO,
+ ADataType, B0DataType, B1DataType, CDataType, Acc0BiasDataType, Acc0DataType, Acc1BiasDataType, Acc1DataType, CShuffleDataType,
+ AElementOp, B0ElementOp, Acc0ElementOp, B1ElementOp, CElementOp,
+diff --git a/example/CMakeLists.txt b/example/CMakeLists.txt
+index 5465adb77..7534bff3b 100644
+--- a/example/CMakeLists.txt
++++ b/example/CMakeLists.txt
+@@ -60,7 +60,7 @@ function(add_example_executable EXAMPLE_NAME FILE_NAME)
+ endforeach()
+ #Do not build any WMMA examples if gfx11 targets are not on the list
+ foreach(source IN LISTS FILE_NAME)
+- if(NOT GPU_TARGETS MATCHES "gfx11" AND source MATCHES "_wmma")
++ if(NOT GPU_TARGETS MATCHES "gfx11" AND NOT GPU_TARGETS MATCHES "gfx12" AND source MATCHES "_wmma")
+ message("removing wmma example ${source} ")
+ list(REMOVE_ITEM FILE_NAME "${source}")
+ endif()
+@@ -134,7 +134,7 @@ function(add_example_executable_no_testing EXAMPLE_NAME FILE_NAME)
+ endforeach()
+ #Do not build any WMMA examples if gfx11 targets are not on the list
+ foreach(source IN LISTS FILE_NAME)
+- if(NOT GPU_TARGETS MATCHES "gfx11" AND source MATCHES "_wmma")
++ if(NOT GPU_TARGETS MATCHES "gfx11" AND NOT GPU_TARGETS MATCHES "gfx12" AND source MATCHES "_wmma")
+ message("removing wmma example ${source} ")
+ list(REMOVE_ITEM FILE_NAME "${source}")
+ endif()
+diff --git a/include/ck/ck.hpp b/include/ck/ck.hpp
+index 55f562061..69a7abf62 100644
+--- a/include/ck/ck.hpp
++++ b/include/ck/ck.hpp
+@@ -69,6 +69,9 @@ CK_DECLARE_ENV_VAR_BOOL(CK_LOGGING)
+ #if defined(__gfx1100__) || defined(__gfx1101__) || defined(__gfx1102__) || defined(__gfx1103__)
+ #define __gfx11__
+ #endif
++#if defined(__gfx1200__) || defined(__gfx1201__)
++#define __gfx12__
++#endif
+
+ // buffer resource
+ #ifndef __HIP_DEVICE_COMPILE__ // for host code
+@@ -77,7 +80,7 @@ CK_DECLARE_ENV_VAR_BOOL(CK_LOGGING)
+ #define CK_BUFFER_RESOURCE_3RD_DWORD 0x00020000
+ #elif defined(__gfx103__)
+ #define CK_BUFFER_RESOURCE_3RD_DWORD 0x31014000
+-#elif defined(__gfx11__)
++#elif defined(__gfx11__) || defined(__gfx12__)
+ #define CK_BUFFER_RESOURCE_3RD_DWORD 0x31004000
+ #endif
+
+@@ -89,7 +92,7 @@ CK_DECLARE_ENV_VAR_BOOL(CK_LOGGING)
+ #define CK_USE_AMD_V_FMAC_F32
+ #define CK_USE_AMD_V_DOT2_F32_F16
+ #define CK_USE_AMD_V_DOT4_I32_I8
+-#elif defined(__gfx11__)
++#elif defined(__gfx11__) || defined(__gfx12__)
+ #define CK_USE_AMD_V_FMAC_F32
+ #define CK_USE_AMD_V_DOT2_F32_F16
+ #define CK_USE_AMD_V_DOT4_I32_I8_GFX11
+@@ -110,13 +113,6 @@ CK_DECLARE_ENV_VAR_BOOL(CK_LOGGING)
+ #define CK_USE_AMD_MFMA_GFX940
+ #endif
+
+-// WMMA instruction
+-#ifndef __HIP_DEVICE_COMPILE__ // for host code
+-#define CK_USE_AMD_WMMA
+-#elif defined(__gfx11__) // for GPU code
+-#define CK_USE_AMD_WMMA
+-#endif
+-
+ // buffer load
+ #define CK_USE_AMD_BUFFER_LOAD 1
+
+diff --git a/include/ck/host_utility/device_prop.hpp b/include/ck/host_utility/device_prop.hpp
+index 116bb3ea0..83af2efe8 100644
+--- a/include/ck/host_utility/device_prop.hpp
++++ b/include/ck/host_utility/device_prop.hpp
+@@ -84,4 +84,9 @@ inline bool is_gfx11_supported()
+ ck::get_device_name() == "gfx1102" || ck::get_device_name() == "gfx1103";
+ }
+
++inline bool is_gfx12_supported()
++{
++ return ck::get_device_name() == "gfx1200" || ck::get_device_name() == "gfx1201";
++}
++
+ } // namespace ck
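`is_gfx12_supported()` is the run-time counterpart of the `__gfx12__` compile-time macro added in `ck.hpp` above: the macro gates device code paths, the helper gates host-side kernel selection. A hypothetical caller — the helper below is mine, not CK's:

```cpp
#include <string>

// Declarations assumed to match CK's host utilities shown in the patch.
namespace ck {
std::string get_device_name();
bool is_gfx11_supported();
bool is_gfx12_supported();
} // namespace ck

// Hypothetical selection helper: WMMA device ops are valid on gfx11 and,
// with this patch, also on gfx12; everything else takes a fallback path.
bool use_wmma_path()
{
    return ck::is_gfx11_supported() || ck::is_gfx12_supported();
}
```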
+diff --git a/include/ck/tensor_operation/gpu/block/blockwise_gemm_wmma.hpp b/include/ck/tensor_operation/gpu/block/blockwise_gemm_wmma.hpp
+index f8ee283c6..7eb7d42eb 100644
+--- a/include/ck/tensor_operation/gpu/block/blockwise_gemm_wmma.hpp
++++ b/include/ck/tensor_operation/gpu/block/blockwise_gemm_wmma.hpp
+@@ -13,6 +13,504 @@
+
+ namespace ck {
+
++#ifdef __gfx12__
++template <index_t BlockSize,
++          typename FloatA,
++          typename FloatB,
++          typename FloatAcc,
++          typename ABlockDesc,
++          typename BBlockDesc,
++          index_t MPerBlock,
++          index_t NPerBlock,
++          index_t KPerBlock,
++          index_t MPerWMMA,
++          index_t NPerWMMA,
++          index_t MRepeat,
++          index_t NRepeat,
++          index_t KPack,
++          bool AEnableLds = true,
++          bool BEnableLds = true,
++          bool TransposeC = false>
++/* Option: Read from LDS, the big buffer holds all threads' required data
++ * Source
++ * A: K0PerBlock x MPerBlock x K1
++ * B: K0PerBlock x NPerBlock x K1
++ * Destination
++ * C, non-transpose
++ * thread level: MRepeat x NRepeat x MAccVgprs
++ * block level: MRepeat x MWave x MSubGroup x NRepeat x NWave x NThreadPerSubGroup x MAccVgprs
++ * KPACK == WMMA_K = 16
++ *
++ * Option: Read from VMEM, a small buffer holds each thread's own required data (skip LDS)
++ * Source:
++ * A(if skip LDS): MRepeat x KPack
++ * B(if skip LDS): NRepeat x KPack
++ * Destination
++ * C, non-transpose
++ * block level: MRepeat x MWave x MSubGroup x NRepeat x NWave x NThreadPerSubGroup x MAccVgprs
++ */
++struct BlockwiseGemmWMMA
++{
++ static constexpr auto I0 = Number<0>{};
++ static constexpr auto I1 = Number<1>{};
++ static constexpr auto I2 = Number<2>{};
++ static constexpr auto I3 = Number<3>{};
++ static constexpr auto I4 = Number<4>{};
++ static constexpr auto I5 = Number<5>{};
++ static constexpr auto WmmaK = Number<16>{};
++
++ using ThisThreadBlock = ThisThreadBlock<BlockSize>;
++
++ // Hard-code WaveSize, since the current HIP runtime (5.4.0-10984) does not return the correct value.
++ static constexpr index_t WaveSize = 32;
++
++ // When LDS is used, each row (16 consecutive lanes) reads all of its required data from the
++ // source buffer. When LDS is skipped, each row reads half of the data and the two halves are
++ // exchanged via permutation.
++ static constexpr index_t A_KRow = 2;
++ static constexpr index_t B_KRow = 2;
++
++ static constexpr index_t A_K1 = ABlockDesc{}.GetLength(I5);
++ static constexpr index_t B_K1 = BBlockDesc{}.GetLength(I5);
++
++ static constexpr auto wmma_gemm =
++ WmmaGemm{};
++
++ static constexpr index_t MWaves = MPerBlock / (MRepeat * MPerWMMA);
++ static constexpr index_t NWaves = NPerBlock / (NRepeat * NPerWMMA);
++
++ StaticBufferTupleOfVector
++ c_thread_buf_;
++
++ __host__ __device__ constexpr auto& GetCThreadBuffer() { return c_thread_buf_; }
++
++ __device__ static auto GetWaveIdx()
++ {
++ const index_t thread_id = ThisThreadBlock::GetThreadId();
++
++ constexpr auto threadid_to_wave_idx_adaptor = make_single_stage_tensor_adaptor(
++ make_tuple(make_merge_transform(make_tuple(MWaves, NWaves, WaveSize))),
++ make_tuple(Sequence<0, 1, 2>{}),
++ make_tuple(Sequence<0>{}));
++
++ return threadid_to_wave_idx_adaptor.CalculateBottomIndex(make_multi_index(thread_id));
++ }
++
++ // Default, Block buffer in LDS, thread level offset enabled
++ __device__ static auto CalculateAThreadOriginDataIndex()
++ {
++ if constexpr(AEnableLds)
++ {
++ const auto wave_idx = GetWaveIdx();
++ const auto waveId_m = wave_idx[I0];
++ const auto WMMA_a_idx = wmma_gemm.CalculateAThreadOriginDataIndex();
++
++ // |KRepeat |MRepeat|MWave |KRow |MLane |KPack
++ return make_tuple(0, 0, waveId_m, wmma_gemm.GetSubGroupId(), WMMA_a_idx, 0);
++ }
++ else
++ {
++ return make_tuple(0, 0, 0, 0, 0, 0);
++ }
++ }
++
++ __device__ static auto CalculateBThreadOriginDataIndex()
++ {
++ if constexpr(BEnableLds)
++ {
++ const auto wave_idx = GetWaveIdx();
++ const auto waveId_n = wave_idx[I1];
++ const auto WMMA_b_idx = wmma_gemm.CalculateBThreadOriginDataIndex();
++
++ // |KRepeat |NRepeat|Nwave |KRow |NLane |KPack
++ return make_tuple(0, 0, waveId_n, wmma_gemm.GetSubGroupId(), WMMA_b_idx, 0);
++ }
++ else
++ {
++ return make_tuple(0, 0, 0, 0, 0, 0);
++ }
++ }
++
++ template
++ __device__ static auto CalculateCThreadOriginDataIndex(Number, Number)
++ {
++ const auto wave_idx = GetWaveIdx();
++
++ const auto waveId_m = wave_idx[I0];
++ const auto waveId_n = wave_idx[I1];
++
++ const auto blk_idx = wmma_gemm.GetBeginOfThreadBlk();
++
++ constexpr auto mrepeat_mwave_mperWMMA_to_m_adaptor = make_single_stage_tensor_adaptor(
++ make_tuple(make_unmerge_transform(make_tuple(MRepeat, MWaves, MPerWMMA))),
++ make_tuple(Sequence<0>{}),
++ make_tuple(Sequence<0, 1, 2>{}));
++
++ constexpr auto nrepeat_nwave_nperWMMA_to_n_adaptor = make_single_stage_tensor_adaptor(
++ make_tuple(make_unmerge_transform(make_tuple(NRepeat, NWaves, NPerWMMA))),
++ make_tuple(Sequence<0>{}),
++ make_tuple(Sequence<0, 1, 2>{}));
++
++ const index_t c_thread_m = mrepeat_mwave_mperWMMA_to_m_adaptor.CalculateBottomIndex(
++ make_tuple(m0, waveId_m, blk_idx[I0]))[I0];
++ const index_t c_thread_n = nrepeat_nwave_nperWMMA_to_n_adaptor.CalculateBottomIndex(
++ make_tuple(n0, waveId_n, blk_idx[I1]))[I0];
++
++ return make_tuple(c_thread_m, c_thread_n);
++ }
++
++ template
++ __device__ static auto CalculateCThreadOriginDataIndex7D(Number, Number)
++ {
++ const auto wave_idx = GetWaveIdx();
++
++ const auto waveId_m = wave_idx[I0];
++ const auto waveId_n = wave_idx[I1];
++
++ const auto blk_idx = wmma_gemm.GetBeginOfThreadBlk3D();
++
++ return make_tuple(
++ Number{}, waveId_m, blk_idx[I0], Number{}, waveId_n, blk_idx[I1], blk_idx[I2]);
++ }
++
++ using Tuple6 = decltype(CalculateAThreadOriginDataIndex());
++ __host__ __device__ BlockwiseGemmWMMA(Tuple6 a_origin = CalculateAThreadOriginDataIndex(),
++ Tuple6 b_origin = CalculateBThreadOriginDataIndex())
++ : a_thread_copy_(a_origin), b_thread_copy_(b_origin)
++ {
++ static_assert(ABlockDesc::IsKnownAtCompileTime() && BBlockDesc::IsKnownAtCompileTime(),
++ "wrong! Desc should be known at compile-time");
++
++ static_assert(ThisThreadBlock::GetNumOfThread() == MWaves * NWaves * WaveSize,
++ "ThisThreadBlock::GetNumOfThread() != MWaves * NWaves * WaveSize\n");
++
++ static_assert(MPerBlock % (MPerWMMA * MRepeat) == 0 &&
++ NPerBlock % (NPerWMMA * NRepeat) == 0,
++ "wrong!");
++ }
++
++ // transposed WMMA output C' = B' * A'
++ __host__ __device__ static constexpr auto
++ GetCThreadDescriptor_MRepeat_MWave_MThreadPerSubGroup_NRepeat_NWave_NSubGroup_NAccVgprs()
++ {
++ constexpr auto c_msubgroup_nthreadpersubgroup_maccvgprs_tblk_lens =
++ wmma_gemm.GetCMSubGroupNThreadPerSubGroupMAccVgprsThreadBlkLengths();
++
++ constexpr auto NAccVgprs = c_msubgroup_nthreadpersubgroup_maccvgprs_tblk_lens[I2];
++
++ return make_naive_tensor_descriptor_packed(
++ // |MRepeat |MWave |MSubGroup |NRepeat |NWave
++ // |NThreadPerSubGroup |MAccVgprs
++ make_tuple(Number{}, I1, I1, Number{}, I1, I1, NAccVgprs));
++ }
++
++ // Thread-level register descriptor. Vector-write.
++ __host__ __device__ static constexpr auto
++ GetCThreadDescriptor_MRepeat_MWave_MSubGroup_NRepeat_NWave_NThreadPerSubGroup_MAccVgprs()
++ {
++ constexpr auto c_msubgroup_nthreadpersubgroup_maccvgprs_tblk_lens =
++ wmma_gemm.GetCMSubGroupNThreadPerSubGroupMAccVgprsThreadBlkLengths();
++
++ constexpr auto MAccVgprs = c_msubgroup_nthreadpersubgroup_maccvgprs_tblk_lens[I2];
++ constexpr auto AccStride = c_msubgroup_nthreadpersubgroup_maccvgprs_tblk_lens[I3];
++ return make_naive_tensor_descriptor(
++ // |MRepeat |MWave |MSubGroup |NRepeat |NWave
++ // |NThreadPerSubGroup |MAccVgprs
++ make_tuple(Number{}, I1, I1, Number{}, I1, I1, MAccVgprs),
++ make_tuple(Number{} * MAccVgprs * AccStride,
++ Number{} * MAccVgprs * AccStride,
++ Number{} * MAccVgprs * AccStride,
++ MAccVgprs * AccStride,
++ MAccVgprs * AccStride,
++ MAccVgprs * AccStride,
++ AccStride));
++ }
++
++ template
++ __host__ __device__ static constexpr auto
++ MakeCGridDescriptor_MBlockxRepeat_MWave_MSubGroup_NBlockxRepeat_NWave_NThreadPerSubGroup_MAccVgprs(
++ const CGridDesc_M_N& c_grid_desc_m_n)
++ {
++ const auto M = c_grid_desc_m_n.GetLength(I0);
++ const auto N = c_grid_desc_m_n.GetLength(I1);
++
++ const auto c_grid_desc_mblockxrepeat_mwave_mperwmma_nblockxrepeat_nwave_nperwmma =
++ transform_tensor_descriptor(
++ c_grid_desc_m_n,
++ make_tuple(
++ make_unmerge_transform(make_tuple(M / (MWaves * MPerWMMA), MWaves, MPerWMMA)),
++ make_unmerge_transform(make_tuple(N / (NWaves * NPerWMMA), NWaves, NPerWMMA))),
++ make_tuple(Sequence<0>{}, Sequence<1>{}),
++ make_tuple(Sequence<0, 1, 2>{}, Sequence<3, 4, 5>{}));
++
++ return wmma_gemm
++ .MakeCDesc_MBlockxRepeat_MWave_MSubGroup_NBlockxRepeat_NWave_NThreadPerSubGroup_MAccVgprs(
++ c_grid_desc_mblockxrepeat_mwave_mperwmma_nblockxrepeat_nwave_nperwmma);
++ }
++
++ // transposed WMMA output C' = B' * A'
++ __host__ __device__ static constexpr auto
++ GetCBlockDescriptor_MRepeat_MWave_MThreadPerSubGroup_NRepeat_NWave_NSubGroup_NAccVgprs()
++ {
++ constexpr auto c_block_desc_mrepeat_mwave_mperwmma_nrepeat_nwave_nperwmma =
++ make_naive_tensor_descriptor_packed(make_tuple(Number{},
++ Number{},
++ Number{},
++ Number{},
++ Number{},
++ Number{}));
++
++ return wmma_gemm
++ .MakeCDesc_MBlockxRepeat_MWave_MThreadPerSubGroup_NBlockxRepeat_NWave_NSubGroup_NAccVgprs(
++ c_block_desc_mrepeat_mwave_mperwmma_nrepeat_nwave_nperwmma);
++ }
++
++ // Provide dimension size
++ __host__ __device__ static constexpr auto
++ GetCBlockDescriptor_MRepeat_MWave_MSubGroup_NRepeat_NWave_NThreadPerSubGroup_MAccVgprs()
++ {
++ constexpr auto c_block_desc_mrepeat_mwave_mperwmma_nrepeat_nwave_nperwmma =
++ make_naive_tensor_descriptor_packed(make_tuple(Number{},
++ Number{},
++ Number{},
++ Number{},
++ Number{},
++ Number{}));
++
++ return wmma_gemm
++ .MakeCDesc_MBlockxRepeat_MWave_MSubGroup_NBlockxRepeat_NWave_NThreadPerSubGroup_MAccVgprs(
++ c_block_desc_mrepeat_mwave_mperwmma_nrepeat_nwave_nperwmma);
++ }
++
++ // Describes how data is allocated in the thread-copy src buffer
++ // M0_M1_M2 = MRepeat_MWave_MPerWmma, N0_N1_N2 = NRepeat_NWave_NPerWmma
++ static constexpr ABlockDesc a_block_desc_k0_m0_m1_m2_k1;
++ static constexpr BBlockDesc b_block_desc_k0_n0_n1_n2_k1;
++
++ template
++ __device__ void Run(const ABlockBuffer& a_block_buf,
++ const BBlockBuffer& b_block_buf,
++ CThreadBuffer& c_thread_buf) const
++ {
++ auto a_thread_buf = make_static_buffer(
++ a_thread_desc_.GetElementSpaceSize());
++ auto b_thread_buf = make_static_buffer(
++ b_thread_desc_.GetElementSpaceSize());
++
++ static_assert(KPack % (A_K1 * A_KRow) == 0, "");
++ static_assert(KPack % (B_K1 * B_KRow) == 0, "");
++
++ // basic heuristic to determine the loop-over direction
++ if constexpr(MRepeat < NRepeat)
++ {
++ static_for<0, KPerBlock / KPack, 1>{}(
++ [&](auto k) { // k=0,1,2 instead of k=0,kpack*1, ...
++ static_for<0, MRepeat, 1>{}([&](auto m0) {
++ // read A
++ a_thread_copy_.Run(
++ a_block_desc_k0_m0_m1_m2_k1,
++ make_tuple(Number{}, m0, I0, I0, I0, I0),
++ a_block_buf,
++ a_thread_desc_,
++ make_tuple(I0, m0, I0, I0, I0, I0),
++ a_thread_buf);
++
++ static_for<0, NRepeat, 1>{}([&](auto n0) {
++ // read B
++ b_thread_copy_.Run(
++ b_block_desc_k0_n0_n1_n2_k1,
++ make_tuple(Number{}, n0, I0, I0, I0, I0),
++ b_block_buf,
++ b_thread_desc_,
++ make_tuple(I0, n0, I0, I0, I0, I0),
++ b_thread_buf);
++
++ vector_type a_thread_vec;
++ vector_type b_thread_vec;
++
++ static_for<0, KPack / A_KRow, 1>{}([&](auto i) {
++ a_thread_vec.template AsType()(i) =
++ a_thread_buf[Number{}];
++ });
++
++ static_for<0, KPack / B_KRow, 1>{}([&](auto i) {
++ b_thread_vec.template AsType()(i) =
++ b_thread_buf[Number{}];
++ });
++
++ using wmma_input_type_a =
++ typename vector_type::type;
++ using wmma_input_type_b =
++ typename vector_type::type;
++
++ constexpr index_t c_offset =
++ c_thread_desc_.CalculateOffset(make_tuple(m0, n0, 0));
++
++ wmma_gemm.template Run(
++ a_thread_vec.template AsType(),
++ b_thread_vec.template AsType(),
++ c_thread_buf.GetVectorTypeReference(Number{}));
++ });
++ });
++ });
++ }
++ else
++ {
++ static_for<0, NRepeat, 1>{}([&](auto n0) {
++ static_for<0, MRepeat, 1>{}([&](auto m0) {
++ static_for<0, KPerBlock / KPack, 1>{}([&](auto k) { // k=0,1,2 instead of
++ // k=0,kpack*1, ..
++ // read B
++ b_thread_copy_.Run(
++ b_block_desc_k0_n0_n1_n2_k1,
++ make_tuple(Number{}, n0, I0, I0, I0, I0),
++ b_block_buf,
++ b_thread_desc_,
++ make_tuple(I0, n0, I0, I0, I0, I0),
++ b_thread_buf);
++ // read A
++ a_thread_copy_.Run(
++ a_block_desc_k0_m0_m1_m2_k1,
++ make_tuple(Number{}, m0, I0, I0, I0, I0),
++ a_block_buf,
++ a_thread_desc_,
++ make_tuple(I0, m0, I0, I0, I0, I0),
++ a_thread_buf);
++
++ vector_type a_thread_vec;
++ vector_type b_thread_vec;
++
++ static_for<0, KPack / A_KRow, 1>{}([&](auto i) {
++ a_thread_vec.template AsType()(i) =
++ a_thread_buf[Number{}];
++ });
++
++ static_for<0, KPack / B_KRow, 1>{}([&](auto i) {
++ b_thread_vec.template AsType()(i) =
++ b_thread_buf[Number{}];
++ });
++
++ using wmma_input_type_a =
++ typename vector_type::type;
++ using wmma_input_type_b =
++ typename vector_type::type;
++
++ constexpr index_t c_offset =
++ c_thread_desc_.CalculateOffset(make_tuple(m0, n0, 0));
++
++ wmma_gemm.template Run(
++ a_thread_vec.template AsType(),
++ b_thread_vec.template AsType(),
++ c_thread_buf.GetVectorTypeReference(Number{}));
++ });
++ });
++ });
++ }
++ }
++
++ protected:
++ static constexpr auto a_thread_desc_ = make_naive_tensor_descriptor(
++ make_tuple(Number{}, Number{}, I1, I1, I1, Number{}),
++ make_tuple(Number{},
++ Number{},
++ Number{},
++ Number{},
++ Number{},
++ Number<1>{}));
++
++ static constexpr auto b_thread_desc_ = make_naive_tensor_descriptor(
++ make_tuple(Number{}, Number{}, I1, I1, I1, Number{}),
++ make_tuple(Number{},
++ Number{},
++ Number{},
++ Number{},
++ Number{},
++ Number<1>{}));
++
++ // C[M, N, NumRegWMMA]
++ static constexpr auto c_thread_desc_ = make_naive_tensor_descriptor_packed(
++ make_tuple(Number{}, Number{}, wmma_gemm.GetRegSizePerWmma()));
++
++ template
++ struct AThreadCopySelector;
++
++ template <>
++ struct AThreadCopySelector
++ {
++ using type =
++ ThreadwiseTensorSliceTransfer_v4,
++ Sequence<0, 1, 2, 3, 4, 5>,
++ 5,
++ A_K1,
++ A_K1>;
++ };
++
++ template <>
++ struct AThreadCopySelector
++ {
++ using type = ThreadwiseTensorSliceTransfer_StaticToStatic_IntraRow<
++ FloatA,
++ FloatA,
++ decltype(a_block_desc_k0_m0_m1_m2_k1),
++ decltype(a_thread_desc_),
++ tensor_operation::element_wise::PassThrough,
++ Sequence,
++ Sequence<0, 1, 2, 3, 4, 5>,
++ 5,
++ A_K1,
++ false>;
++ };
++
++ template
++ struct BThreadCopySelector;
++
++ template <>
++ struct BThreadCopySelector
++ {
++ using type =
++ ThreadwiseTensorSliceTransfer_v4,
++ Sequence<0, 1, 2, 3, 4, 5>,
++ 5,
++ B_K1,
++ B_K1>;
++ };
++
++ template <>
++ struct BThreadCopySelector
++ {
++ using type = ThreadwiseTensorSliceTransfer_StaticToStatic_IntraRow<
++ FloatB,
++ FloatB,
++ decltype(b_block_desc_k0_n0_n1_n2_k1),
++ decltype(b_thread_desc_),
++ tensor_operation::element_wise::PassThrough,
++ Sequence,
++ Sequence<0, 1, 2, 3, 4, 5>,
++ 5,
++ B_K1,
++ false>;
++ };
++
++ typename AThreadCopySelector::type a_thread_copy_;
++ typename BThreadCopySelector::type b_thread_copy_;
++};
++#else
+ template ::type a_thread_copy_;
+ typename BThreadCopySelector::type b_thread_copy_;
+ };
++#endif
+
+ } // namespace ck
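The key numeric change in the gfx12 path above is `A_KRow = B_KRow = 2`: each wave32 splits into two 16-lane rows, and a WMMA instruction's K = 16 slice is spread across both rows. A quick constexpr sanity check of the decomposition the descriptors in this patch rely on (`K1` here is a hypothetical instance value):

```cpp
// Checks the K decomposition WmmaK == K0PerWmma * KRow * K1 used by the
// gridwise descriptors below (K0PerWmma = WmmaK / KRow / K1).
int main() {
    constexpr int WmmaK = 16; // K extent of one WMMA instruction
    constexpr int KRow  = 2;  // gfx12: K split across the two rows of a wave32
    constexpr int K1    = 2;  // per-thread vector width (hypothetical instance value)
    constexpr int K0PerWmma = WmmaK / KRow / K1;
    static_assert(K0PerWmma * KRow * K1 == WmmaK, "decomposition must be exact");
    return 0;
}
```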
+diff --git a/include/ck/tensor_operation/gpu/block/blockwise_gemm_xdlops.hpp b/include/ck/tensor_operation/gpu/block/blockwise_gemm_xdlops.hpp
+index e5e6245cb..1f7d50429 100644
+--- a/include/ck/tensor_operation/gpu/block/blockwise_gemm_xdlops.hpp
++++ b/include/ck/tensor_operation/gpu/block/blockwise_gemm_xdlops.hpp
+@@ -488,7 +488,14 @@ struct BlockwiseGemmXdlopsInterwave_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1
+ // sync point.
+ if constexpr(k.value != 0 || KPerInnerLoop == KPerThread)
+ {
++#ifdef __gfx12__
++ asm volatile("\
++ s_barrier_signal -1 \n \
++ s_barrier_wait -1 \
++ " ::);
++#else
+ asm volatile("s_barrier" ::);
++#endif
+ __builtin_amdgcn_sched_barrier(0);
+ }
+ static_for<0, KPerInnerLoop, KPack>{}([&](auto k_) {
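The barrier change reflects gfx12's split workgroup barrier: the single `s_barrier` instruction is replaced by a signal/wait pair, with `-1` naming the default workgroup barrier. The same gating, pulled out as a standalone helper for clarity — a sketch, not CK code:

```cpp
// HIP device-side sketch of the split-barrier gating used above.
__device__ inline void block_sync()
{
#ifdef __gfx12__
    // gfx12 splits the barrier into an explicit signal and wait;
    // -1 selects the default workgroup barrier.
    asm volatile("s_barrier_signal -1\n\t"
                 "s_barrier_wait -1" ::);
#else
    asm volatile("s_barrier" ::);
#endif
}
```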
+diff --git a/include/ck/tensor_operation/gpu/device/impl/device_batched_contraction_multiple_d_wmma_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/impl/device_batched_contraction_multiple_d_wmma_cshuffle.hpp
+index a15759559..ab3f3856a 100644
+--- a/include/ck/tensor_operation/gpu/device/impl/device_batched_contraction_multiple_d_wmma_cshuffle.hpp
++++ b/include/ck/tensor_operation/gpu/device/impl/device_batched_contraction_multiple_d_wmma_cshuffle.hpp
+@@ -133,8 +133,13 @@ struct DeviceBatchedContractionMultipleD_Wmma_CShuffle
+ static constexpr auto NWaves = NPerBlock / (NRepeat * NPerWmma);
+ static constexpr auto WmmaK = K1 == 16 ? 32 : 16;
+
+- static constexpr auto AEnableLds_auto = NWaves == 1 ? false : true;
+- static constexpr auto BEnableLds_auto = MWaves == 1 ? false : true;
++ static constexpr auto MaxVectorLoadA = K1 * sizeof(ADataType) == 16 ? true : false;
++ static constexpr auto MaxVectorLoadB = K1 * sizeof(BDataType) == 16 ? true : false;
++
++ static constexpr auto AEnableLds_auto =
++ (NWaves == 1 && (MaxVectorLoadA || MRepeat == 1)) ? false : true;
++ static constexpr auto BEnableLds_auto =
++ (MWaves == 1 && (MaxVectorLoadB || NRepeat == 1)) ? false : true;
+
+ // If true, LDS is used unconditionally
+ static constexpr auto AEnableLds_manu = false;
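The revised heuristic skips LDS staging for A only when a single wave spans the N dimension and the global load is already maximal (a full 16-byte vector) or there is a single M repeat; otherwise LDS stays on. A constexpr restatement with hypothetical instance numbers:

```cpp
// Mirrors AEnableLds_auto with made-up instance parameters (fp16, K1 = 8).
int main() {
    constexpr int  NWaves     = 1;
    constexpr int  MRepeat    = 2;
    constexpr int  K1         = 8;
    constexpr int  ADataBytes = 2; // sizeof(fp16)
    constexpr bool MaxVectorLoadA = (K1 * ADataBytes == 16);
    constexpr bool AEnableLds =
        (NWaves == 1 && (MaxVectorLoadA || MRepeat == 1)) ? false : true;
    static_assert(!AEnableLds, "16-byte loads with a single N wave bypass LDS");
    return 0;
}
```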
+@@ -829,7 +834,7 @@ struct DeviceBatchedContractionMultipleD_Wmma_CShuffle
+
+ static bool IsSupportedArgument(const Argument& arg)
+ {
+- if(ck::is_gfx11_supported())
++ if(ck::is_gfx11_supported() || ck::is_gfx12_supported())
+ {
+ if constexpr(!(is_same_v || is_same_v))
+ {
+@@ -869,11 +874,15 @@ struct DeviceBatchedContractionMultipleD_Wmma_CShuffle
+ }
+ else
+ {
+- if(!(arg.a_kz_stride_ == 1 &&
+- arg.a_grid_desc_.GetLength(I2) % ABlockTransferSrcScalarPerVector == 0))
++ if(!(arg.a_kz_stride_ == 1))
+ {
+- printf("DeviceOp: Vector Access A-k check failure\n");
+- return false;
++ index_t LastK =
++ AEnableLds ? arg.a_grid_desc_.GetLength(I2) : arg.a_grid_desc_.GetLength(I6);
++ if(LastK % ABlockTransferSrcScalarPerVector == 0)
++ {
++ printf("DeviceOp: Vector Access A-k check failure\n");
++ return false;
++ }
+ }
+ }
+
+diff --git a/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_multiple_d_dl.hpp b/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_multiple_d_dl.hpp
+index 8fd14afc0..1b487502f 100644
+--- a/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_multiple_d_dl.hpp
++++ b/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_multiple_d_dl.hpp
+@@ -70,8 +70,9 @@ __global__ void
+ const ComputePtrOffsetOfBatch compute_ptr_offset_of_batch,
+ const Block2CTileMap block_2_ctile_map)
+ {
+-#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx906__) || defined(__gfx908__) || \
+- defined(__gfx90a__) || defined(__gfx94__) || defined(__gfx103__) || defined(__gfx11__))
++#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx906__) || defined(__gfx908__) || \
++ defined(__gfx90a__) || defined(__gfx94__) || defined(__gfx103__) || defined(__gfx11__) || \
++ defined(__gfx12__))
+
+ const index_t num_blocks_per_batch =
+ __builtin_amdgcn_readfirstlane(get_grid_size() / batch_count);
+@@ -648,7 +649,7 @@ struct DeviceBatchedGemmMultipleD_Dl : public DeviceBatchedGemmMultiD || is_same_v))
+ {
+diff --git a/include/ck/tensor_operation/gpu/device/impl/device_contraction_multiple_d_xdl_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/impl/device_contraction_multiple_d_xdl_cshuffle.hpp
+index 9d5b74be6..017d28641 100644
+--- a/include/ck/tensor_operation/gpu/device/impl/device_contraction_multiple_d_xdl_cshuffle.hpp
++++ b/include/ck/tensor_operation/gpu/device/impl/device_contraction_multiple_d_xdl_cshuffle.hpp
+@@ -601,9 +601,7 @@ struct DeviceContractionMultipleD_Xdl_CShuffle
+ return false;
+ }
+
+- if(ck::get_device_name() != "gfx90a" && ck::get_device_name() != "gfx940" &&
+- ck::get_device_name() != "gfx941" && ck::get_device_name() != "gfx942" &&
+- std::is_same::value)
++ if(!ck::is_lds_direct_load_supported() && std::is_same::value)
+ {
+ return false;
+ }
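This hunk swaps repeated device-name comparisons for a single capability helper. A sketch of the helper's likely shape, mirroring exactly the device list the removed check used (the real CK implementation may cover more targets):

```cpp
// Sketch of the refactor's intent: one capability helper instead of
// string comparisons repeated at every call site.
#include <string>

namespace ck {
std::string get_device_name(); // assumed, as in the patch

inline bool is_lds_direct_load_supported()
{
    const std::string n = get_device_name();
    return n == "gfx90a" || n == "gfx940" || n == "gfx941" || n == "gfx942";
}
} // namespace ck
```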
+diff --git a/include/ck/tensor_operation/gpu/device/impl/device_convnd_bwd_data_nwc_kxc_nwk_dl.hpp b/include/ck/tensor_operation/gpu/device/impl/device_convnd_bwd_data_nwc_kxc_nwk_dl.hpp
+index b84e18130..1edae33be 100644
+--- a/include/ck/tensor_operation/gpu/device/impl/device_convnd_bwd_data_nwc_kxc_nwk_dl.hpp
++++ b/include/ck/tensor_operation/gpu/device/impl/device_convnd_bwd_data_nwc_kxc_nwk_dl.hpp
+@@ -1393,7 +1393,7 @@ struct DeviceConvNdBwdDataNwcKxcNwk_Dl
+ {
+ // check device
+ if(!(ck::get_device_name() == "gfx906" || ck::is_gfx103_supported() ||
+- ck::is_gfx11_supported()))
++ ck::is_gfx11_supported() || ck::is_gfx12_supported()))
+ {
+ return false;
+ }
+diff --git a/include/ck/tensor_operation/gpu/device/impl/device_fpAintB_gemm_wmma.hpp b/include/ck/tensor_operation/gpu/device/impl/device_fpAintB_gemm_wmma.hpp
+index bf96324d0..553143e28 100644
+--- a/include/ck/tensor_operation/gpu/device/impl/device_fpAintB_gemm_wmma.hpp
++++ b/include/ck/tensor_operation/gpu/device/impl/device_fpAintB_gemm_wmma.hpp
+@@ -509,7 +509,7 @@ struct DeviceFpAintBGemm_Wmma_CShuffle : public DeviceGemm_dequantB || is_same_v ||
+ is_same_v))
+diff --git a/include/ck/tensor_operation/gpu/device/impl/device_gemm_dl.hpp b/include/ck/tensor_operation/gpu/device/impl/device_gemm_dl.hpp
+index b1784b385..eb0fb55f5 100644
+--- a/include/ck/tensor_operation/gpu/device/impl/device_gemm_dl.hpp
++++ b/include/ck/tensor_operation/gpu/device/impl/device_gemm_dl.hpp
+@@ -536,7 +536,7 @@ struct DeviceGemmDl : public DeviceGemm || is_same_v))
+ {
+diff --git a/include/ck/tensor_operation/gpu/device/impl/device_gemm_wmma.hpp b/include/ck/tensor_operation/gpu/device/impl/device_gemm_wmma.hpp
+index 93ab8a7e1..a7cc546f5 100644
+--- a/include/ck/tensor_operation/gpu/device/impl/device_gemm_wmma.hpp
++++ b/include/ck/tensor_operation/gpu/device/impl/device_gemm_wmma.hpp
+@@ -84,14 +84,21 @@ struct DeviceGemmWmma_CShuffle : public DeviceGemm{};
+
+- static constexpr auto MWaves = MPerBlock / (MRepeat * MPerWmma);
+- static constexpr auto NWaves = NPerBlock / (NRepeat * NPerWmma);
+- static constexpr auto WmmaK = K1 == 16 ? 32 : 16;
+-
+- static constexpr auto AEnableLds_auto =
+- (NWaves == 1 && is_same::value) ? false : true;
++ static constexpr auto MWaves = MPerBlock / (MRepeat * MPerWmma);
++ static constexpr auto NWaves = NPerBlock / (NRepeat * NPerWmma);
++ static constexpr auto WmmaK = K1 == 16 ? 32 : 16;
++ static constexpr auto MaxVectorLoadA = K1 * sizeof(ADataType) == 16 ? true : false;
++ static constexpr auto MaxVectorLoadB = K1 * sizeof(BDataType) == 16 ? true : false;
++
++ static constexpr auto AEnableLds_auto = (NWaves == 1 && (MaxVectorLoadA || MRepeat == 1) &&
++ is_same::value)
++ ? false
++ : true;
+ static constexpr auto BEnableLds_auto =
+- (MWaves == 1 && is_same::value) ? false : true;
++ (MWaves == 1 && (MaxVectorLoadB || NRepeat == 1) &&
++ is_same::value)
++ ? false
++ : true;
+
+ // If true, LDS is used unconditionally
+ static constexpr auto AEnableLds_manu = false;
+@@ -443,7 +450,7 @@ struct DeviceGemmWmma_CShuffle : public DeviceGemm || is_same_v ||
+ is_same_v))
+diff --git a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_data_multiple_d_wmma_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_data_multiple_d_wmma_cshuffle.hpp
+index 6f74838fb..6bb5d431c 100644
+--- a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_data_multiple_d_wmma_cshuffle.hpp
++++ b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_data_multiple_d_wmma_cshuffle.hpp
+@@ -629,7 +629,7 @@ struct DeviceGroupedConvBwdDataMultipleD_Wmma_CShuffle
+ static bool IsSupportedArgument(const Argument& arg)
+ {
+ // check device
+- if(ck::is_gfx11_supported())
++ if(ck::is_gfx11_supported() || ck::is_gfx12_supported())
+ {
+ if constexpr(!(is_same_v || is_same_v))
+ {
+diff --git a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_dl.hpp b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_dl.hpp
+index bd264a3c8..7047e1bda 100644
+--- a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_dl.hpp
++++ b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_dl.hpp
+@@ -48,8 +48,9 @@ __global__ void
+ const Block2CTileMap block_2_ctile_map,
+ const ComputePtrOffsetOfBatch compute_ptr_offset_of_batch)
+ {
+-#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx906__) || defined(__gfx103__) || \
+- defined(__gfx90a__) || defined(__gfx908__) || defined(__gfx94__) || defined(__gfx11__))
++#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx906__) || defined(__gfx103__) || \
++ defined(__gfx90a__) || defined(__gfx908__) || defined(__gfx94__) || defined(__gfx11__) || \
++ defined(__gfx12__))
+ const index_t num_blocks_per_batch =
+ __builtin_amdgcn_readfirstlane(get_grid_size() / batch_count);
+ const index_t g_idx = __builtin_amdgcn_readfirstlane(get_block_1d_id() / num_blocks_per_batch);
+diff --git a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_wmma_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_wmma_cshuffle.hpp
+index 211185dfb..5738be0fb 100644
+--- a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_wmma_cshuffle.hpp
++++ b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_wmma_cshuffle.hpp
+@@ -692,7 +692,7 @@ struct DeviceGroupedConvBwdWeight_Wmma_CShuffle
+ static bool IsSupportedArgument(const Argument& arg)
+ {
+ // check device
+- if(ck::is_gfx11_supported())
++ if(ck::is_gfx11_supported() || ck::is_gfx12_supported())
+ {
+ if constexpr(!(is_same_v || is_same_v))
+ {
+diff --git a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_dl_multiple_d_nhwc_kyxc_nhwk.hpp b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_dl_multiple_d_nhwc_kyxc_nhwk.hpp
+index 7cfbd8a8f..5d5a9de7d 100644
+--- a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_dl_multiple_d_nhwc_kyxc_nhwk.hpp
++++ b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_dl_multiple_d_nhwc_kyxc_nhwk.hpp
+@@ -90,8 +90,9 @@ __global__ void
+ const Block2CTileMap block_2_ctile_map,
+ const ComputePtrOffsetOfBatch compute_ptr_offset_of_batch)
+ {
+-#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx906__) || defined(__gfx103__) || \
+- defined(__gfx90a__) || defined(__gfx908__) || defined(__gfx94__) || defined(__gfx11__))
++#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx906__) || defined(__gfx103__) || \
++ defined(__gfx90a__) || defined(__gfx908__) || defined(__gfx94__) || defined(__gfx11__) || \
++ defined(__gfx12__))
+ // offset base pointer for each work-group
+ const index_t num_blocks_per_batch =
+ __builtin_amdgcn_readfirstlane(get_grid_size() / batch_count);
+@@ -666,7 +667,7 @@ struct DeviceGroupedConvFwdDlMultipleD_NHWC_KYXC_NHWK
+
+ // check device
+ if(!(ck::get_device_name() == "gfx906" || ck::is_xdl_supported() ||
+- ck::is_gfx103_supported() || ck::is_gfx11_supported()))
++ ck::is_gfx103_supported() || ck::is_gfx11_supported() || ck::is_gfx12_supported()))
+ {
+ return false;
+ }
+diff --git a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_dl_nhwc_kyxc_nhwk.hpp b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_dl_nhwc_kyxc_nhwk.hpp
+index 6a4d97d7d..c65370b51 100644
+--- a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_dl_nhwc_kyxc_nhwk.hpp
++++ b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_dl_nhwc_kyxc_nhwk.hpp
+@@ -107,7 +107,7 @@ __global__ void
+ const ComputePtrOffsetOfBatch compute_ptr_offset_of_batch)
+ {
+ #if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx906__) || defined(__gfx103__) || \
+- defined(__gfx11__))
++ defined(__gfx11__) || defined(__gfx12__))
+ // offset base pointer for each work-group
+ const index_t num_blocks_per_batch =
+ __builtin_amdgcn_readfirstlane(get_grid_size() / batch_count);
+@@ -602,7 +602,7 @@ struct DeviceGroupedConvFwdDl_NHWC_KYXC_NHWK : public DeviceGroupedConvFwd || is_same_v))
+ {
+diff --git a/include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_multiple_d_dl.hpp b/include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_multiple_d_dl.hpp
+index ac392cddc..060a16d1e 100644
+--- a/include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_multiple_d_dl.hpp
++++ b/include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_multiple_d_dl.hpp
+@@ -39,8 +39,9 @@ __global__ void
+ const BElementwiseOperation b_element_op,
+ const CDEElementwiseOperation cde_element_op)
+ {
+-#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx906__) || defined(__gfx908__) || \
+- defined(__gfx90a__) || defined(__gfx103__) || defined(__gfx11__) || defined(__gfx94__))
++#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx906__) || defined(__gfx908__) || \
++ defined(__gfx90a__) || defined(__gfx103__) || defined(__gfx11__) || defined(__gfx94__) || \
++ defined(__gfx12__))
+ __shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()];
+
+ const index_t block_id = get_block_1d_id();
+@@ -673,7 +674,7 @@ struct DeviceGroupedGemmMultipleD_Dl : public DeviceGroupedGemm || is_same_v))
+ {
+diff --git a/include/ck/tensor_operation/gpu/device/impl/device_multi_query_attention_forward_wmma.hpp b/include/ck/tensor_operation/gpu/device/impl/device_multi_query_attention_forward_wmma.hpp
+index 4e14ed3a5..cc88c1a10 100644
+--- a/include/ck/tensor_operation/gpu/device/impl/device_multi_query_attention_forward_wmma.hpp
++++ b/include/ck/tensor_operation/gpu/device/impl/device_multi_query_attention_forward_wmma.hpp
+@@ -60,7 +60,7 @@ __global__ void
+ bool input_permute,
+ bool output_permute)
+ {
+-#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx11__))
++#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx11__) || defined(__gfx12__))
+
+ // clang-format off
+ // ***************************************************
+@@ -165,6 +165,7 @@ __global__ void
+ ignore = O;
+ ignore = G0;
+ ignore = G1;
++ ignore = alpha;
+ ignore = input_permute;
+ ignore = output_permute;
+ #endif // end of if (defined(__gfx11__))
+@@ -594,7 +595,7 @@ struct DeviceMultiQueryAttentionForward_Wmma
+
+ static bool IsSupportedArgument(const RawArg& arg)
+ {
+- if(ck::is_gfx11_supported())
++ if(ck::is_gfx11_supported() || ck::is_gfx12_supported())
+ {
+ if constexpr(!(is_same_v || is_same_v))
+ {
+diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_batched_gemm_softmax_gemm_wmma_cshuffle.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_batched_gemm_softmax_gemm_wmma_cshuffle.hpp
+index 16717ff81..1754e07e6 100644
+--- a/include/ck/tensor_operation/gpu/grid/gridwise_batched_gemm_softmax_gemm_wmma_cshuffle.hpp
++++ b/include/ck/tensor_operation/gpu/grid/gridwise_batched_gemm_softmax_gemm_wmma_cshuffle.hpp
+@@ -371,12 +371,16 @@ struct GridwiseBatchedGemmSoftmaxGemm_Wmma
+ if constexpr(B0EnableLds)
+ {
+ // BK0_L_BK1 -> BK0_LRepeat_Lwaves_LPerWmma_BK1
+- constexpr auto B_K0 = B0BlockDesc_{}.GetLength(I0);
+- constexpr auto B_K1 = B0BlockDesc_{}.GetLength(I2);
++ constexpr auto B_K0 = B0BlockDesc_{}.GetLength(I0);
++ constexpr auto B_K1 = B0BlockDesc_{}.GetLength(I2);
++#ifdef __gfx12__
++ constexpr auto B_KRow = I2;
++#else
+ constexpr auto B_KRow = I1;
++#endif
+ return transform_tensor_descriptor(
+ B0BlockDesc_{},
+- make_tuple(make_unmerge_transform(make_tuple(Number{}, B_KRow)),
++ make_tuple(make_unmerge_transform(make_tuple(Number{}, B_KRow)),
+ make_unmerge_transform(make_tuple(
+ Number{}, Number{}, Number{})),
+ make_pass_through_transform(Number{})),
+@@ -428,12 +432,16 @@ struct GridwiseBatchedGemmSoftmaxGemm_Wmma
+ if constexpr(B1EnableLds)
+ {
+ // BL0_N_BL1 -> BL0_NRepeat_Nwaves_NPerWmma_BL1
+- constexpr auto B_L0 = B1BlockDesc_{}.GetLength(I0);
+- constexpr auto B_L1 = B1BlockDesc_{}.GetLength(I2);
++ constexpr auto B_L0 = B1BlockDesc_{}.GetLength(I0);
++ constexpr auto B_L1 = B1BlockDesc_{}.GetLength(I2);
++#ifdef __gfx12__
++ constexpr auto B_LRow = I2;
++#else
+ constexpr auto B_LRow = I1;
++#endif
+ return transform_tensor_descriptor(
+ B1BlockDesc_{},
+- make_tuple(make_unmerge_transform(make_tuple(Number{}, B_LRow)),
++ make_tuple(make_unmerge_transform(make_tuple(Number{}, B_LRow)),
+ make_unmerge_transform(make_tuple(
+ Number{}, Number{}, Number{})),
+ make_pass_through_transform(Number{})),
+diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_fpAintB_gemm_wmma.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_fpAintB_gemm_wmma.hpp
+index 499eb7eb0..21dac6f9e 100644
+--- a/include/ck/tensor_operation/gpu/grid/gridwise_fpAintB_gemm_wmma.hpp
++++ b/include/ck/tensor_operation/gpu/grid/gridwise_fpAintB_gemm_wmma.hpp
+@@ -50,7 +50,7 @@ __global__ void
+ const CElementwiseOperation c_element_op,
+ const Block2CTileMap block_2_ctile_map)
+ {
+-#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx11__))
++#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx11__) || defined(__gfx12__))
+ __shared__ char p_shared[GridwiseGemm::SharedMemTrait::lds_size];
+
+ GridwiseGemm::template Run(p_a_grid,
+@@ -302,12 +302,16 @@ struct GridwiseFpAintBGemm_Wmma
+ if constexpr(AEnableLds)
+ {
+ // AK0_M_AK1 -> AK0_MRepeat_Mwaves_AKRow_MPerWmma_AK1
+- constexpr auto A_K0 = ABlockDesc_{}.GetLength(I0);
+- constexpr auto A_K1 = ABlockDesc_{}.GetLength(I2);
++ constexpr auto A_K0 = ABlockDesc_{}.GetLength(I0);
++ constexpr auto A_K1 = ABlockDesc_{}.GetLength(I2);
++#ifdef __gfx12__
++ constexpr auto A_KRow = I2;
++#else
+ constexpr auto A_KRow = I1;
++#endif
+ return transform_tensor_descriptor(
+ ABlockDesc_{},
+- make_tuple(make_unmerge_transform(make_tuple(Number{}, A_KRow)),
++ make_tuple(make_unmerge_transform(make_tuple(Number{}, A_KRow)),
+ make_unmerge_transform(make_tuple(
+ Number{}, Number{}, Number{})),
+ make_pass_through_transform(Number{})),
+@@ -360,12 +364,16 @@ struct GridwiseFpAintBGemm_Wmma
+ if constexpr(BEnableLds)
+ {
+ // BK0_N_BK1 -> BK0_NRepeat_Nwaves_NPerWmma_BK1
+- constexpr auto B_K0 = BBlockDesc_{}.GetLength(I0);
+- constexpr auto B_K1 = BBlockDesc_{}.GetLength(I2);
++ constexpr auto B_K0 = BBlockDesc_{}.GetLength(I0);
++ constexpr auto B_K1 = BBlockDesc_{}.GetLength(I2);
++#ifdef __gfx12__
++ constexpr auto B_KRow = I2;
++#else
+ constexpr auto B_KRow = I1;
++#endif
+ return transform_tensor_descriptor(
+ BBlockDesc_{},
+- make_tuple(make_unmerge_transform(make_tuple(Number{}, B_KRow)),
++ make_tuple(make_unmerge_transform(make_tuple(Number{}, B_KRow)),
+ make_unmerge_transform(make_tuple(
+ Number{}, Number{}, Number{})),
+ make_pass_through_transform(Number{})),
+diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_multiple_d_wmma_cshuffle.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_multiple_d_wmma_cshuffle.hpp
+index 82d010a99..fdda649ef 100644
+--- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_multiple_d_wmma_cshuffle.hpp
++++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_multiple_d_wmma_cshuffle.hpp
+@@ -54,7 +54,7 @@ __global__ void
+ const Block2CTileMap block_2_ctile_map,
+ const ComputePtrOffsetOfBatch compute_ptr_offset_of_batch)
+ {
+-#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx11__))
++#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx11__) || defined(__gfx12__))
+ // offset base pointer for each work-group
+ const index_t num_blocks_per_batch =
+ __builtin_amdgcn_readfirstlane(get_grid_size() / batch_count);
+@@ -147,7 +147,7 @@ __global__ void
+ const ComputePtrOffsetOfBatch compute_ptr_offset_of_batch,
+ const Block2CTileMap block_2_etile_map)
+ {
+-#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx11__))
++#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx11__) || defined(__gfx12__))
+ // printf("entry kernel launch");
+ __shared__ char p_shared[GridwiseOp::SharedMemTrait::lds_size];
+
+@@ -237,7 +237,7 @@ __global__ void
+ const CDEElementwiseOperation cde_element_op,
+ const Block2CTileMap block_2_ctile_map)
+ {
+-#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx11__))
++#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx11__) || defined(__gfx12__))
+ __shared__ char p_shared[GridwiseOp::SharedMemTrait::lds_size];
+
+ GridwiseOp::template Run(p_a_grid,
+@@ -375,8 +375,9 @@ struct GridwiseGemmMultipleD_Wmma
+ }
+ else
+ {
++ constexpr auto A_KRow = I2;
+ constexpr auto KWmmaPerblock = KPerBlock / WmmaK;
+- constexpr auto K0PerWmma = WmmaK / 2 / K1;
++ constexpr auto K0PerWmma = WmmaK / A_KRow / K1;
+ // KWmma->MRepeat->MWave->K0PerWmma->KRow->MPerWmma->K1 Per Thread
+ return make_naive_tensor_descriptor(
+ make_tuple(Number{},
+@@ -422,8 +423,9 @@ struct GridwiseGemmMultipleD_Wmma
+ }
+ else
+ {
++ constexpr auto B_KRow = I2;
+ constexpr auto KWmmaPerblock = KPerBlock / WmmaK;
+- constexpr auto K0PerWmma = WmmaK / 2 / K1;
++ constexpr auto K0PerWmma = WmmaK / B_KRow / K1;
+ // KWmma->NRepeat->MWave->K0PerWmma->KRow->MPerWmma->K1 Per Thread
+ return make_naive_tensor_descriptor(
+ make_tuple(Number{},
+@@ -495,12 +497,16 @@ struct GridwiseGemmMultipleD_Wmma
+ if constexpr(AEnableLds)
+ {
+ // AK0_M_AK1 -> AK0_MRepeat_Mwaves_AKRow_MPerWmma_AK1
+- constexpr auto A_K0 = ABlockDesc_{}.GetLength(I0);
+- constexpr auto A_K1 = ABlockDesc_{}.GetLength(I2);
++ constexpr auto A_K0 = ABlockDesc_{}.GetLength(I0);
++ constexpr auto A_K1 = ABlockDesc_{}.GetLength(I2);
++#ifdef __gfx12__
++ constexpr auto A_KRow = I2;
++#else
+ constexpr auto A_KRow = I1;
++#endif
+ return transform_tensor_descriptor(
+ ABlockDesc_{},
+- make_tuple(make_unmerge_transform(make_tuple(Number{}, A_KRow)),
++ make_tuple(make_unmerge_transform(make_tuple(Number{}, A_KRow)),
+ make_unmerge_transform(make_tuple(
+ Number{}, Number{}, Number{})),
+ make_pass_through_transform(Number{})),
+@@ -534,12 +540,16 @@ struct GridwiseGemmMultipleD_Wmma
+ if constexpr(BEnableLds)
+ {
+ // BK0_N_BK1 -> BK0_NRepeat_Nwaves_NPerWmma_BK1
+- constexpr auto B_K0 = BBlockDesc_{}.GetLength(I0);
+- constexpr auto B_K1 = BBlockDesc_{}.GetLength(I2);
++ constexpr auto B_K0 = BBlockDesc_{}.GetLength(I0);
++ constexpr auto B_K1 = BBlockDesc_{}.GetLength(I2);
++#ifdef __gfx12__
++ constexpr auto B_KRow = I2;
++#else
+ constexpr auto B_KRow = I1;
++#endif
+ return transform_tensor_descriptor(
+ BBlockDesc_{},
+- make_tuple(make_unmerge_transform(make_tuple(Number{}, B_KRow)),
++ make_tuple(make_unmerge_transform(make_tuple(Number{}, B_KRow)),
+ make_unmerge_transform(make_tuple(
+ Number{}, Number{}, Number{})),
+ make_pass_through_transform(Number{})),
+@@ -571,15 +581,12 @@ struct GridwiseGemmMultipleD_Wmma
+ // *Caution Here repeat is shuffle repeat
+ GetCShuffleBlockDescriptor_MShRepeat_MPerShRepeat_NShRepeat_NPerShRepeat()
+ {
+- constexpr index_t MWave = MPerBlock / (MRepeat * MPerWmma);
+- constexpr index_t NWave = NPerBlock / (NRepeat * NPerWmma);
+-
+ constexpr auto c_shuffle_block_desc_mshrepeat_mpershrepeat_nshrepeat_npershrepeat =
+ make_naive_tensor_descriptor_packed(
+ make_tuple(I1,
+- Number{},
++ Number{},
+ I1,
+- Number{}));
++ Number{}));
+
+ return c_shuffle_block_desc_mshrepeat_mpershrepeat_nshrepeat_npershrepeat;
+ }
+@@ -799,8 +806,9 @@ struct GridwiseGemmMultipleD_Wmma
+ const auto M = e_grid_desc_m_n.GetLength(I0);
+ const auto N = e_grid_desc_m_n.GetLength(I1);
+
+- const auto MBlock = M / MPerBlock;
+- const auto NBlock = N / NPerBlock;
++ const auto MBlock = M / MPerBlock;
++ const auto NBlock = N / NPerBlock;
++
+ const auto e_grid_desc_mblock_mperblock_nblock_nperblock = transform_tensor_descriptor(
+ e_grid_desc_m_n,
+ make_tuple(make_unmerge_transform(make_tuple(MBlock, Number{})),
+diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_wmma.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_wmma.hpp
+index 8e4117593..4458b9356 100644
+--- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_wmma.hpp
++++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_wmma.hpp
+@@ -45,7 +45,7 @@ __global__ void
+ const CElementwiseOperation c_element_op,
+ const Block2CTileMap block_2_ctile_map)
+ {
+-#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx11__))
++#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx11__) || defined(__gfx12__))
+ __shared__ char p_shared[GridwiseGemm::SharedMemTrait::lds_size];
+
+ GridwiseGemm::template Run(p_a_grid,
+@@ -170,8 +170,9 @@ struct GridwiseGemm_Wmma
+ }
+ else
+ {
++ constexpr auto A_KRow = I2;
+ constexpr auto KWmmaPerblock = KPerBlock / WmmaK;
+- constexpr auto K0PerWmma = WmmaK / 2 / K1;
++ constexpr auto K0PerWmma = WmmaK / A_KRow / K1;
+ // KWmma->MRepeat->MWave->K0PerWmma->KRow->MPerWmma->K1 Per Thread
+ return make_naive_tensor_descriptor(
+ make_tuple(Number{},
+@@ -217,8 +218,10 @@ struct GridwiseGemm_Wmma
+ }
+ else
+ {
++
++ constexpr auto B_KRow = I2;
+ constexpr auto KWmmaPerblock = KPerBlock / WmmaK;
+- constexpr auto K0PerWmma = WmmaK / 2 / K1;
++ constexpr auto K0PerWmma = WmmaK / B_KRow / K1;
+ // KWmma->NRepeat->MWave->K0PerWmma->KRow->MPerWmma->K1 Per Thread
+ return make_naive_tensor_descriptor(
+ make_tuple(Number{},
+@@ -290,12 +293,17 @@ struct GridwiseGemm_Wmma
+ if constexpr(AEnableLds)
+ {
+ // AK0_M_AK1 -> AK0_MRepeat_Mwaves_AKRow_MPerWmma_AK1
+- constexpr auto A_K0 = ABlockDesc_{}.GetLength(I0);
+- constexpr auto A_K1 = ABlockDesc_{}.GetLength(I2);
++ constexpr auto A_K0 = ABlockDesc_{}.GetLength(I0);
++ constexpr auto A_K1 = ABlockDesc_{}.GetLength(I2);
++#ifdef __gfx12__
++ constexpr auto A_KRow = I2;
++#else
+ constexpr auto A_KRow = I1;
++#endif
++
+ return transform_tensor_descriptor(
+ ABlockDesc_{},
+- make_tuple(make_unmerge_transform(make_tuple(Number<A_K0>{}, A_KRow)),
++ make_tuple(make_unmerge_transform(make_tuple(Number<A_K0 / A_KRow>{}, A_KRow)),
+ make_unmerge_transform(make_tuple(
+ Number<MRepeat>{}, Number<MWaves>{}, Number<MPerWmma>{})),
+ make_pass_through_transform(Number<A_K1>{})),
+@@ -348,12 +356,16 @@ struct GridwiseGemm_Wmma
+ if constexpr(BEnableLds)
+ {
+ // BK0_N_BK1 -> BK0_NRepeat_Nwaves_NPerWmma_BK1
+- constexpr auto B_K0 = BBlockDesc_{}.GetLength(I0);
+- constexpr auto B_K1 = BBlockDesc_{}.GetLength(I2);
++ constexpr auto B_K0 = BBlockDesc_{}.GetLength(I0);
++ constexpr auto B_K1 = BBlockDesc_{}.GetLength(I2);
++#ifdef __gfx12__
++ constexpr auto B_KRow = I2;
++#else
+ constexpr auto B_KRow = I1;
++#endif
+ return transform_tensor_descriptor(
+ BBlockDesc_{},
+- make_tuple(make_unmerge_transform(make_tuple(Number<B_K0>{}, B_KRow)),
++ make_tuple(make_unmerge_transform(make_tuple(Number<B_K0 / B_KRow>{}, B_KRow)),
+ make_unmerge_transform(make_tuple(
+ Number<NRepeat>{}, Number<NWaves>{}, Number<NPerWmma>{})),
+ make_pass_through_transform(Number<B_K1>{})),
+@@ -522,12 +534,6 @@ struct GridwiseGemm_Wmma
+ c_grid_desc_m_n);
+ }
+
+- using CGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock =
+- remove_cvref_t<decltype(MakeCGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(CGridDesc_M_N{}))>;
+- using DefaultBlock2CTileMap =
+- remove_cvref_t<decltype(MakeDefaultBlock2CTileMap(CGridDesc_M_N{}, 1, 1))>;
+-
+ struct SharedMemTrait
+ {
+ // LDS allocation for A and B: be careful of alignment
+@@ -559,6 +565,12 @@ struct GridwiseGemm_Wmma
+ b_block_space_size_aligned * sizeof(BDataType));
+ };
+
++ using CGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock =
++ remove_cvref_t<decltype(MakeCGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(CGridDesc_M_N{}))>;
++ using DefaultBlock2CTileMap =
++ remove_cvref_t<decltype(MakeDefaultBlock2CTileMap(CGridDesc_M_N{}, 1, 1))>;
++
+ template <bool HasMainKBlockLoop, typename Block2CTileMap>
+ __device__ static void Run(const ADataType* __restrict__ p_a_grid,
+ const BDataType* __restrict__ p_b_grid,
+diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_tensor_rearrange.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_tensor_rearrange.hpp
+index 6772524e0..174074990 100644
+--- a/include/ck/tensor_operation/gpu/grid/gridwise_tensor_rearrange.hpp
++++ b/include/ck/tensor_operation/gpu/grid/gridwise_tensor_rearrange.hpp
+@@ -35,8 +35,9 @@ __global__ void
+ const Block2ETileMap block_2_tile_map,
+ const ComputePtrOffsetOfStridedBatch compute_ptr_offset_of_batch)
+ {
+-#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx906__) || defined(__gfx908__) || \
+- defined(__gfx90a__) || defined(__gfx94__) || defined(__gfx103__) || defined(__gfx11__))
++#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx906__) || defined(__gfx908__) || \
++ defined(__gfx90a__) || defined(__gfx94__) || defined(__gfx103__) || defined(__gfx11__) || \
++ defined(__gfx12__))
+ GridwiseTensorRearrangeKernel::Run(in_grid_desc,
+ p_in_global,
+ out_grid_desc,
+diff --git a/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp b/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp
+index bcce930fc..d7a6a3624 100644
+--- a/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp
++++ b/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp
+@@ -1304,7 +1304,7 @@ struct ThreadwiseTensorSliceTransfer_StaticToStatic
+ ElementwiseOperation element_op_;
+ };
+
+-// Specilized for WMMA
++// Specialized for WMMA-Navi3
+ // A single Wave32 is composed by double row
+ // Data exchange allowed between these two rows
+ // This RowLane Dst buf will be filled from two Src buf
+@@ -1439,4 +1439,111 @@ struct ThreadwiseTensorSliceTransfer_StaticToStatic_InterRow
+ ElementwiseOperation element_op_{};
+ };
+
++// Specialized for WMMA-Navi4
++template <typename SrcData,
++          typename DstData,
++          typename SrcDesc,
++          typename DstDesc,
++          typename ElementwiseOperation,
++          typename SliceLengths,
++          typename DimAccessOrder,
++          index_t DstVectorDim,
++          index_t DstScalarPerVector,
++          bool IntraRowSwizzlePerm,
++          typename enable_if<SliceLengths::Size() == DimAccessOrder::Size(), bool>::type = false>
++struct ThreadwiseTensorSliceTransfer_StaticToStatic_IntraRow
++{
++ static constexpr index_t nDim = SliceLengths::Size();
++
++ using Index = MultiIndex<nDim>;
++
++ __device__ constexpr ThreadwiseTensorSliceTransfer_StaticToStatic_IntraRow(const Index& src_idx)
++ {
++ static_assert(SrcDesc::IsKnownAtCompileTime() && DstDesc::IsKnownAtCompileTime(),
++ "wrong! Desc needs to be known at compile-time");
++
++ static_assert(SliceLengths::At(Number<DstVectorDim>{}) % DstScalarPerVector == 0,
++ "wrong! Not divisible");
++ ignore = src_idx;
++ }
++
++ template <typename SrcSliceOriginIdx, typename DstSliceOriginIdx, typename SrcBuffer, typename DstBuffer>
++ __device__ void Run(const SrcDesc&,
++ const SrcSliceOriginIdx&,
++ const SrcBuffer& src_buf,
++ const DstDesc&,
++ const DstSliceOriginIdx&,
++ DstBuffer& dst_buf) const
++ {
++ static_assert(SrcDesc::IsKnownAtCompileTime() && DstDesc::IsKnownAtCompileTime(),
++ "wrong! Desc needs to be known at compile-time");
++
++ static_assert(is_known_at_compile_time<remove_cvref_t<SrcSliceOriginIdx>>::value &&
++ is_known_at_compile_time<remove_cvref_t<DstSliceOriginIdx>>::value,
++ "wrong! SliceOrigin needs to be known at compile-time");
++
++ static_assert(SrcBuffer::IsStaticBuffer() && DstBuffer::IsStaticBuffer(),
++ "wrong! Buffers need to be StaticBuffer");
++
++ // SrcDesc and src_slice_origin_idx are known at compile-time
++ constexpr auto src_desc = remove_cvref_t<SrcDesc>{};
++ constexpr auto dst_desc = remove_cvref_t<DstDesc>{};
++ constexpr auto src_slice_origin_idx = to_multi_index(SrcSliceOriginIdx{});
++ constexpr auto dst_slice_origin_idx = to_multi_index(DstSliceOriginIdx{});
++
++ // scalar per access on each dim
++ constexpr auto dst_scalar_per_access = generate_sequence(
++ detail::lambda_scalar_per_access<DstVectorDim, DstScalarPerVector>{}, Number<nDim>{});
++
++ constexpr auto dst_scalar_step_in_vector =
++ generate_sequence(detail::lambda_scalar_step_in_vector<DstVectorDim>{}, Number<nDim>{});
++
++ using SpaceFillingCurve = SpaceFillingCurve<SliceLengths, DimAccessOrder, remove_cv_t<decltype(dst_scalar_per_access)>>;
++
++ static_assert(DstScalarPerVector == SpaceFillingCurve::ScalarPerVector,
++ "wrong! DstScalarPerVector != SpaceFillingCurve::ScalarPerVector");
++
++ constexpr auto num_access = SpaceFillingCurve::GetNumOfAccess();
++
++ static_for<0, num_access, 1>{}([&](auto idx_1d) {
++ constexpr auto idx_md = SpaceFillingCurve::GetIndex(idx_1d);
++
++ // copy data from src_buf into dst_vector
++ static_for<0, DstScalarPerVector, 1>{}([&](auto i) {
++ // note: CalculateOffset must stay constexpr; a merge transform in src_desc would break that
++ constexpr index_t src_offset = src_desc.CalculateOffset(
++ src_slice_origin_idx + idx_md + i * dst_scalar_step_in_vector);
++
++ constexpr index_t dst_offset = dst_desc.CalculateOffset(
++ dst_slice_origin_idx + idx_md + i * dst_scalar_step_in_vector);
++
++ SrcData v_this_row;
++ // int-typed temp, required by the __builtin_amdgcn_permlane16 intrinsic
++ int temp = 0;
++
++ // apply element-wise operation
++ element_op_(v_this_row, src_buf[Number<src_offset>{}]);
++
++ // apply intra-row permute.
++ if constexpr(IntraRowSwizzlePerm)
++ {
++ temp = __builtin_amdgcn_permlane16(
++ temp, type_convert_sp<int>(v_this_row), 0xb3a29180, 0xf7e6d5c4, 1, 0);
++ v_this_row = type_convert_sp<SrcData>(temp);
++ }
++
++ // apply type convert
++ dst_buf(Number<dst_offset>{}) = type_convert_sp<DstData>(v_this_row);
++ });
++ });
++ }
++ ElementwiseOperation element_op_{};
++};
++
+ } // namespace ck
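+// Illustrative sketch, not part of the patch: decoding the permlane16
+// selectors used above. __builtin_amdgcn_permlane16(old, src, sel0, sel1, fi,
+// bc) permutes lanes within each 16-lane row; each selector nibble names a
+// source lane, sel0 covering destination lanes 0-7 and sel1 lanes 8-15:
+#include <cstdio>
+int main()
+{
+    // sel1:sel0 as used in the transfer: 0xf7e6d5c4 : 0xb3a29180
+    const unsigned long long sel = 0xf7e6d5c4b3a29180ull;
+    for(int lane = 0; lane < 16; ++lane)
+        std::printf("dst lane %2d <- src lane %2llu\n",
+                    lane, (sel >> (4 * lane)) & 0xFull);
+    return 0; // shows the low 8 lanes interleaved with the high 8 lanes
+}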
+diff --git a/include/ck/tensor_operation/gpu/warp/wmma_gemm.hpp b/include/ck/tensor_operation/gpu/warp/wmma_gemm.hpp
+index 565195f53..9a9ebf559 100644
+--- a/include/ck/tensor_operation/gpu/warp/wmma_gemm.hpp
++++ b/include/ck/tensor_operation/gpu/warp/wmma_gemm.hpp
+@@ -11,12 +11,17 @@ namespace ck {
+
+ enum struct WmmaInstr
+ {
++ // gfx11
+ wmma_f32_16x16x16_f16 = 0,
+ wmma_f32_16x16x16_bf16,
+ wmma_f16_16x16x16_f16,
+ wmma_bf16_16x16x16_bf16,
+ wmma_i32_16x16x16_iu8,
+- wmma_i32_16x16x16_iu4
++ wmma_i32_16x16x16_iu4,
++ // gfx12
++ wmma_f32_16x16x16_f16_gfx12,
++ wmma_f32_16x16x16_bf16_gfx12,
++ wmma_i32_16x16x16_iu8_gfx12,
+ };
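+// Illustrative sketch, not part of the patch: how the new *_gfx12 enumerators
+// might be chosen at compile time. Names below are hypothetical, not the
+// library's actual selector logic.
+namespace demo {
+enum struct Instr { f16, f16_gfx12 };
+constexpr Instr pick_f16_instr()
+{
+#ifdef __gfx12__
+    return Instr::f16_gfx12; // gfx12 WMMA encoding
+#else
+    return Instr::f16;       // gfx11 encoding
+#endif
+}
+} // namespace demo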
+
+ /*
+@@ -279,6 +284,122 @@ struct wmma_type
++template <index_t WaveSize>
++struct wmma_type<WmmaInstr::wmma_f32_16x16x16_f16_gfx12,
++                 WaveSize,
++                 typename std::enable_if_t<WaveSize == 32 || WaveSize == 64>>
++{
++ // Absolute fixing property
++ // * Data Pixel
++ static constexpr index_t m_per_wmma = 16;
++ static constexpr index_t n_per_wmma = 16;
++ static constexpr index_t k_per_wmma = 16;
++ // static constexpr index_t src_a_data_size = 2;
++ // static constexpr index_t src_b_data_size = 2;
++ // static constexpr index_t acc_data_size = 4;
++ // * Thread mapping inside wave, num_thread_per_subgroups always along N direction
++ static constexpr index_t acc_data_size = 4;
++ static constexpr index_t acc_pack_number = 1;
++ static constexpr index_t num_thread_per_subgroups = n_per_wmma;
++
++ // Wave-mode dependent property
++ static constexpr index_t wave_size = Number<WaveSize>{};
++ // * Fixed on Navi3x; will be wave-mode dependent on Navi4x
++ // static constexpr index_t num_src_a_vgprs_per_wave = k_per_wmma / 2 * src_a_data_size / 4;
++ // static constexpr index_t num_src_b_vgprs_per_wave = k_per_wmma / 2 * src_b_data_size / 4;
++ // * num_acc_vgprs_per_wave along M direction
++ // * num_subgroups along M direction
++ static constexpr index_t num_acc_vgprs_per_wave = m_per_wmma * n_per_wmma / wave_size;
++ static constexpr index_t num_subgroups = wave_size / num_thread_per_subgroups;
++
++ template <typename FloatA, typename FloatB, typename FloatC>
++ __device__ void run(const FloatA& a, const FloatB& b, FloatC& reg_c) const
++ {
++ static_assert(wave_size == 32, "only support wave32 for gfx12 wmma");
++ if constexpr(wave_size == 32)
++ {
++ intrin_wmma_f32_16x16x16_f16_w32_gfx12<m_per_wmma, n_per_wmma>::Run(a, b, reg_c);
++ }
++ }
++};
++
++template <index_t WaveSize>
++struct wmma_type<WmmaInstr::wmma_f32_16x16x16_bf16_gfx12,
++                 WaveSize,
++                 typename std::enable_if_t<WaveSize == 32 || WaveSize == 64>>
++{
++ // Absolute fixing property
++ static constexpr index_t m_per_wmma = 16;
++ static constexpr index_t n_per_wmma = 16;
++ static constexpr index_t k_per_wmma = 16;
++ // static constexpr index_t src_a_data_size = 2;
++ // static constexpr index_t src_b_data_size = 2;
++ static constexpr index_t acc_data_size = 4;
++ static constexpr index_t acc_pack_number = 1;
++ static constexpr index_t num_thread_per_subgroups = n_per_wmma;
++
++ // Wave-mode dependent property
++ static constexpr index_t wave_size = Number<WaveSize>{};
++ // static constexpr index_t num_src_a_vgprs_per_wave = m_per_wmma * src_a_data_size / 4;
++ // static constexpr index_t num_src_b_vgprs_per_wave = n_per_wmma * src_b_data_size / 4;
++ static constexpr index_t num_acc_vgprs_per_wave = m_per_wmma * n_per_wmma / wave_size;
++ static constexpr index_t num_subgroups = wave_size / num_thread_per_subgroups;
++
++ template <typename FloatA, typename FloatB, typename FloatC>
++ __device__ void run(const FloatA& a, const FloatB& b, FloatC& reg_c) const
++ {
++ static_assert(wave_size == 32, "only support wave32 for gfx12 wmma");
++ if constexpr(wave_size == 32)
++ {
++ intrin_wmma_f32_16x16x16_bf16_w32_gfx12<m_per_wmma, n_per_wmma>::Run(a, b, reg_c);
++ }
++ }
++};
++
++template <index_t WaveSize>
++struct wmma_type<WmmaInstr::wmma_i32_16x16x16_iu8_gfx12,
++                 WaveSize,
++                 typename std::enable_if_t<WaveSize == 32 || WaveSize == 64>>
++{
++ // Absolute fixing property
++ static constexpr index_t m_per_wmma = 16;
++ static constexpr index_t n_per_wmma = 16;
++ static constexpr index_t k_per_wmma = 16;
++ // static constexpr index_t src_a_data_size = 2;
++ // static constexpr index_t src_b_data_size = 2;
++ static constexpr index_t acc_data_size = 4;
++ static constexpr index_t acc_pack_number = 1;
++ static constexpr index_t num_thread_per_subgroups = n_per_wmma;
++
++ // Wave-mode dependent property
++ static constexpr index_t wave_size = Number<WaveSize>{};
++ // static constexpr index_t num_src_a_vgprs_per_wave = m_per_wmma * src_a_data_size / 4;
++ // static constexpr index_t num_src_b_vgprs_per_wave = n_per_wmma * src_b_data_size / 4;
++ static constexpr index_t num_acc_vgprs_per_wave = m_per_wmma * n_per_wmma / wave_size;
++ static constexpr index_t num_subgroups = wave_size / num_thread_per_subgroups;
++
++ template <bool neg_a, bool neg_b, bool clamp, typename FloatA, typename FloatB, typename FloatC>
++ __device__ void run(const FloatA& a, const FloatB& b, FloatC& reg_c) const
++ {
++ static_assert(wave_size == 32, "only support wave32 for gfx12 wmma");
++ if constexpr(wave_size == 32)
++ {
++ intrin_wmma_i32_16x16x16_iu8_w32_gfx12<m_per_wmma, n_per_wmma, neg_a, neg_b, clamp>::Run(
++ a, b, reg_c);
++ }
++ }
++};
++
+ template
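+// Illustrative sketch, not part of the patch: the derived constants shared by
+// the three gfx12 wmma_type specializations, checked for the only supported
+// wave32 mode:
+int main()
+{
+    constexpr int m_per_wmma = 16, n_per_wmma = 16, wave_size = 32;
+    constexpr int num_thread_per_subgroups = n_per_wmma;                        // 16 lanes per row
+    constexpr int num_acc_vgprs_per_wave = m_per_wmma * n_per_wmma / wave_size; // 8 accumulators
+    constexpr int num_subgroups = wave_size / num_thread_per_subgroups;         // 2 subgroups
+    static_assert(num_acc_vgprs_per_wave == 8 && num_subgroups == 2, "wave32 layout");
+    return 0;
+}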